Blame - io_uring/io_uring.c - kernel/common

blob: 43ccd6ca83026b525454a14fe3ae162e6abe0199 [file] [log] [blame]

Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Shared application/kernel submission and completion ring pairs, for
				4	* supporting fast/efficient IO.
				5	*
				6	* A note on the read/write ordering memory barriers that are matched between
				7	* the application and kernel side.
				8	*
				9	* After the application reads the CQ ring tail, it must use an
				10	* appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
				11	* before writing the tail (using smp_load_acquire to read the tail will
				12	* do). It also needs a smp_mb() before updating CQ head (ordering the
				13	* entry load(s) with the head store), pairing with an implicit barrier
				14	* through a control-dependency in io_get_cqe (smp_store_release to
				15	* store head will do). Failure to do so could lead to reading invalid
				16	* CQ entries.
				17	*
				18	* Likewise, the application must use an appropriate smp_wmb() before
				19	* writing the SQ tail (ordering SQ entry stores with the tail store),
				20	* which pairs with smp_load_acquire in io_get_sqring (smp_store_release
				21	* to store the tail will do). And it needs a barrier ordering the SQ
				22	* head load before writing new SQ entries (smp_load_acquire to read
				23	* head will do).
				24	*
				25	* When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
				26	* needs to check the SQ flags for IORING_SQ_NEED_WAKEUP after
				27	* updating the SQ tail; a full memory barrier smp_mb() is needed
				28	* between.
				29	*
				30	* Also see the examples in the liburing library:
				31	*
				32	* git://git.kernel.dk/liburing
				33	*
				34	* io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
				35	* from data shared between the kernel and application. This is done both
				36	* for ordering purposes and to ensure that once a value is loaded from
				37	* data that the application could potentially modify, it remains stable.
				38	*
				39	* Copyright (C) 2018-2019 Jens Axboe
				40	* Copyright (c) 2018-2019 Christoph Hellwig
				41	*/
				42	#include <linux/kernel.h>
				43	#include <linux/init.h>
				44	#include <linux/errno.h>
				45	#include <linux/syscalls.h>
				46	#include <linux/compat.h>
				47	#include <net/compat.h>
				48	#include <linux/refcount.h>
				49	#include <linux/uio.h>
				50	#include <linux/bits.h>
				51
				52	#include <linux/sched/signal.h>
				53	#include <linux/fs.h>
				54	#include <linux/file.h>
				55	#include <linux/fdtable.h>
				56	#include <linux/mm.h>
				57	#include <linux/mman.h>
				58	#include <linux/percpu.h>
				59	#include <linux/slab.h>
				60	#include <linux/blkdev.h>
				61	#include <linux/bvec.h>
				62	#include <linux/net.h>
				63	#include <net/sock.h>
				64	#include <net/af_unix.h>
				65	#include <net/scm.h>
				66	#include <linux/anon_inodes.h>
				67	#include <linux/sched/mm.h>
				68	#include <linux/uaccess.h>
				69	#include <linux/nospec.h>
				70	#include <linux/sizes.h>
				71	#include <linux/hugetlb.h>
				72	#include <linux/highmem.h>
				73	#include <linux/namei.h>
				74	#include <linux/fsnotify.h>
				75	#include <linux/fadvise.h>
				76	#include <linux/eventpoll.h>
				77	#include <linux/splice.h>
				78	#include <linux/task_work.h>
				79	#include <linux/pagemap.h>
				80	#include <linux/io_uring.h>
				81	#include <linux/tracehook.h>
				82
				83	#define CREATE_TRACE_POINTS
				84	#include <trace/events/io_uring.h>
				85
				86	#include <uapi/linux/io_uring.h>
				87
				88	#include "../fs/internal.h"
				89	#include "io-wq.h"
				90
				91	#define IORING_MAX_ENTRIES 32768	/* 2^15 */
				92	#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)	/* twice the value above: 65536 */
				93	#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
				94
				95	/* Only upper bounds are defined for the limits below; no minimums. */
				96	#define IORING_MAX_FIXED_FILES (1U << 15)	/* 32768 */
					/* Sum of three *_LAST enumerators that are defined outside this file. */
				97	#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
				98	IORING_REGISTER_LAST + IORING_OP_LAST)
				99
					/*
					 * Tag-table geometry: MAX is 2^(PAGE_SHIFT - 3), i.e. PAGE_SIZE / 8
					 * entries, and MASK is MAX - 1 (a low-bits mask).
					 * NOTE(review): presumably one page of 8-byte u64 tags per table (see
					 * io_rsrc_data.tags) - confirm against the allocation code.
					 */
				100	#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
				101	#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
				102	#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
				103
				104	#define IORING_MAX_REG_BUFFERS (1U << 14)	/* 16384 */
				105
				106	#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
				107	IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
				108	IOSQE_BUFFER_SELECT)	/* bitwise OR of the six IOSQE_* flags named above */
				109	#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
				110	REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)	/* bitwise OR of five REQ_F_* bits from the request-flag enum */
				111
				112	#define IO_TCTX_REFS_CACHE_NR (1U << 10)	/* 1024 */
				113
				114	struct io_uring {
					/* One head/tail index pair; struct io_rings embeds one for the SQ and one for the CQ. */
				115	u32 head ____cacheline_aligned_in_smp;	/* head and tail each carry their own cache-line alignment attribute */
				116	u32 tail ____cacheline_aligned_in_smp;
				117	};
				118
				119	/*
				120	* This data is shared with the application through the mmap at offsets
				121	* IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
				122	*
				123	* The offsets to the member fields are published through struct
				124	* io_sqring_offsets when calling io_uring_setup.
				125	*/
				126	struct io_rings {
				127	/*
				128	* Head and tail offsets into the ring; the offsets need to be
				129	* masked to get valid indices.
				130	*
				131	* The kernel controls head of the sq ring and the tail of the cq ring,
				132	* and the application controls tail of the sq ring and the head of the
				133	* cq ring.
				134	*/
				135	struct io_uring sq, cq;
				136	/*
				137	* Bitmasks to apply to head and tail offsets (constant, equals
				138	* ring_entries - 1)
				139	*/
				140	u32 sq_ring_mask, cq_ring_mask;
				141	/* Ring sizes (constant, power of 2) */
				142	u32 sq_ring_entries, cq_ring_entries;
				143	/*
				144	* Number of invalid entries dropped by the kernel due to
				145	* invalid index stored in array
				146	*
				147	* Written by the kernel, shouldn't be modified by the
				148	* application (i.e. get number of "new events" by comparing to
				149	* cached value).
				150	*
				151	* After a new SQ head value was read by the application this
				152	* counter includes all submissions that were dropped reaching
				153	* the new SQ head (and possibly more).
				154	*/
				155	u32 sq_dropped;
				156	/*
				157	* Runtime SQ flags
				158	*
				159	* Written by the kernel, shouldn't be modified by the
				160	* application.
				161	*
				162	* The application needs a full memory barrier before checking
				163	* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
				164	*/
				165	u32 sq_flags;
				166	/*
				167	* Runtime CQ flags
				168	*
				169	* Written by the application, shouldn't be modified by the
				170	* kernel.
				171	*/
				172	u32 cq_flags;
				173	/*
				174	* Number of completion events lost because the queue was full;
				175	* this should be avoided by the application by making sure
				176	* there are not more requests pending than there is space in
				177	* the completion queue.
				178	*
				179	* Written by the kernel, shouldn't be modified by the
				180	* application (i.e. get number of "new events" by comparing to
				181	* cached value).
				182	*
				183	* As completion events come in out of order this counter is not
				184	* ordered with any other data.
				185	*/
				186	u32 cq_overflow;
				187	/*
				188	* Ring buffer of completion events.
				189	*
				190	* The kernel writes completion events fresh every time they are
				191	* produced, so the application is allowed to modify pending
				192	* entries.
				193	*
					* This is a flexible array member, so it has to remain the last
					* field of the structure.
					*/
				194	struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
				195	};
				196
				197	enum io_uring_cmd_flags {
					/* The two values are distinct powers of two, so they can be OR-ed together. */
					/* NOTE(review): presumably passed as the issue-flags bitmask - confirm at the call sites. */
				198	IO_URING_F_NONBLOCK = 1,
				199	IO_URING_F_COMPLETE_DEFER = 2,
				200	};
				201
				202	struct io_mapped_ubuf {
					/* Descriptor for one mapped buffer; ctx->user_bufs holds pointers to these. */
				203	u64 ubuf;	/* NOTE(review): presumably the start address of the buffer - confirm */
				204	u64 ubuf_end;	/* NOTE(review): presumably the end address matching ubuf - confirm */
				205	unsigned int nr_bvecs;	/* NOTE(review): presumably the number of entries in bvec[] - confirm */
				206	unsigned long acct_pages;
				207	struct bio_vec bvec[];	/* flexible array member; must stay last */
				208	};
				209
				210	struct io_ring_ctx;	/* forward declaration; the full definition follows further down in this file */
				211
				212	struct io_overflow_cqe {
					/* A completion entry paired with list linkage. */
				213	struct io_uring_cqe cqe;
				214	struct list_head list;	/* NOTE(review): presumably linked on ctx->cq_overflow_list - confirm */
				215	};
				216
				217	struct io_fixed_file {
					/* One slot of struct io_file_table. */
				218	/* file * with additional FFS_* flags; both share this single word */
				219	unsigned long file_ptr;
				220	};
				221
				222	struct io_rsrc_put {
					/* A tagged resource pointer with list linkage; this is the type named by the rsrc_put_fn callback. */
				223	struct list_head list;
				224	u64 tag;
					/* Anonymous union: one pointer, viewed untyped, as a file, or as a mapped buffer. */
				225	union {
				226	void *rsrc;
				227	struct file *file;
				228	struct io_mapped_ubuf *buf;
				229	};
				230	};
				231
				232	struct io_file_table {
					/* Wrapper around the fixed-file slot array; embedded in io_ring_ctx as 'file_table'. */
				233	struct io_fixed_file *files;
				234	};
				235
				236	struct io_rsrc_node {
					/* Reference-counted resource node; io_ring_ctx points at one via 'rsrc_node' and 'rsrc_backup_node'. */
				237	struct percpu_ref refs;
				238	struct list_head node;
				239	struct list_head rsrc_list;	/* NOTE(review): presumably a list of struct io_rsrc_put - confirm */
				240	struct io_rsrc_data *rsrc_data;
				241	struct llist_node llist;	/* NOTE(review): presumably queued on ctx->rsrc_put_llist - confirm */
				242	bool done;
				243	};
				244
				245	typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);	/* function type (not a pointer): used as 'rsrc_put_fn *do_put' in io_rsrc_data; takes the ring context and the resource to put, both by pointer */
				246
				247	struct io_rsrc_data {
					/* Bookkeeping for one resource class; io_ring_ctx keeps one for files (file_data) and one for buffers (buf_data). */
				248	struct io_ring_ctx *ctx;	/* owning ring context */
				249
				250	u64 **tags;	/* two-level table of u64 tags */
				251	unsigned int nr;
				252	rsrc_put_fn *do_put;	/* callback of type rsrc_put_fn */
				253	atomic_t refs;
				254	struct completion done;
				255	bool quiesce;
				256	};
				257
				258	struct io_buffer {
					/* One buffer record with list linkage; requests point at one via their 'kbuf' field. */
				259	struct list_head list;
				260	__u64 addr;
				261	__u32 len;
				262	__u16 bid;	/* NOTE(review): presumably the buffer ID within its group - confirm */
				263	};
				264
				265	struct io_restriction {
					/* Restriction state; embedded in io_ring_ctx as 'restrictions'. */
				266	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);	/* one bit per register opcode */
				267	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);	/* one bit per SQE opcode */
				268	u8 sqe_flags_allowed;
				269	u8 sqe_flags_required;
				270	bool registered;
				271	};
				272
				273	enum {
					/* NOTE(review): presumably bit numbers within io_sq_data.state - confirm at the use sites. */
				274	IO_SQ_THREAD_SHOULD_STOP = 0,
				275	IO_SQ_THREAD_SHOULD_PARK,	/* implicitly 1 */
				276	};
				277
				278	struct io_sq_data {
					/* State of an SQ polling thread; reference counted, and ctx_list tracks the ring contexts using it. */
				279	refcount_t refs;
				280	atomic_t park_pending;
				281	struct mutex lock;
				282
				283	/* ctx's that are using this sqd */
				284	struct list_head ctx_list;
				285
				286	struct task_struct *thread;	/* the polling thread's task */
				287	struct wait_queue_head wait;
				288
				289	unsigned sq_thread_idle;
				290	int sq_cpu;
				291	pid_t task_pid;
				292	pid_t task_tgid;
				293
				294	unsigned long state;	/* NOTE(review): presumably holds the IO_SQ_THREAD_* bits - confirm */
				295	struct completion exited;
				296	};
				297
				298	#define IO_COMPL_BATCH 32	/* capacity of io_submit_state.compl_reqs[] */
				299	#define IO_REQ_CACHE_SIZE 32	/* capacity of io_submit_state.reqs[] */
				300	#define IO_REQ_ALLOC_BATCH 8
				301
				302	struct io_submit_link {
					/* Head and last request of a link chain; embedded in io_submit_state as 'link'. */
				303	struct io_kiocb *head;
				304	struct io_kiocb *last;
				305	};
				306
				307	struct io_submit_state {
					/* Submission-time scratch state; embedded in io_ring_ctx as 'submit_state'. */
				308	struct blk_plug plug;
				309	struct io_submit_link link;
				310
				311	/*
				312	* io_kiocb alloc cache
				313	*/
				314	void *reqs[IO_REQ_CACHE_SIZE];
				315	unsigned int free_reqs;	/* NOTE(review): presumably the number of valid entries in reqs[] - confirm */
				316
				317	bool plug_started;
				318
				319	/*
				320	* Batch completion logic
				321	*/
				322	struct io_kiocb *compl_reqs[IO_COMPL_BATCH];
				323	unsigned int compl_nr;	/* NOTE(review): presumably the number of valid entries in compl_reqs[] - confirm */
				324	/* inline/task_work completion list, under ->uring_lock */
				325	struct list_head free_list;
				326
				327	unsigned int ios_left;
				328	};
				329
				330	struct io_ring_ctx {
					/*
					 * Per-ring context. Fields are grouped into anonymous sub-structs; the
					 * groups marked ____cacheline_aligned_in_smp each carry their own
					 * cache-line alignment attribute.
					 */
				331	/* const or read-mostly hot data */
				332	struct {
				333	struct percpu_ref refs;
				334
				335	struct io_rings *rings;	/* the ring memory shared with the application (see struct io_rings) */
				336	unsigned int flags;
					/* six single-bit flags packed into one bitfield */
				337	unsigned int compat: 1;
				338	unsigned int drain_next: 1;
				339	unsigned int eventfd_async: 1;
				340	unsigned int restricted: 1;
				341	unsigned int off_timeout_used: 1;
				342	unsigned int drain_active: 1;
				343	} ____cacheline_aligned_in_smp;
				344
				345	/* submission data */
				346	struct {
				347	struct mutex uring_lock;
				348
				349	/*
				350	* Ring buffer of indices into array of io_uring_sqe, which is
				351	* mmapped by the application using the IORING_OFF_SQES offset.
				352	*
				353	* This indirection could e.g. be used to assign fixed
				354	* io_uring_sqe entries to operations and only submit them to
				355	* the queue when needed.
				356	*
				357	* The kernel modifies neither the indices array nor the entries
				358	* array.
				359	*/
				360	u32 *sq_array;
				361	struct io_uring_sqe *sq_sqes;
				362	unsigned cached_sq_head;
				363	unsigned sq_entries;
				364	struct list_head defer_list;
				365
				366	/*
				367	* Fixed resources fast path, should be accessed only under
				368	* uring_lock, and updated through io_uring_register(2)
				369	*/
				370	struct io_rsrc_node *rsrc_node;
				371	struct io_file_table file_table;
				372	unsigned nr_user_files;
				373	unsigned nr_user_bufs;
				374	struct io_mapped_ubuf **user_bufs;
				375
				376	struct io_submit_state submit_state;
				377	struct list_head timeout_list;
				378	struct list_head ltimeout_list;
				379	struct list_head cq_overflow_list;
				380	struct xarray io_buffers;
				381	struct xarray personalities;
				382	u32 pers_next;
				383	unsigned sq_thread_idle;
				384	} ____cacheline_aligned_in_smp;
				385
				386	/* IRQ completion list, under ->completion_lock */
				387	struct list_head locked_free_list;
				388	unsigned int locked_free_nr;
				389
				390	const struct cred *sq_creds; /* cred used for __io_sq_thread() */
				391	struct io_sq_data *sq_data; /* if using sq thread polling */
				392
				393	struct wait_queue_head sqo_sq_wait;
				394	struct list_head sqd_list;
				395
				396	unsigned long check_cq_overflow;
				397
					/* completion-queue side state */
				398	struct {
				399	unsigned cached_cq_tail;
				400	unsigned cq_entries;
				401	struct eventfd_ctx *cq_ev_fd;
				402	struct wait_queue_head poll_wait;
				403	struct wait_queue_head cq_wait;
				404	unsigned cq_extra;
				405	atomic_t cq_timeouts;
				406	unsigned cq_last_tm_flush;
				407	} ____cacheline_aligned_in_smp;
				408
				409	struct {
				410	spinlock_t completion_lock;
				411
				412	spinlock_t timeout_lock;
				413
				414	/*
				415	* ->iopoll_list is protected by the ctx->uring_lock for
				416	* io_uring instances that don't use IORING_SETUP_SQPOLL.
				417	* For SQPOLL, only the single threaded io_sq_thread() will
				418	* manipulate the list, hence no extra locking is needed there.
				419	*/
				420	struct list_head iopoll_list;
				421	struct hlist_head *cancel_hash;
				422	unsigned cancel_hash_bits;
				423	bool poll_multi_queue;
				424	} ____cacheline_aligned_in_smp;
				425
				426	struct io_restriction restrictions;
				427
				428	/* slow path rsrc auxiliary data, used by update/register */
				429	struct {
				430	struct io_rsrc_node *rsrc_backup_node;
				431	struct io_mapped_ubuf *dummy_ubuf;
				432	struct io_rsrc_data *file_data;
				433	struct io_rsrc_data *buf_data;
				434
				435	struct delayed_work rsrc_put_work;
				436	struct llist_head rsrc_put_llist;
				437	struct list_head rsrc_ref_list;
				438	spinlock_t rsrc_ref_lock;
				439	};
				440
				441	/* Keep this last, we don't need it for the fast path */
				442	struct {
				443	#if defined(CONFIG_UNIX)
				444	struct socket *ring_sock;
				445	#endif
				446	/* hashed buffered write serialization */
				447	struct io_wq_hash *hash_map;
				448
				449	/* Only used for accounting purposes */
				450	struct user_struct *user;
				451	struct mm_struct *mm_account;
				452
				453	/* ctx exit and cancelation */
				454	struct llist_head fallback_llist;
				455	struct delayed_work fallback_work;
				456	struct work_struct exit_work;
				457	struct list_head tctx_list;
				458	struct completion ref_comp;
				459	u32 iowq_limits[2];
				460	bool iowq_limits_set;
				461	};
				462	};
				463
Greg Kroah-Hartman	302ed29	2023-01-11 14:46:15 +0000	[diff] [blame]	464	#ifndef __GENKSYMS__
				465	/*
				466	* ANDROID ABI HACK
				467	*
				468	* See the big comment in the linux/io_uring.h file for details. This
				469	* structure definition should NOT be used if __GENKSYMS__ is enabled,
				470	* as a "fake" structure definition has already been read in the
				471	* linux/io_uring.h file in order to preserve the Android kernel ABI.
				472	*/
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	473	struct io_uring_task {
					/* Only compiled when __GENKSYMS__ is not defined; see the ANDROID ABI HACK comment just above. */
				474	/* submission side */
				475	int cached_refs;
				476	struct xarray xa;
				477	struct wait_queue_head wait;
				478	const struct io_ring_ctx *last;
				479	struct io_wq *io_wq;
				480	struct percpu_counter inflight;
				481	atomic_t inflight_tracked;
				482	atomic_t in_idle;
				483
					/* task_work bookkeeping */
				484	spinlock_t task_lock;	/* NOTE(review): presumably protects task_list and task_running - confirm */
				485	struct io_wq_work_list task_list;
				486	struct callback_head task_work;
				487	bool task_running;
				488	};
Greg Kroah-Hartman	302ed29	2023-01-11 14:46:15 +0000	[diff] [blame]	489	#endif
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	490
				491	/*
				492	* First field must be the file pointer in all the
				493	* iocb unions! See also 'struct kiocb' in <linux/fs.h>
				494	*/
				495	struct io_poll_iocb {
					/* Poll state; used as io_kiocb's 'poll' union member and inside struct async_poll. */
				496	struct file *file;	/* kept first, as required for every iocb union member */
				497	struct wait_queue_head *head;
				498	__poll_t events;	/* poll event mask */
				499	struct wait_queue_entry wait;
				500	};
				501
				502	struct io_poll_update {
					/* Per-request fields stored in io_kiocb's union as 'poll_update'. */
				503	struct file *file;	/* kept first, as required for every iocb union member */
				504	u64 old_user_data;
				505	u64 new_user_data;
				506	__poll_t events;
				507	bool update_events;	/* NOTE(review): presumably selects whether 'events' is applied - confirm */
				508	bool update_user_data;	/* NOTE(review): presumably selects whether 'new_user_data' is applied - confirm */
				509	};
				510
				511	struct io_close {
					/* Per-request fields stored in io_kiocb's union as 'close'. */
				512	struct file *file;	/* kept first, as required for every iocb union member */
				513	int fd;
				514	u32 file_slot;
				515	};
				516
				517	struct io_timeout_data {
					/*
					 * Timer state for timeout requests. Its size is the async_size that
					 * io_op_defs[] declares for IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT.
					 * It is not a member of io_kiocb's union, so it has no leading file pointer.
					 */
				518	struct io_kiocb *req;	/* back-pointer to a request */
				519	struct hrtimer timer;
				520	struct timespec64 ts;
				521	enum hrtimer_mode mode;
				522	u32 flags;
				523	};
				524
				525	struct io_accept {
					/* Per-request fields stored in io_kiocb's union as 'accept'. */
				526	struct file *file;	/* kept first, as required for every iocb union member */
				527	struct sockaddr __user *addr;	/* user-space pointer */
				528	int __user *addr_len;	/* user-space pointer */
				529	int flags;
				530	u32 file_slot;
				531	unsigned long nofile;
				532	};
				533
				534	struct io_sync {
					/* Per-request fields stored in io_kiocb's union as 'sync'. */
				535	struct file *file;	/* kept first, as required for every iocb union member */
				536	loff_t len;
				537	loff_t off;
				538	int flags;
				539	int mode;
				540	};
				541
				542	struct io_cancel {
					/* Per-request fields stored in io_kiocb's union as 'cancel'. */
				543	struct file *file;	/* kept first, as required for every iocb union member */
				544	u64 addr;
				545	};
				546
				547	struct io_timeout {
					/* Per-request fields stored in io_kiocb's union as 'timeout'. */
				548	struct file *file;	/* kept first, as required for every iocb union member */
				549	u32 off;
				550	u32 target_seq;
				551	struct list_head list;	/* NOTE(review): presumably linked on ctx->timeout_list or ctx->ltimeout_list - confirm */
				552	/* head of the link, used by linked timeouts only */
				553	struct io_kiocb *head;
				554	/* for linked completions */
				555	struct io_kiocb *prev;
				556	};
				557
				558	struct io_timeout_rem {
					/* Per-request fields stored in io_kiocb's union as 'timeout_rem'. */
				559	struct file *file;	/* kept first, as required for every iocb union member */
				560	u64 addr;
				561
				562	/* timeout update */
				563	struct timespec64 ts;
				564	u32 flags;
				565	bool ltimeout;
				566	};
				567
				568	struct io_rw {
					/* Per-request fields stored in io_kiocb's union as 'rw'. */
				569	/* NOTE: kiocb has the file as the first member, so don't do it here */
				570	struct kiocb kiocb;
				571	u64 addr;
				572	u64 len;
				573	};
				574
				575	struct io_connect {
					/* Per-request fields stored in io_kiocb's union as 'connect'. */
				576	struct file *file;	/* kept first, as required for every iocb union member */
				577	struct sockaddr __user *addr;	/* user-space pointer */
				578	int addr_len;
				579	};
				580
				581	struct io_sr_msg {
					/* Per-request fields stored in io_kiocb's union as 'sr_msg'. */
				582	struct file *file;	/* kept first, as required for every iocb union member */
					/* Three user-space pointers sharing one slot; only one view is meaningful for a given request. */
				583	union {
				584	struct compat_msghdr __user *umsg_compat;
				585	struct user_msghdr __user *umsg;
				586	void __user *buf;
				587	};
				588	int msg_flags;
				589	int bgid;
				590	size_t len;
Jens Axboe	82826a6	2023-01-21 10:21:22 -0700	[diff] [blame]	591	size_t done_io;	/* NOTE(review): presumably bytes already transferred by earlier partial attempts - confirm */
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	592	struct io_buffer *kbuf;
				593	};
				594
				595	struct io_open {
					/* Per-request fields stored in io_kiocb's union as 'open'. */
				596	struct file *file;	/* kept first, as required for every iocb union member */
				597	int dfd;
				598	u32 file_slot;
				599	struct filename *filename;
				600	struct open_how how;
				601	unsigned long nofile;
				602	};
				603
				604	struct io_rsrc_update {
					/* Per-request fields stored in io_kiocb's union as 'rsrc_update'. */
				605	struct file *file;	/* kept first, as required for every iocb union member */
				606	u64 arg;
				607	u32 nr_args;
				608	u32 offset;
				609	};
				610
				611	struct io_fadvise {
					/* Per-request fields stored in io_kiocb's union as 'fadvise'. */
				612	struct file *file;	/* kept first, as required for every iocb union member */
				613	u64 offset;
				614	u32 len;
				615	u32 advice;
				616	};
				617
				618	struct io_madvise {
					/* Per-request fields stored in io_kiocb's union as 'madvise'. */
				619	struct file *file;	/* kept first, as required for every iocb union member */
				620	u64 addr;
				621	u32 len;
				622	u32 advice;
				623	};
				624
				625	struct io_epoll {
					/* Per-request fields stored in io_kiocb's union as 'epoll'. */
				626	struct file *file;	/* kept first, as required for every iocb union member */
				627	int epfd;
				628	int op;
				629	int fd;
				630	struct epoll_event event;	/* held by value, not as a pointer */
				631	};
				632
				633	struct io_splice {
					/* Per-request fields stored in io_kiocb's union as 'splice'. */
				634	struct file *file_out;	/* still the first member and a file pointer, although named file_out rather than file */
				635	loff_t off_out;
				636	loff_t off_in;
				637	u64 len;
				638	int splice_fd_in;
				639	unsigned int flags;
				640	};
				641
				642	struct io_provide_buf {
					/* Per-request fields stored in io_kiocb's union as 'pbuf'. */
				643	struct file *file;	/* kept first, as required for every iocb union member */
				644	__u64 addr;
				645	__u32 len;
				646	__u32 bgid;
				647	__u16 nbufs;
				648	__u16 bid;
				649	};
				650
				651	struct io_statx {
					/* Per-request fields stored in io_kiocb's union as 'statx'. */
				652	struct file *file;	/* kept first, as required for every iocb union member */
				653	int dfd;
				654	unsigned int mask;
				655	unsigned int flags;
				656	const char __user *filename;	/* user-space pointer */
				657	struct statx __user *buffer;	/* user-space pointer */
				658	};
				659
				660	struct io_shutdown {
					/* Per-request fields stored in io_kiocb's union as 'shutdown'. */
				661	struct file *file;	/* kept first, as required for every iocb union member */
				662	int how;
				663	};
				664
				665	struct io_rename {
					/* Per-request fields stored in io_kiocb's union as 'rename'. */
				666	struct file *file;	/* kept first, as required for every iocb union member */
				667	int old_dfd;
				668	int new_dfd;
				669	struct filename *oldpath;
				670	struct filename *newpath;
				671	int flags;
				672	};
				673
				674	struct io_unlink {
					/* Per-request fields stored in io_kiocb's union as 'unlink'. */
				675	struct file *file;	/* kept first, as required for every iocb union member */
				676	int dfd;
				677	int flags;
				678	struct filename *filename;
				679	};
				680
				681	struct io_mkdir {
					/* Per-request fields stored in io_kiocb's union as 'mkdir'. */
				682	struct file *file;	/* kept first, as required for every iocb union member */
				683	int dfd;
				684	umode_t mode;
				685	struct filename *filename;
				686	};
				687
				688	struct io_symlink {
					/* Per-request fields stored in io_kiocb's union as 'symlink'. */
				689	struct file *file;	/* kept first, as required for every iocb union member */
				690	int new_dfd;
				691	struct filename *oldpath;
				692	struct filename *newpath;
				693	};
				694
				695	struct io_hardlink {
					/* Per-request fields stored in io_kiocb's union as 'hardlink'; same field list as struct io_rename. */
				696	struct file *file;	/* kept first, as required for every iocb union member */
				697	int old_dfd;
				698	int new_dfd;
				699	struct filename *oldpath;
				700	struct filename *newpath;
				701	int flags;
				702	};
				703
				704	struct io_completion {
					/*
					 * Stored in io_kiocb's union as 'compl'. Because it overlays the per-op
					 * data, io_kiocb notes it may only be used after that data was cleaned.
					 */
				705	struct file *file;	/* kept first, as required for every iocb union member */
				706	u32 cflags;
				707	};
				708
				709	struct io_async_connect {
					/* Its size is the async_size that io_op_defs[] declares for IORING_OP_CONNECT. */
				710	struct sockaddr_storage address;
				711	};
				712
				713	struct io_async_msghdr {
					/* Its size is the async_size that io_op_defs[] declares for IORING_OP_SENDMSG and IORING_OP_RECVMSG. */
				714	struct iovec fast_iov[UIO_FASTIOV];	/* inline iovec storage, UIO_FASTIOV entries */
				715	/* points to an allocated iov, if NULL we use fast_iov instead */
				716	struct iovec *free_iov;
				717	struct sockaddr __user *uaddr;	/* user-space pointer */
				718	struct msghdr msg;
				719	struct sockaddr_storage addr;
				720	};
				721
				722	struct io_async_rw {
					/* Its size is the async_size that io_op_defs[] declares for the read/write opcodes (READV, WRITEV, READ_FIXED, WRITE_FIXED, READ, WRITE). */
				723	struct iovec fast_iov[UIO_FASTIOV];	/* inline iovec storage, UIO_FASTIOV entries */
				724	const struct iovec *free_iovec;	/* NOTE(review): presumably an allocated iovec, as with io_async_msghdr.free_iov - confirm */
				725	struct iov_iter iter;
				726	struct iov_iter_state iter_state;
				727	size_t bytes_done;
				728	struct wait_page_queue wpq;
				729	};
				730
				731	enum {
					/* Bit numbers for request flags. The first six reuse the IOSQE_*_BIT numbers of the user-visible SQE flags. */
				732	REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
				733	REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
				734	REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
				735	REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
				736	REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
				737	REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
				738
				739	/* first byte is taken by user flags, shift it to not overlap */
				740	REQ_F_FAIL_BIT = 8,	/* the enumerators below count up from here: 9, 10, ... */
				741	REQ_F_INFLIGHT_BIT,
				742	REQ_F_CUR_POS_BIT,
				743	REQ_F_NOWAIT_BIT,
				744	REQ_F_LINK_TIMEOUT_BIT,
				745	REQ_F_NEED_CLEANUP_BIT,
				746	REQ_F_POLLED_BIT,
				747	REQ_F_BUFFER_SELECTED_BIT,
				748	REQ_F_COMPLETE_INLINE_BIT,
				749	REQ_F_REISSUE_BIT,
				750	REQ_F_CREDS_BIT,
				751	REQ_F_REFCOUNT_BIT,
				752	REQ_F_ARM_LTIMEOUT_BIT,
Jens Axboe	c7d8511	2022-03-23 09:30:05 -0600	[diff] [blame]	753	REQ_F_PARTIAL_IO_BIT,
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	754	/* keep async read/write and isreg together and in order */
				755	REQ_F_NOWAIT_READ_BIT,
				756	REQ_F_NOWAIT_WRITE_BIT,
				757	REQ_F_ISREG_BIT,	/* evaluates to 24 with the list as it stands */
				758
				759	/* not a real bit, just to check we're not overflowing the space */
				760	__REQ_F_LAST_BIT,	/* 25 with the list as it stands; io_kiocb.flags is an unsigned int */
				761	};
				762
				763	enum {
					/* Mask values: each REQ_F_x is BIT(REQ_F_x_BIT), using the bit numbers from the enum above. */
				764	/* ctx owns file */
				765	REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
				766	/* drain existing IO first */
				767	REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
				768	/* linked sqes */
				769	REQ_F_LINK = BIT(REQ_F_LINK_BIT),
				770	/* doesn't sever on completion < 0 */
				771	REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
				772	/* IOSQE_ASYNC */
				773	REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
				774	/* IOSQE_BUFFER_SELECT */
				775	REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
				776
				777	/* fail rest of links */
				778	REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
				779	/* on inflight list, should be cancelled and waited on exit reliably */
				780	REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
				781	/* read/write uses file position */
				782	REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
				783	/* must not punt to workers */
				784	REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
				785	/* has or had linked timeout */
				786	REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
				787	/* needs cleanup */
				788	REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
				789	/* already went through poll handler */
				790	REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
				791	/* buffer already selected */
				792	REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
				793	/* completion is deferred through io_comp_state */
					/* NOTE(review): no io_comp_state type is visible in this part of the file; the batch-completion fields seen here live in struct io_submit_state - confirm the name is current */
				794	REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
				795	/* caller should reissue async */
				796	REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
				797	/* supports async reads */
				798	REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
				799	/* supports async writes */
				800	REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT),
				801	/* regular file */
				802	REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
				803	/* has creds assigned */
				804	REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
				805	/* skip refcounting if not set */
				806	REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
				807	/* there is a linked timeout that has to be armed */
				808	REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
Jens Axboe	c7d8511	2022-03-23 09:30:05 -0600	[diff] [blame]	809	/* request has already done partial IO */
				810	REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	811	};
				812
				813	struct async_poll {
					/* Poll state referenced from a request through io_kiocb.apoll. */
				814	struct io_poll_iocb poll;	/* embedded by value */
				815	struct io_poll_iocb *double_poll;	/* optional second poll entry, held by pointer */
				816	};
				817
				818	typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);	/* pointer-to-function type: struct io_task_work stores it directly as the member 'func', which a bare function type could not be */
				819
				820	struct io_task_work {
					/* Task-work descriptor; embedded in io_kiocb as 'io_task_work'. */
					/* The two list-node types share storage, so only one linkage is usable at a time. */
				821	union {
				822	struct io_wq_work_node node;
				823	struct llist_node fallback_node;
				824	};
				825	io_req_tw_func_t func;	/* callback of type io_req_tw_func_t */
				826	};
				827
				828	enum {
					/* Identifiers for the two resource kinds seen in this file: fixed files and buffers. */
				829	IORING_RSRC_FILE = 0,
				830	IORING_RSRC_BUFFER = 1,
				831	};
				832
				833	/*
				834	* NOTE! Each of the iocb union members has the file pointer
				835	* as the first entry in their struct definition. So you can
				836	* access the file pointer through any of the sub-structs,
				837	* or directly as just 'file' in this struct.
				838	*/
				839	struct io_kiocb {
					/* One request. The leading union overlays the per-opcode structs. */
				840	union {
				841	struct file *file;	/* the shared first member of every struct in this union */
				842	struct io_rw rw;
				843	struct io_poll_iocb poll;
				844	struct io_poll_update poll_update;
				845	struct io_accept accept;
				846	struct io_sync sync;
				847	struct io_cancel cancel;
				848	struct io_timeout timeout;
				849	struct io_timeout_rem timeout_rem;
				850	struct io_connect connect;
				851	struct io_sr_msg sr_msg;
				852	struct io_open open;
				853	struct io_close close;
				854	struct io_rsrc_update rsrc_update;
				855	struct io_fadvise fadvise;
				856	struct io_madvise madvise;
				857	struct io_epoll epoll;
				858	struct io_splice splice;
				859	struct io_provide_buf pbuf;
				860	struct io_statx statx;
				861	struct io_shutdown shutdown;
				862	struct io_rename rename;
				863	struct io_unlink unlink;
				864	struct io_mkdir mkdir;
				865	struct io_symlink symlink;
				866	struct io_hardlink hardlink;
				867	/* use only after cleaning per-op data, see io_clean_op() */
				868	struct io_completion compl;
				869	};
				870
				871	/* opcode allocated if it needs to store data for async defer */
				872	void *async_data;
				873	u8 opcode;
				874	/* polled IO has completed */
				875	u8 iopoll_completed;
				876
				877	u16 buf_index;
				878	u32 result;
				879
				880	struct io_ring_ctx *ctx;	/* ring context this request refers to */
				881	unsigned int flags;	/* NOTE(review): presumably a mask of REQ_F_* bits - confirm */
				882	atomic_t refs;
				883	struct task_struct *task;
				884	u64 user_data;
				885
				886	struct io_kiocb *link;	/* pointer to another request */
				887	struct percpu_ref *fixed_rsrc_refs;
				888
				889	/* used with ctx->iopoll_list with reads/writes */
				890	struct list_head inflight_entry;
				891	struct io_task_work io_task_work;
				892	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
				893	struct hlist_node hash_node;
				894	struct async_poll *apoll;
				895	struct io_wq_work work;
				896	const struct cred *creds;
				897
				898	/* store used ubuf, so we can prevent reloading */
				899	struct io_mapped_ubuf *imu;
				900	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
				901	struct io_buffer *kbuf;
				902	atomic_t poll_refs;
				903	};
				904
				905	struct io_tctx_node {
					/* Associates one task with one ring context. */
				906	struct list_head ctx_node;	/* NOTE(review): presumably linked on ctx->tctx_list - confirm */
				907	struct task_struct *task;
				908	struct io_ring_ctx *ctx;
				909	};
				910
				911	struct io_defer_entry {
					/* A request pointer and a sequence number, with list linkage. */
				912	struct list_head list;	/* NOTE(review): presumably linked on ctx->defer_list - confirm */
				913	struct io_kiocb *req;
				914	u32 seq;
				915	};
				916
				917	struct io_op_def {
					/* Static per-opcode properties; one entry per opcode lives in io_op_defs[]. All flags are single-bit bitfields. */
				918	/* needs req->file assigned */
				919	unsigned needs_file : 1;
				920	/* hash wq insertion if file is a regular file */
				921	unsigned hash_reg_file : 1;
				922	/* unbound wq insertion if file is a non-regular file */
				923	unsigned unbound_nonreg_file : 1;
				924	/* opcode is not supported by this kernel */
				925	unsigned not_supported : 1;
				926	/* set if opcode supports polled "wait" */
				927	unsigned pollin : 1;
				928	unsigned pollout : 1;
				929	/* op supports buffer selection */
				930	unsigned buffer_select : 1;
				931	/* do prep async if is going to be punted */
				932	unsigned needs_async_setup : 1;
				933	/* should block plug */
				934	unsigned plug : 1;
				935	/* size of async data needed, if any */
				936	unsigned short async_size;
				937	};
				938
				939	static const struct io_op_def io_op_defs[] = {
					/*
					 * Indexed by opcode through designated initializers. Any field that an
					 * entry does not list is zero, so "= {}" means no flags and no async data.
					 * NOTE(review): there are no entries for mkdir/symlink/hardlink although
					 * struct io_mkdir, io_symlink and io_hardlink are declared in this file -
					 * confirm whether those opcodes exist in this tree's uapi header.
					 */
				940	[IORING_OP_NOP] = {},
				941	[IORING_OP_READV] = {
				942	.needs_file = 1,
				943	.unbound_nonreg_file = 1,
				944	.pollin = 1,
				945	.buffer_select = 1,
				946	.needs_async_setup = 1,
				947	.plug = 1,
				948	.async_size = sizeof(struct io_async_rw),
				949	},
				950	[IORING_OP_WRITEV] = {
				951	.needs_file = 1,
				952	.hash_reg_file = 1,
				953	.unbound_nonreg_file = 1,
				954	.pollout = 1,
				955	.needs_async_setup = 1,
				956	.plug = 1,
				957	.async_size = sizeof(struct io_async_rw),
				958	},
				959	[IORING_OP_FSYNC] = {
				960	.needs_file = 1,
				961	},
				962	[IORING_OP_READ_FIXED] = {
				963	.needs_file = 1,
				964	.unbound_nonreg_file = 1,
				965	.pollin = 1,
				966	.plug = 1,
				967	.async_size = sizeof(struct io_async_rw),
				968	},
				969	[IORING_OP_WRITE_FIXED] = {
				970	.needs_file = 1,
				971	.hash_reg_file = 1,
				972	.unbound_nonreg_file = 1,
				973	.pollout = 1,
				974	.plug = 1,
				975	.async_size = sizeof(struct io_async_rw),
				976	},
				977	[IORING_OP_POLL_ADD] = {
				978	.needs_file = 1,
				979	.unbound_nonreg_file = 1,
				980	},
				981	[IORING_OP_POLL_REMOVE] = {},
				982	[IORING_OP_SYNC_FILE_RANGE] = {
				983	.needs_file = 1,
				984	},
				985	[IORING_OP_SENDMSG] = {
				986	.needs_file = 1,
				987	.unbound_nonreg_file = 1,
				988	.pollout = 1,
				989	.needs_async_setup = 1,
				990	.async_size = sizeof(struct io_async_msghdr),
				991	},
				992	[IORING_OP_RECVMSG] = {
				993	.needs_file = 1,
				994	.unbound_nonreg_file = 1,
				995	.pollin = 1,
				996	.buffer_select = 1,
				997	.needs_async_setup = 1,
				998	.async_size = sizeof(struct io_async_msghdr),
				999	},
				1000	[IORING_OP_TIMEOUT] = {
				1001	.async_size = sizeof(struct io_timeout_data),
				1002	},
				1003	[IORING_OP_TIMEOUT_REMOVE] = {
				1004	/* used by timeout updates' prep() */
				1005	},
				1006	[IORING_OP_ACCEPT] = {
				1007	.needs_file = 1,
				1008	.unbound_nonreg_file = 1,
				1009	.pollin = 1,
				1010	},
				1011	[IORING_OP_ASYNC_CANCEL] = {},
				1012	[IORING_OP_LINK_TIMEOUT] = {
				1013	.async_size = sizeof(struct io_timeout_data),
				1014	},
				1015	[IORING_OP_CONNECT] = {
				1016	.needs_file = 1,
				1017	.unbound_nonreg_file = 1,
				1018	.pollout = 1,
				1019	.needs_async_setup = 1,
				1020	.async_size = sizeof(struct io_async_connect),
				1021	},
				1022	[IORING_OP_FALLOCATE] = {
				1023	.needs_file = 1,
				1024	},
				1025	[IORING_OP_OPENAT] = {},
				1026	[IORING_OP_CLOSE] = {},
				1027	[IORING_OP_FILES_UPDATE] = {},
				1028	[IORING_OP_STATX] = {},
				1029	[IORING_OP_READ] = {
				1030	.needs_file = 1,
				1031	.unbound_nonreg_file = 1,
				1032	.pollin = 1,
				1033	.buffer_select = 1,
				1034	.plug = 1,
				1035	.async_size = sizeof(struct io_async_rw),
				1036	},
				1037	[IORING_OP_WRITE] = {
				1038	.needs_file = 1,
				1039	.hash_reg_file = 1,
				1040	.unbound_nonreg_file = 1,
				1041	.pollout = 1,
				1042	.plug = 1,
				1043	.async_size = sizeof(struct io_async_rw),
				1044	},
				1045	[IORING_OP_FADVISE] = {
				1046	.needs_file = 1,
				1047	},
				1048	[IORING_OP_MADVISE] = {},
				1049	[IORING_OP_SEND] = {
				1050	.needs_file = 1,
				1051	.unbound_nonreg_file = 1,
				1052	.pollout = 1,
				1053	},
				1054	[IORING_OP_RECV] = {
				1055	.needs_file = 1,
				1056	.unbound_nonreg_file = 1,
				1057	.pollin = 1,
				1058	.buffer_select = 1,
				1059	},
				1060	[IORING_OP_OPENAT2] = {
				1061	},
				1062	[IORING_OP_EPOLL_CTL] = {
				1063	.unbound_nonreg_file = 1,	/* the only entry that sets this flag without needs_file */
				1064	},
				1065	[IORING_OP_SPLICE] = {
				1066	.needs_file = 1,
				1067	.hash_reg_file = 1,
				1068	.unbound_nonreg_file = 1,
				1069	},
				1070	[IORING_OP_PROVIDE_BUFFERS] = {},
				1071	[IORING_OP_REMOVE_BUFFERS] = {},
				1072	[IORING_OP_TEE] = {
				1073	.needs_file = 1,
				1074	.hash_reg_file = 1,
				1075	.unbound_nonreg_file = 1,
				1076	},
				1077	[IORING_OP_SHUTDOWN] = {
				1078	.needs_file = 1,
				1079	},
				1080	[IORING_OP_RENAMEAT] = {},
				1081	[IORING_OP_UNLINKAT] = {},
				1082	};
				1083
				1084	/* requests with any of those set should undergo io_disarm_next() */
				1085	#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT \| REQ_F_LINK_TIMEOUT \| REQ_F_FAIL)
				1086
				1087	static bool io_disarm_next(struct io_kiocb *req);
				1088	static void io_uring_del_tctx_node(unsigned long index);
				1089	static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
				1090	struct task_struct *task,
				1091	bool cancel_all);
				1092	static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
				1093
				1094	static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);
				1095
				1096	static void io_put_req(struct io_kiocb *req);
				1097	static void io_put_req_deferred(struct io_kiocb *req);
				1098	static void io_dismantle_req(struct io_kiocb *req);
				1099	static void io_queue_linked_timeout(struct io_kiocb *req);
				1100	static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				1101	struct io_uring_rsrc_update2 *up,
				1102	unsigned nr_args);
				1103	static void io_clean_op(struct io_kiocb *req);
				1104	static struct file io_file_get(struct io_ring_ctx ctx,
Bing-Jhong Billy Jheng	be56ff5	2023-03-02 21:00:06 +0800	[diff] [blame]	1105	struct io_kiocb *req, int fd, bool fixed,
				1106	unsigned int issue_flags);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	1107	static void __io_queue_sqe(struct io_kiocb *req);
				1108	static void io_rsrc_put_work(struct work_struct *work);
				1109
				1110	static void io_req_task_queue(struct io_kiocb *req);
				1111	static void io_submit_flush_completions(struct io_ring_ctx *ctx);
				1112	static int io_req_prep_async(struct io_kiocb *req);
				1113
				1114	static int io_install_fixed_file(struct io_kiocb req, struct file file,
				1115	unsigned int issue_flags, u32 slot_index);
				1116	static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
				1117
				1118	static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
				1119
				1120	static struct kmem_cache *req_cachep;
				1121
				1122	static const struct file_operations io_uring_fops;
				1123
				1124	struct sock io_uring_get_socket(struct file file)
				1125	{
				1126	#if defined(CONFIG_UNIX)
				1127	if (file->f_op == &io_uring_fops) {
				1128	struct io_ring_ctx *ctx = file->private_data;
				1129
				1130	return ctx->ring_sock->sk;
				1131	}
				1132	#endif
				1133	return NULL;
				1134	}
				1135	EXPORT_SYMBOL(io_uring_get_socket);
				1136
				1137	static inline void io_tw_lock(struct io_ring_ctx ctx, bool locked)
				1138	{
				1139	if (!*locked) {
				1140	mutex_lock(&ctx->uring_lock);
				1141	*locked = true;
				1142	}
				1143	}
				1144
				1145	#define io_for_each_link(pos, head) \
				1146	for (pos = (head); pos; pos = pos->link)
				1147
				1148	/*
				1149	* Shamelessly stolen from the mm implementation of page reference checking,
				1150	* see commit f958d7b528b1 for details.
				1151	*/
				1152	#define req_ref_zero_or_close_to_overflow(req) \
				1153	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
				1154
				1155	static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
				1156	{
				1157	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
				1158	return atomic_inc_not_zero(&req->refs);
				1159	}
				1160
				1161	static inline bool req_ref_put_and_test(struct io_kiocb *req)
				1162	{
				1163	if (likely(!(req->flags & REQ_F_REFCOUNT)))
				1164	return true;
				1165
				1166	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
				1167	return atomic_dec_and_test(&req->refs);
				1168	}
				1169
				1170	static inline void req_ref_get(struct io_kiocb *req)
				1171	{
				1172	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
				1173	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
				1174	atomic_inc(&req->refs);
				1175	}
				1176
				1177	static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
				1178	{
				1179	if (!(req->flags & REQ_F_REFCOUNT)) {
				1180	req->flags \|= REQ_F_REFCOUNT;
				1181	atomic_set(&req->refs, nr);
				1182	}
				1183	}
				1184
				1185	static inline void io_req_set_refcount(struct io_kiocb *req)
				1186	{
				1187	__io_req_set_refcount(req, 1);
				1188	}
				1189
				1190	static inline void io_req_set_rsrc_node(struct io_kiocb *req)
				1191	{
				1192	struct io_ring_ctx *ctx = req->ctx;
				1193
				1194	if (!req->fixed_rsrc_refs) {
				1195	req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
				1196	percpu_ref_get(req->fixed_rsrc_refs);
				1197	}
				1198	}
				1199
				1200	static void io_refs_resurrect(struct percpu_ref ref, struct completion compl)
				1201	{
				1202	bool got = percpu_ref_tryget(ref);
				1203
				1204	/* already at zero, wait for ->release() */
				1205	if (!got)
				1206	wait_for_completion(compl);
				1207	percpu_ref_resurrect(ref);
				1208	if (got)
				1209	percpu_ref_put(ref);
				1210	}
				1211
				1212	static bool io_match_task(struct io_kiocb head, struct task_struct task,
				1213	bool cancel_all)
				1214	__must_hold(&req->ctx->timeout_lock)
				1215	{
				1216	struct io_kiocb *req;
				1217
				1218	if (task && head->task != task)
				1219	return false;
				1220	if (cancel_all)
				1221	return true;
				1222
				1223	io_for_each_link(req, head) {
				1224	if (req->flags & REQ_F_INFLIGHT)
				1225	return true;
				1226	}
				1227	return false;
				1228	}
				1229
				1230	static bool io_match_linked(struct io_kiocb *head)
				1231	{
				1232	struct io_kiocb *req;
				1233
				1234	io_for_each_link(req, head) {
				1235	if (req->flags & REQ_F_INFLIGHT)
				1236	return true;
				1237	}
				1238	return false;
				1239	}
				1240
				1241	/*
				1242	* As io_match_task() but protected against racing with linked timeouts.
				1243	* User must not hold timeout_lock.
				1244	*/
				1245	static bool io_match_task_safe(struct io_kiocb head, struct task_struct task,
				1246	bool cancel_all)
				1247	{
				1248	bool matched;
				1249
				1250	if (task && head->task != task)
				1251	return false;
				1252	if (cancel_all)
				1253	return true;
				1254
				1255	if (head->flags & REQ_F_LINK_TIMEOUT) {
				1256	struct io_ring_ctx *ctx = head->ctx;
				1257
				1258	/* protect against races with linked timeouts */
				1259	spin_lock_irq(&ctx->timeout_lock);
				1260	matched = io_match_linked(head);
				1261	spin_unlock_irq(&ctx->timeout_lock);
				1262	} else {
				1263	matched = io_match_linked(head);
				1264	}
				1265	return matched;
				1266	}
				1267
				1268	static inline void req_set_fail(struct io_kiocb *req)
				1269	{
				1270	req->flags \|= REQ_F_FAIL;
				1271	}
				1272
				1273	static inline void req_fail_link_node(struct io_kiocb *req, int res)
				1274	{
				1275	req_set_fail(req);
				1276	req->result = res;
				1277	}
				1278
				1279	static void io_ring_ctx_ref_free(struct percpu_ref *ref)
				1280	{
				1281	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
				1282
				1283	complete(&ctx->ref_comp);
				1284	}
				1285
				1286	static inline bool io_is_timeout_noseq(struct io_kiocb *req)
				1287	{
				1288	return !req->timeout.off;
				1289	}
				1290
				1291	static void io_fallback_req_func(struct work_struct *work)
				1292	{
				1293	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
				1294	fallback_work.work);
				1295	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
				1296	struct io_kiocb req, tmp;
				1297	bool locked = false;
				1298
				1299	percpu_ref_get(&ctx->refs);
				1300	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
				1301	req->io_task_work.func(req, &locked);
				1302
				1303	if (locked) {
				1304	if (ctx->submit_state.compl_nr)
				1305	io_submit_flush_completions(ctx);
				1306	mutex_unlock(&ctx->uring_lock);
				1307	}
				1308	percpu_ref_put(&ctx->refs);
				1309
				1310	}
				1311
				1312	static struct io_ring_ctx io_ring_ctx_alloc(struct io_uring_params p)
				1313	{
				1314	struct io_ring_ctx *ctx;
				1315	int hash_bits;
				1316
				1317	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
				1318	if (!ctx)
				1319	return NULL;
				1320
				1321	/*
				1322	* Use 5 bits less than the max cq entries, that should give us around
				1323	* 32 entries per hash list if totally full and uniformly spread.
				1324	*/
				1325	hash_bits = ilog2(p->cq_entries);
				1326	hash_bits -= 5;
				1327	if (hash_bits <= 0)
				1328	hash_bits = 1;
				1329	ctx->cancel_hash_bits = hash_bits;
				1330	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
				1331	GFP_KERNEL);
				1332	if (!ctx->cancel_hash)
				1333	goto err;
				1334	__hash_init(ctx->cancel_hash, 1U << hash_bits);
				1335
				1336	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
				1337	if (!ctx->dummy_ubuf)
				1338	goto err;
				1339	/* set invalid range, so io_import_fixed() fails meeting it */
				1340	ctx->dummy_ubuf->ubuf = -1UL;
				1341
				1342	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
				1343	PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
				1344	goto err;
				1345
				1346	ctx->flags = p->flags;
				1347	init_waitqueue_head(&ctx->sqo_sq_wait);
				1348	INIT_LIST_HEAD(&ctx->sqd_list);
				1349	init_waitqueue_head(&ctx->poll_wait);
				1350	INIT_LIST_HEAD(&ctx->cq_overflow_list);
				1351	init_completion(&ctx->ref_comp);
				1352	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
				1353	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
				1354	mutex_init(&ctx->uring_lock);
				1355	init_waitqueue_head(&ctx->cq_wait);
				1356	spin_lock_init(&ctx->completion_lock);
				1357	spin_lock_init(&ctx->timeout_lock);
				1358	INIT_LIST_HEAD(&ctx->iopoll_list);
				1359	INIT_LIST_HEAD(&ctx->defer_list);
				1360	INIT_LIST_HEAD(&ctx->timeout_list);
				1361	INIT_LIST_HEAD(&ctx->ltimeout_list);
				1362	spin_lock_init(&ctx->rsrc_ref_lock);
				1363	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
				1364	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
				1365	init_llist_head(&ctx->rsrc_put_llist);
				1366	INIT_LIST_HEAD(&ctx->tctx_list);
				1367	INIT_LIST_HEAD(&ctx->submit_state.free_list);
				1368	INIT_LIST_HEAD(&ctx->locked_free_list);
				1369	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
				1370	return ctx;
				1371	err:
				1372	kfree(ctx->dummy_ubuf);
				1373	kfree(ctx->cancel_hash);
				1374	kfree(ctx);
				1375	return NULL;
				1376	}
				1377
				1378	static void io_account_cq_overflow(struct io_ring_ctx *ctx)
				1379	{
				1380	struct io_rings *r = ctx->rings;
				1381
				1382	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
				1383	ctx->cq_extra--;
				1384	}
				1385
				1386	static bool req_need_defer(struct io_kiocb *req, u32 seq)
				1387	{
				1388	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
				1389	struct io_ring_ctx *ctx = req->ctx;
				1390
				1391	return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
				1392	}
				1393
				1394	return false;
				1395	}
				1396
				1397	#define FFS_ASYNC_READ 0x1UL
				1398	#define FFS_ASYNC_WRITE 0x2UL
				1399	#ifdef CONFIG_64BIT
				1400	#define FFS_ISREG 0x4UL
				1401	#else
				1402	#define FFS_ISREG 0x0UL
				1403	#endif
				1404	#define FFS_MASK ~(FFS_ASYNC_READ\|FFS_ASYNC_WRITE\|FFS_ISREG)
				1405
				1406	static inline bool io_req_ffs_set(struct io_kiocb *req)
				1407	{
				1408	return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
				1409	}
				1410
				1411	static void io_req_track_inflight(struct io_kiocb *req)
				1412	{
				1413	if (!(req->flags & REQ_F_INFLIGHT)) {
				1414	req->flags \|= REQ_F_INFLIGHT;
				1415	atomic_inc(&req->task->io_uring->inflight_tracked);
				1416	}
				1417	}
				1418
				1419	static struct io_kiocb __io_prep_linked_timeout(struct io_kiocb req)
				1420	{
				1421	if (WARN_ON_ONCE(!req->link))
				1422	return NULL;
				1423
				1424	req->flags &= ~REQ_F_ARM_LTIMEOUT;
				1425	req->flags \|= REQ_F_LINK_TIMEOUT;
				1426
				1427	/* linked timeouts should have two refs once prep'ed */
				1428	io_req_set_refcount(req);
				1429	__io_req_set_refcount(req->link, 2);
				1430	return req->link;
				1431	}
				1432
				1433	static inline struct io_kiocb io_prep_linked_timeout(struct io_kiocb req)
				1434	{
				1435	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
				1436	return NULL;
				1437	return __io_prep_linked_timeout(req);
				1438	}
				1439
				1440	static void io_prep_async_work(struct io_kiocb *req)
				1441	{
				1442	const struct io_op_def *def = &io_op_defs[req->opcode];
				1443	struct io_ring_ctx *ctx = req->ctx;
				1444
				1445	if (!(req->flags & REQ_F_CREDS)) {
				1446	req->flags \|= REQ_F_CREDS;
				1447	req->creds = get_current_cred();
				1448	}
				1449
				1450	req->work.list.next = NULL;
				1451	req->work.flags = 0;
				1452	if (req->flags & REQ_F_FORCE_ASYNC)
				1453	req->work.flags \|= IO_WQ_WORK_CONCURRENT;
				1454
				1455	if (req->flags & REQ_F_ISREG) {
				1456	if (def->hash_reg_file \|\| (ctx->flags & IORING_SETUP_IOPOLL))
				1457	io_wq_hash_work(&req->work, file_inode(req->file));
				1458	} else if (!req->file \|\| !S_ISBLK(file_inode(req->file)->i_mode)) {
				1459	if (def->unbound_nonreg_file)
				1460	req->work.flags \|= IO_WQ_WORK_UNBOUND;
				1461	}
				1462	}
				1463
				1464	static void io_prep_async_link(struct io_kiocb *req)
				1465	{
				1466	struct io_kiocb *cur;
				1467
				1468	if (req->flags & REQ_F_LINK_TIMEOUT) {
				1469	struct io_ring_ctx *ctx = req->ctx;
				1470
				1471	spin_lock_irq(&ctx->timeout_lock);
				1472	io_for_each_link(cur, req)
				1473	io_prep_async_work(cur);
				1474	spin_unlock_irq(&ctx->timeout_lock);
				1475	} else {
				1476	io_for_each_link(cur, req)
				1477	io_prep_async_work(cur);
				1478	}
				1479	}
				1480
				1481	static void io_queue_async_work(struct io_kiocb req, bool locked)
				1482	{
				1483	struct io_ring_ctx *ctx = req->ctx;
				1484	struct io_kiocb *link = io_prep_linked_timeout(req);
				1485	struct io_uring_task *tctx = req->task->io_uring;
				1486
				1487	/* must not take the lock, NULL it as a precaution */
				1488	locked = NULL;
				1489
				1490	BUG_ON(!tctx);
				1491	BUG_ON(!tctx->io_wq);
				1492
				1493	/* init ->work of the whole link before punting */
				1494	io_prep_async_link(req);
				1495
				1496	/*
				1497	* Not expected to happen, but if we do have a bug where this _can_
				1498	* happen, catch it here and ensure the request is marked as
				1499	* canceled. That will make io-wq go through the usual work cancel
				1500	* procedure rather than attempt to run this request (or create a new
				1501	* worker for it).
				1502	*/
				1503	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
				1504	req->work.flags \|= IO_WQ_WORK_CANCEL;
				1505
				1506	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
				1507	&req->work, req->flags);
				1508	io_wq_enqueue(tctx->io_wq, &req->work);
				1509	if (link)
				1510	io_queue_linked_timeout(link);
				1511	}
				1512
				1513	static void io_kill_timeout(struct io_kiocb *req, int status)
				1514	__must_hold(&req->ctx->completion_lock)
				1515	__must_hold(&req->ctx->timeout_lock)
				1516	{
				1517	struct io_timeout_data *io = req->async_data;
				1518
				1519	if (hrtimer_try_to_cancel(&io->timer) != -1) {
				1520	if (status)
				1521	req_set_fail(req);
				1522	atomic_set(&req->ctx->cq_timeouts,
				1523	atomic_read(&req->ctx->cq_timeouts) + 1);
				1524	list_del_init(&req->timeout.list);
				1525	io_fill_cqe_req(req, status, 0);
				1526	io_put_req_deferred(req);
				1527	}
				1528	}
				1529
				1530	static void io_queue_deferred(struct io_ring_ctx *ctx)
				1531	{
				1532	while (!list_empty(&ctx->defer_list)) {
				1533	struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
				1534	struct io_defer_entry, list);
				1535
				1536	if (req_need_defer(de->req, de->seq))
				1537	break;
				1538	list_del_init(&de->list);
				1539	io_req_task_queue(de->req);
				1540	kfree(de);
				1541	}
				1542	}
				1543
				1544	static void io_flush_timeouts(struct io_ring_ctx *ctx)
				1545	__must_hold(&ctx->completion_lock)
				1546	{
				1547	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
				1548	struct io_kiocb req, tmp;
				1549
				1550	spin_lock_irq(&ctx->timeout_lock);
				1551	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
				1552	u32 events_needed, events_got;
				1553
				1554	if (io_is_timeout_noseq(req))
				1555	break;
				1556
				1557	/*
				1558	* Since seq can easily wrap around over time, subtract
				1559	* the last seq at which timeouts were flushed before comparing.
				1560	* Assuming not more than 2^31-1 events have happened since,
				1561	* these subtractions won't have wrapped, so we can check if
				1562	* target is in [last_seq, current_seq] by comparing the two.
				1563	*/
				1564	events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
				1565	events_got = seq - ctx->cq_last_tm_flush;
				1566	if (events_got < events_needed)
				1567	break;
				1568
				1569	io_kill_timeout(req, 0);
				1570	}
				1571	ctx->cq_last_tm_flush = seq;
				1572	spin_unlock_irq(&ctx->timeout_lock);
				1573	}
				1574
				1575	static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
				1576	{
				1577	if (ctx->off_timeout_used)
				1578	io_flush_timeouts(ctx);
				1579	if (ctx->drain_active)
				1580	io_queue_deferred(ctx);
				1581	}
				1582
				1583	static inline void io_commit_cqring(struct io_ring_ctx *ctx)
				1584	{
				1585	if (unlikely(ctx->off_timeout_used \|\| ctx->drain_active))
				1586	__io_commit_cqring_flush(ctx);
				1587	/* order cqe stores with ring update */
				1588	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
				1589	}
				1590
				1591	static inline bool io_sqring_full(struct io_ring_ctx *ctx)
				1592	{
				1593	struct io_rings *r = ctx->rings;
				1594
				1595	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
				1596	}
				1597
				1598	static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
				1599	{
				1600	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
				1601	}
				1602
				1603	static inline struct io_uring_cqe io_get_cqe(struct io_ring_ctx ctx)
				1604	{
				1605	struct io_rings *rings = ctx->rings;
				1606	unsigned tail, mask = ctx->cq_entries - 1;
				1607
				1608	/*
				1609	* writes to the cq entry need to come after reading head; the
				1610	* control dependency is enough as we're using WRITE_ONCE to
				1611	* fill the cq entry
				1612	*/
				1613	if (__io_cqring_events(ctx) == ctx->cq_entries)
				1614	return NULL;
				1615
				1616	tail = ctx->cached_cq_tail++;
				1617	return &rings->cqes[tail & mask];
				1618	}
				1619
				1620	static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
				1621	{
				1622	if (likely(!ctx->cq_ev_fd))
				1623	return false;
				1624	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
				1625	return false;
				1626	return !ctx->eventfd_async \|\| io_wq_current_is_worker();
				1627	}
				1628
				1629	/*
				1630	* This should only get called when at least one event has been posted.
				1631	* Some applications rely on the eventfd notification count only changing
				1632	* IFF a new CQE has been added to the CQ ring. There's no depedency on
				1633	* 1:1 relationship between how many times this function is called (and
				1634	* hence the eventfd count) and number of CQEs posted to the CQ ring.
				1635	*/
				1636	static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
				1637	{
				1638	/*
				1639	* wake_up_all() may seem excessive, but io_wake_function() and
				1640	* io_should_wake() handle the termination of the loop and only
				1641	* wake as many waiters as we need to.
				1642	*/
				1643	if (wq_has_sleeper(&ctx->cq_wait))
Jens Axboe	b52fdbc	2022-12-23 07:04:49 -0700	[diff] [blame]	1644	__wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
				1645	poll_to_key(EPOLL_URING_WAKE \| EPOLLIN));
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	1646	if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
				1647	wake_up(&ctx->sq_data->wait);
				1648	if (io_should_trigger_evfd(ctx))
Jens Axboe	b52fdbc	2022-12-23 07:04:49 -0700	[diff] [blame]	1649	eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	1650	if (waitqueue_active(&ctx->poll_wait))
Jens Axboe	b52fdbc	2022-12-23 07:04:49 -0700	[diff] [blame]	1651	__wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0,
				1652	poll_to_key(EPOLL_URING_WAKE \| EPOLLIN));
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	1653	}
				1654
				1655	static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
				1656	{
				1657	/* see waitqueue_active() comment */
				1658	smp_mb();
				1659
				1660	if (ctx->flags & IORING_SETUP_SQPOLL) {
				1661	if (waitqueue_active(&ctx->cq_wait))
Jens Axboe	b52fdbc	2022-12-23 07:04:49 -0700	[diff] [blame]	1662	__wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
				1663	poll_to_key(EPOLL_URING_WAKE \| EPOLLIN));
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	1664	}
				1665	if (io_should_trigger_evfd(ctx))
Jens Axboe	b52fdbc	2022-12-23 07:04:49 -0700	[diff] [blame]	1666	eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	1667	if (waitqueue_active(&ctx->poll_wait))
Jens Axboe	b52fdbc	2022-12-23 07:04:49 -0700	[diff] [blame]	1668	__wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0,
				1669	poll_to_key(EPOLL_URING_WAKE \| EPOLLIN));
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	1670	}
				1671
				1672	/* Returns true if there are no backlogged entries after the flush */
				1673	static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
				1674	{
				1675	bool all_flushed, posted;
				1676
				1677	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
				1678	return false;
				1679
				1680	posted = false;
				1681	spin_lock(&ctx->completion_lock);
				1682	while (!list_empty(&ctx->cq_overflow_list)) {
				1683	struct io_uring_cqe *cqe = io_get_cqe(ctx);
				1684	struct io_overflow_cqe *ocqe;
				1685
				1686	if (!cqe && !force)
				1687	break;
				1688	ocqe = list_first_entry(&ctx->cq_overflow_list,
				1689	struct io_overflow_cqe, list);
				1690	if (cqe)
				1691	memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
				1692	else
				1693	io_account_cq_overflow(ctx);
				1694
				1695	posted = true;
				1696	list_del(&ocqe->list);
				1697	kfree(ocqe);
				1698	}
				1699
				1700	all_flushed = list_empty(&ctx->cq_overflow_list);
				1701	if (all_flushed) {
				1702	clear_bit(0, &ctx->check_cq_overflow);
				1703	WRITE_ONCE(ctx->rings->sq_flags,
				1704	ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
				1705	}
				1706
				1707	if (posted)
				1708	io_commit_cqring(ctx);
				1709	spin_unlock(&ctx->completion_lock);
				1710	if (posted)
				1711	io_cqring_ev_posted(ctx);
				1712	return all_flushed;
				1713	}
				1714
				1715	static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
				1716	{
				1717	bool ret = true;
				1718
				1719	if (test_bit(0, &ctx->check_cq_overflow)) {
				1720	/* iopoll syncs against uring_lock, not completion_lock */
				1721	if (ctx->flags & IORING_SETUP_IOPOLL)
				1722	mutex_lock(&ctx->uring_lock);
				1723	ret = __io_cqring_overflow_flush(ctx, false);
				1724	if (ctx->flags & IORING_SETUP_IOPOLL)
				1725	mutex_unlock(&ctx->uring_lock);
				1726	}
				1727
				1728	return ret;
				1729	}
				1730
				1731	/* must to be called somewhat shortly after putting a request */
				1732	static inline void io_put_task(struct task_struct *task, int nr)
				1733	{
				1734	struct io_uring_task *tctx = task->io_uring;
				1735
				1736	if (likely(task == current)) {
				1737	tctx->cached_refs += nr;
				1738	} else {
				1739	percpu_counter_sub(&tctx->inflight, nr);
				1740	if (unlikely(atomic_read(&tctx->in_idle)))
				1741	wake_up(&tctx->wait);
				1742	put_task_struct_many(task, nr);
				1743	}
				1744	}
				1745
				1746	static void io_task_refs_refill(struct io_uring_task *tctx)
				1747	{
				1748	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
				1749
				1750	percpu_counter_add(&tctx->inflight, refill);
				1751	refcount_add(refill, &current->usage);
				1752	tctx->cached_refs += refill;
				1753	}
				1754
				1755	static inline void io_get_task_refs(int nr)
				1756	{
				1757	struct io_uring_task *tctx = current->io_uring;
				1758
				1759	tctx->cached_refs -= nr;
				1760	if (unlikely(tctx->cached_refs < 0))
				1761	io_task_refs_refill(tctx);
				1762	}
				1763
				1764	static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
				1765	{
				1766	struct io_uring_task *tctx = task->io_uring;
				1767	unsigned int refs = tctx->cached_refs;
				1768
				1769	if (refs) {
				1770	tctx->cached_refs = 0;
				1771	percpu_counter_sub(&tctx->inflight, refs);
				1772	put_task_struct_many(task, refs);
				1773	}
				1774	}
				1775
				1776	static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
				1777	s32 res, u32 cflags)
				1778	{
				1779	struct io_overflow_cqe *ocqe;
				1780
				1781	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC \| __GFP_ACCOUNT);
				1782	if (!ocqe) {
				1783	/*
				1784	* If we're in ring overflow flush mode, or in task cancel mode,
				1785	* or cannot allocate an overflow entry, then we need to drop it
				1786	* on the floor.
				1787	*/
				1788	io_account_cq_overflow(ctx);
				1789	return false;
				1790	}
				1791	if (list_empty(&ctx->cq_overflow_list)) {
				1792	set_bit(0, &ctx->check_cq_overflow);
				1793	WRITE_ONCE(ctx->rings->sq_flags,
				1794	ctx->rings->sq_flags \| IORING_SQ_CQ_OVERFLOW);
				1795
				1796	}
				1797	ocqe->cqe.user_data = user_data;
				1798	ocqe->cqe.res = res;
				1799	ocqe->cqe.flags = cflags;
				1800	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
				1801	return true;
				1802	}
				1803
				1804	static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
				1805	s32 res, u32 cflags)
				1806	{
				1807	struct io_uring_cqe *cqe;
				1808
				1809	trace_io_uring_complete(ctx, user_data, res, cflags);
				1810
				1811	/*
				1812	* If we can't get a cq entry, userspace overflowed the
				1813	* submission (by quite a lot). Increment the overflow count in
				1814	* the ring.
				1815	*/
				1816	cqe = io_get_cqe(ctx);
				1817	if (likely(cqe)) {
				1818	WRITE_ONCE(cqe->user_data, user_data);
				1819	WRITE_ONCE(cqe->res, res);
				1820	WRITE_ONCE(cqe->flags, cflags);
				1821	return true;
				1822	}
				1823	return io_cqring_event_overflow(ctx, user_data, res, cflags);
				1824	}
				1825
				1826	static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
				1827	{
				1828	__io_fill_cqe(req->ctx, req->user_data, res, cflags);
				1829	}
				1830
				1831	static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
				1832	s32 res, u32 cflags)
				1833	{
				1834	ctx->cq_extra++;
				1835	return __io_fill_cqe(ctx, user_data, res, cflags);
				1836	}
				1837
				1838	static void io_req_complete_post(struct io_kiocb *req, s32 res,
				1839	u32 cflags)
				1840	{
				1841	struct io_ring_ctx *ctx = req->ctx;
				1842
				1843	spin_lock(&ctx->completion_lock);
				1844	__io_fill_cqe(ctx, req->user_data, res, cflags);
				1845	/*
				1846	* If we're the last reference to this request, add to our locked
				1847	* free_list cache.
				1848	*/
				1849	if (req_ref_put_and_test(req)) {
				1850	if (req->flags & (REQ_F_LINK \| REQ_F_HARDLINK)) {
				1851	if (req->flags & IO_DISARM_MASK)
				1852	io_disarm_next(req);
				1853	if (req->link) {
				1854	io_req_task_queue(req->link);
				1855	req->link = NULL;
				1856	}
				1857	}
				1858	io_dismantle_req(req);
				1859	io_put_task(req->task, 1);
				1860	list_add(&req->inflight_entry, &ctx->locked_free_list);
				1861	ctx->locked_free_nr++;
				1862	} else {
				1863	if (!percpu_ref_tryget(&ctx->refs))
				1864	req = NULL;
				1865	}
				1866	io_commit_cqring(ctx);
				1867	spin_unlock(&ctx->completion_lock);
				1868
				1869	if (req) {
				1870	io_cqring_ev_posted(ctx);
				1871	percpu_ref_put(&ctx->refs);
				1872	}
				1873	}
				1874
				1875	static inline bool io_req_needs_clean(struct io_kiocb *req)
				1876	{
				1877	return req->flags & IO_REQ_CLEAN_FLAGS;
				1878	}
				1879
				1880	static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
				1881	u32 cflags)
				1882	{
				1883	if (io_req_needs_clean(req))
				1884	io_clean_op(req);
				1885	req->result = res;
				1886	req->compl.cflags = cflags;
				1887	req->flags \|= REQ_F_COMPLETE_INLINE;
				1888	}
				1889
				1890	static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
				1891	s32 res, u32 cflags)
				1892	{
				1893	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
				1894	io_req_complete_state(req, res, cflags);
				1895	else
				1896	io_req_complete_post(req, res, cflags);
				1897	}
				1898
				1899	static inline void io_req_complete(struct io_kiocb *req, s32 res)
				1900	{
				1901	__io_req_complete(req, 0, res, 0);
				1902	}
				1903
				1904	static void io_req_complete_failed(struct io_kiocb *req, s32 res)
				1905	{
				1906	req_set_fail(req);
				1907	io_req_complete_post(req, res, 0);
				1908	}
				1909
				1910	static void io_req_complete_fail_submit(struct io_kiocb *req)
				1911	{
				1912	/*
				1913	* We don't submit, fail them all, for that replace hardlinks with
				1914	* normal links. Extra REQ_F_LINK is tolerated.
				1915	*/
				1916	req->flags &= ~REQ_F_HARDLINK;
				1917	req->flags \|= REQ_F_LINK;
				1918	io_req_complete_failed(req, req->result);
				1919	}
				1920
				1921	/*
				1922	* Don't initialise the fields below on every allocation, but do that in
				1923	* advance and keep them valid across allocations.
				1924	*/
				1925	static void io_preinit_req(struct io_kiocb req, struct io_ring_ctx ctx)
				1926	{
				1927	req->ctx = ctx;
				1928	req->link = NULL;
				1929	req->async_data = NULL;
				1930	/* not necessary, but safer to zero */
				1931	req->result = 0;
				1932	}
				1933
				1934	static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
				1935	struct io_submit_state *state)
				1936	{
				1937	spin_lock(&ctx->completion_lock);
				1938	list_splice_init(&ctx->locked_free_list, &state->free_list);
				1939	ctx->locked_free_nr = 0;
				1940	spin_unlock(&ctx->completion_lock);
				1941	}
				1942
				1943	/* Returns true IFF there are requests in the cache */
				1944	static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
				1945	{
				1946	struct io_submit_state *state = &ctx->submit_state;
				1947	int nr;
				1948
				1949	/*
				1950	* If we have more than a batch's worth of requests in our IRQ side
				1951	* locked cache, grab the lock and move them over to our submission
				1952	* side cache.
				1953	*/
				1954	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
				1955	io_flush_cached_locked_reqs(ctx, state);
				1956
				1957	nr = state->free_reqs;
				1958	while (!list_empty(&state->free_list)) {
				1959	struct io_kiocb *req = list_first_entry(&state->free_list,
				1960	struct io_kiocb, inflight_entry);
				1961
				1962	list_del(&req->inflight_entry);
				1963	state->reqs[nr++] = req;
				1964	if (nr == ARRAY_SIZE(state->reqs))
				1965	break;
				1966	}
				1967
				1968	state->free_reqs = nr;
				1969	return nr != 0;
				1970	}
				1971
				1972	/*
				1973	* A request might get retired back into the request caches even before opcode
				1974	* handlers and io_issue_sqe() are done with it, e.g. inline completion path.
				1975	* Because of that, io_alloc_req() should be called only under ->uring_lock
				1976	* and with extra caution to not get a request that is still worked on.
				1977	*/
				1978	static struct io_kiocb io_alloc_req(struct io_ring_ctx ctx)
				1979	__must_hold(&ctx->uring_lock)
				1980	{
				1981	struct io_submit_state *state = &ctx->submit_state;
				1982	gfp_t gfp = GFP_KERNEL \| __GFP_NOWARN;
				1983	int ret, i;
				1984
				1985	BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
				1986
				1987	if (likely(state->free_reqs \|\| io_flush_cached_reqs(ctx)))
				1988	goto got_req;
				1989
				1990	ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
				1991	state->reqs);
				1992
				1993	/*
				1994	* Bulk alloc is all-or-nothing. If we fail to get a batch,
				1995	* retry single alloc to be on the safe side.
				1996	*/
				1997	if (unlikely(ret <= 0)) {
				1998	state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
				1999	if (!state->reqs[0])
				2000	return NULL;
				2001	ret = 1;
				2002	}
				2003
				2004	for (i = 0; i < ret; i++)
				2005	io_preinit_req(state->reqs[i], ctx);
				2006	state->free_reqs = ret;
				2007	got_req:
				2008	state->free_reqs--;
				2009	return state->reqs[state->free_reqs];
				2010	}
				2011
				2012	static inline void io_put_file(struct file *file)
				2013	{
				2014	if (file)
				2015	fput(file);
				2016	}
				2017
				2018	static void io_dismantle_req(struct io_kiocb *req)
				2019	{
				2020	unsigned int flags = req->flags;
				2021
				2022	if (io_req_needs_clean(req))
				2023	io_clean_op(req);
				2024	if (!(flags & REQ_F_FIXED_FILE))
				2025	io_put_file(req->file);
				2026	if (req->fixed_rsrc_refs)
				2027	percpu_ref_put(req->fixed_rsrc_refs);
				2028	if (req->async_data) {
				2029	kfree(req->async_data);
				2030	req->async_data = NULL;
				2031	}
				2032	}
				2033
				2034	static void __io_free_req(struct io_kiocb *req)
				2035	{
				2036	struct io_ring_ctx *ctx = req->ctx;
				2037
				2038	io_dismantle_req(req);
				2039	io_put_task(req->task, 1);
				2040
				2041	spin_lock(&ctx->completion_lock);
				2042	list_add(&req->inflight_entry, &ctx->locked_free_list);
				2043	ctx->locked_free_nr++;
				2044	spin_unlock(&ctx->completion_lock);
				2045
				2046	percpu_ref_put(&ctx->refs);
				2047	}
				2048
				2049	static inline void io_remove_next_linked(struct io_kiocb *req)
				2050	{
				2051	struct io_kiocb *nxt = req->link;
				2052
				2053	req->link = nxt->link;
				2054	nxt->link = NULL;
				2055	}
				2056
				2057	static bool io_kill_linked_timeout(struct io_kiocb *req)
				2058	__must_hold(&req->ctx->completion_lock)
				2059	__must_hold(&req->ctx->timeout_lock)
				2060	{
				2061	struct io_kiocb *link = req->link;
				2062
				2063	if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
				2064	struct io_timeout_data *io = link->async_data;
				2065
				2066	io_remove_next_linked(req);
				2067	link->timeout.head = NULL;
				2068	if (hrtimer_try_to_cancel(&io->timer) != -1) {
				2069	list_del(&link->timeout.list);
				2070	io_fill_cqe_req(link, -ECANCELED, 0);
				2071	io_put_req_deferred(link);
				2072	return true;
				2073	}
				2074	}
				2075	return false;
				2076	}
				2077
				2078	static void io_fail_links(struct io_kiocb *req)
				2079	__must_hold(&req->ctx->completion_lock)
				2080	{
				2081	struct io_kiocb nxt, link = req->link;
				2082
				2083	req->link = NULL;
				2084	while (link) {
				2085	long res = -ECANCELED;
				2086
				2087	if (link->flags & REQ_F_FAIL)
				2088	res = link->result;
				2089
				2090	nxt = link->link;
				2091	link->link = NULL;
				2092
				2093	trace_io_uring_fail_link(req, link);
				2094	io_fill_cqe_req(link, res, 0);
				2095	io_put_req_deferred(link);
				2096	link = nxt;
				2097	}
				2098	}
				2099
				2100	static bool io_disarm_next(struct io_kiocb *req)
				2101	__must_hold(&req->ctx->completion_lock)
				2102	{
				2103	bool posted = false;
				2104
				2105	if (req->flags & REQ_F_ARM_LTIMEOUT) {
				2106	struct io_kiocb *link = req->link;
				2107
				2108	req->flags &= ~REQ_F_ARM_LTIMEOUT;
				2109	if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
				2110	io_remove_next_linked(req);
				2111	io_fill_cqe_req(link, -ECANCELED, 0);
				2112	io_put_req_deferred(link);
				2113	posted = true;
				2114	}
				2115	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
				2116	struct io_ring_ctx *ctx = req->ctx;
				2117
				2118	spin_lock_irq(&ctx->timeout_lock);
				2119	posted = io_kill_linked_timeout(req);
				2120	spin_unlock_irq(&ctx->timeout_lock);
				2121	}
				2122	if (unlikely((req->flags & REQ_F_FAIL) &&
				2123	!(req->flags & REQ_F_HARDLINK))) {
				2124	posted \|= (req->link != NULL);
				2125	io_fail_links(req);
				2126	}
				2127	return posted;
				2128	}
				2129
				2130	static struct io_kiocb __io_req_find_next(struct io_kiocb req)
				2131	{
				2132	struct io_kiocb *nxt;
				2133
				2134	/*
				2135	* If LINK is set, we have dependent requests in this chain. If we
				2136	* didn't fail this request, queue the first one up, moving any other
				2137	* dependencies to the next request. In case of failure, fail the rest
				2138	* of the chain.
				2139	*/
				2140	if (req->flags & IO_DISARM_MASK) {
				2141	struct io_ring_ctx *ctx = req->ctx;
				2142	bool posted;
				2143
				2144	spin_lock(&ctx->completion_lock);
				2145	posted = io_disarm_next(req);
				2146	if (posted)
				2147	io_commit_cqring(req->ctx);
				2148	spin_unlock(&ctx->completion_lock);
				2149	if (posted)
				2150	io_cqring_ev_posted(ctx);
				2151	}
				2152	nxt = req->link;
				2153	req->link = NULL;
				2154	return nxt;
				2155	}
				2156
				2157	static inline struct io_kiocb io_req_find_next(struct io_kiocb req)
				2158	{
				2159	if (likely(!(req->flags & (REQ_F_LINK\|REQ_F_HARDLINK))))
				2160	return NULL;
				2161	return __io_req_find_next(req);
				2162	}
				2163
				2164	static void ctx_flush_and_put(struct io_ring_ctx ctx, bool locked)
				2165	{
				2166	if (!ctx)
				2167	return;
				2168	if (*locked) {
				2169	if (ctx->submit_state.compl_nr)
				2170	io_submit_flush_completions(ctx);
				2171	mutex_unlock(&ctx->uring_lock);
				2172	*locked = false;
				2173	}
				2174	percpu_ref_put(&ctx->refs);
				2175	}
				2176
				2177	static void tctx_task_work(struct callback_head *cb)
				2178	{
				2179	bool locked = false;
				2180	struct io_ring_ctx *ctx = NULL;
				2181	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
				2182	task_work);
				2183
				2184	while (1) {
				2185	struct io_wq_work_node *node;
				2186
				2187	if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
				2188	io_submit_flush_completions(ctx);
				2189
				2190	spin_lock_irq(&tctx->task_lock);
				2191	node = tctx->task_list.first;
				2192	INIT_WQ_LIST(&tctx->task_list);
				2193	if (!node)
				2194	tctx->task_running = false;
				2195	spin_unlock_irq(&tctx->task_lock);
				2196	if (!node)
				2197	break;
				2198
				2199	do {
				2200	struct io_wq_work_node *next = node->next;
				2201	struct io_kiocb *req = container_of(node, struct io_kiocb,
				2202	io_task_work.node);
				2203
				2204	if (req->ctx != ctx) {
				2205	ctx_flush_and_put(ctx, &locked);
				2206	ctx = req->ctx;
				2207	/* if not contended, grab and improve batching */
				2208	locked = mutex_trylock(&ctx->uring_lock);
				2209	percpu_ref_get(&ctx->refs);
				2210	}
				2211	req->io_task_work.func(req, &locked);
				2212	node = next;
				2213	} while (node);
				2214
				2215	cond_resched();
				2216	}
				2217
				2218	ctx_flush_and_put(ctx, &locked);
				2219
				2220	/* relaxed read is enough as only the task itself sets ->in_idle */
				2221	if (unlikely(atomic_read(&tctx->in_idle)))
				2222	io_uring_drop_tctx_refs(current);
				2223	}
				2224
				2225	static void io_req_task_work_add(struct io_kiocb *req)
				2226	{
				2227	struct task_struct *tsk = req->task;
				2228	struct io_uring_task *tctx = tsk->io_uring;
				2229	enum task_work_notify_mode notify;
				2230	struct io_wq_work_node *node;
				2231	unsigned long flags;
				2232	bool running;
				2233
				2234	WARN_ON_ONCE(!tctx);
				2235
				2236	spin_lock_irqsave(&tctx->task_lock, flags);
				2237	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
				2238	running = tctx->task_running;
				2239	if (!running)
				2240	tctx->task_running = true;
				2241	spin_unlock_irqrestore(&tctx->task_lock, flags);
				2242
				2243	/* task_work already pending, we're done */
				2244	if (running)
				2245	return;
				2246
				2247	/*
				2248	* SQPOLL kernel thread doesn't need notification, just a wakeup. For
				2249	* all other cases, use TWA_SIGNAL unconditionally to ensure we're
				2250	* processing task_work. There's no reliable way to tell if TWA_RESUME
				2251	* will do the job.
				2252	*/
				2253	notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
				2254	if (!task_work_add(tsk, &tctx->task_work, notify)) {
				2255	wake_up_process(tsk);
				2256	return;
				2257	}
				2258
				2259	spin_lock_irqsave(&tctx->task_lock, flags);
				2260	tctx->task_running = false;
				2261	node = tctx->task_list.first;
				2262	INIT_WQ_LIST(&tctx->task_list);
				2263	spin_unlock_irqrestore(&tctx->task_lock, flags);
				2264
				2265	while (node) {
				2266	req = container_of(node, struct io_kiocb, io_task_work.node);
				2267	node = node->next;
				2268	if (llist_add(&req->io_task_work.fallback_node,
				2269	&req->ctx->fallback_llist))
				2270	schedule_delayed_work(&req->ctx->fallback_work, 1);
				2271	}
				2272	}
				2273
				2274	static void io_req_task_cancel(struct io_kiocb req, bool locked)
				2275	{
				2276	struct io_ring_ctx *ctx = req->ctx;
				2277
				2278	/* not needed for normal modes, but SQPOLL depends on it */
				2279	io_tw_lock(ctx, locked);
				2280	io_req_complete_failed(req, req->result);
				2281	}
				2282
				2283	static void io_req_task_submit(struct io_kiocb req, bool locked)
				2284	{
				2285	struct io_ring_ctx *ctx = req->ctx;
				2286
				2287	io_tw_lock(ctx, locked);
				2288	/* req->task == current here, checking PF_EXITING is safe */
				2289	if (likely(!(req->task->flags & PF_EXITING)))
				2290	__io_queue_sqe(req);
				2291	else
				2292	io_req_complete_failed(req, -EFAULT);
				2293	}
				2294
				2295	static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
				2296	{
				2297	req->result = ret;
				2298	req->io_task_work.func = io_req_task_cancel;
				2299	io_req_task_work_add(req);
				2300	}
				2301
				2302	static void io_req_task_queue(struct io_kiocb *req)
				2303	{
				2304	req->io_task_work.func = io_req_task_submit;
				2305	io_req_task_work_add(req);
				2306	}
				2307
				2308	static void io_req_task_queue_reissue(struct io_kiocb *req)
				2309	{
				2310	req->io_task_work.func = io_queue_async_work;
				2311	io_req_task_work_add(req);
				2312	}
				2313
				2314	static inline void io_queue_next(struct io_kiocb *req)
				2315	{
				2316	struct io_kiocb *nxt = io_req_find_next(req);
				2317
				2318	if (nxt)
				2319	io_req_task_queue(nxt);
				2320	}
				2321
				2322	static void io_free_req(struct io_kiocb *req)
				2323	{
				2324	io_queue_next(req);
				2325	__io_free_req(req);
				2326	}
				2327
				2328	static void io_free_req_work(struct io_kiocb req, bool locked)
				2329	{
				2330	io_free_req(req);
				2331	}
				2332
				2333	struct req_batch {
				2334	struct task_struct *task;
				2335	int task_refs;
				2336	int ctx_refs;
				2337	};
				2338
				2339	static inline void io_init_req_batch(struct req_batch *rb)
				2340	{
				2341	rb->task_refs = 0;
				2342	rb->ctx_refs = 0;
				2343	rb->task = NULL;
				2344	}
				2345
				2346	static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
				2347	struct req_batch *rb)
				2348	{
				2349	if (rb->ctx_refs)
				2350	percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
				2351	if (rb->task)
				2352	io_put_task(rb->task, rb->task_refs);
				2353	}
				2354
				2355	static void io_req_free_batch(struct req_batch rb, struct io_kiocb req,
				2356	struct io_submit_state *state)
				2357	{
				2358	io_queue_next(req);
				2359	io_dismantle_req(req);
				2360
				2361	if (req->task != rb->task) {
				2362	if (rb->task)
				2363	io_put_task(rb->task, rb->task_refs);
				2364	rb->task = req->task;
				2365	rb->task_refs = 0;
				2366	}
				2367	rb->task_refs++;
				2368	rb->ctx_refs++;
				2369
				2370	if (state->free_reqs != ARRAY_SIZE(state->reqs))
				2371	state->reqs[state->free_reqs++] = req;
				2372	else
				2373	list_add(&req->inflight_entry, &state->free_list);
				2374	}
				2375
				2376	static void io_submit_flush_completions(struct io_ring_ctx *ctx)
				2377	__must_hold(&ctx->uring_lock)
				2378	{
				2379	struct io_submit_state *state = &ctx->submit_state;
				2380	int i, nr = state->compl_nr;
				2381	struct req_batch rb;
				2382
				2383	spin_lock(&ctx->completion_lock);
				2384	for (i = 0; i < nr; i++) {
				2385	struct io_kiocb *req = state->compl_reqs[i];
				2386
				2387	__io_fill_cqe(ctx, req->user_data, req->result,
				2388	req->compl.cflags);
				2389	}
				2390	io_commit_cqring(ctx);
				2391	spin_unlock(&ctx->completion_lock);
				2392	io_cqring_ev_posted(ctx);
				2393
				2394	io_init_req_batch(&rb);
				2395	for (i = 0; i < nr; i++) {
				2396	struct io_kiocb *req = state->compl_reqs[i];
				2397
				2398	if (req_ref_put_and_test(req))
				2399	io_req_free_batch(&rb, req, &ctx->submit_state);
				2400	}
				2401
				2402	io_req_free_batch_finish(ctx, &rb);
				2403	state->compl_nr = 0;
				2404	}
				2405
				2406	/*
				2407	* Drop reference to request, return next in chain (if there is one) if this
				2408	* was the last reference to this request.
				2409	*/
				2410	static inline struct io_kiocb io_put_req_find_next(struct io_kiocb req)
				2411	{
				2412	struct io_kiocb *nxt = NULL;
				2413
				2414	if (req_ref_put_and_test(req)) {
				2415	nxt = io_req_find_next(req);
				2416	__io_free_req(req);
				2417	}
				2418	return nxt;
				2419	}
				2420
				2421	static inline void io_put_req(struct io_kiocb *req)
				2422	{
				2423	if (req_ref_put_and_test(req))
				2424	io_free_req(req);
				2425	}
				2426
				2427	static inline void io_put_req_deferred(struct io_kiocb *req)
				2428	{
				2429	if (req_ref_put_and_test(req)) {
				2430	req->io_task_work.func = io_free_req_work;
				2431	io_req_task_work_add(req);
				2432	}
				2433	}
				2434
				2435	static unsigned io_cqring_events(struct io_ring_ctx *ctx)
				2436	{
				2437	/* See comment at the top of this file */
				2438	smp_rmb();
				2439	return __io_cqring_events(ctx);
				2440	}
				2441
				2442	static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
				2443	{
				2444	struct io_rings *rings = ctx->rings;
				2445
				2446	/* make sure SQ entry isn't read before tail */
				2447	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
				2448	}
				2449
				2450	static unsigned int io_put_kbuf(struct io_kiocb req, struct io_buffer kbuf)
				2451	{
				2452	unsigned int cflags;
				2453
				2454	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
				2455	cflags \|= IORING_CQE_F_BUFFER;
				2456	req->flags &= ~REQ_F_BUFFER_SELECTED;
				2457	kfree(kbuf);
				2458	return cflags;
				2459	}
				2460
				2461	static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
				2462	{
				2463	struct io_buffer *kbuf;
				2464
				2465	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
				2466	return 0;
				2467	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
				2468	return io_put_kbuf(req, kbuf);
				2469	}
				2470
				2471	static inline bool io_run_task_work(void)
				2472	{
				2473	if (test_thread_flag(TIF_NOTIFY_SIGNAL) \|\| current->task_works) {
				2474	__set_current_state(TASK_RUNNING);
				2475	tracehook_notify_signal();
				2476	return true;
				2477	}
				2478
				2479	return false;
				2480	}
				2481
				2482	/*
				2483	* Find and free completed poll iocbs
				2484	*/
				2485	static void io_iopoll_complete(struct io_ring_ctx ctx, unsigned int nr_events,
				2486	struct list_head *done)
				2487	{
				2488	struct req_batch rb;
				2489	struct io_kiocb *req;
				2490
				2491	/* order with ->result store in io_complete_rw_iopoll() */
				2492	smp_rmb();
				2493
				2494	io_init_req_batch(&rb);
				2495	while (!list_empty(done)) {
Pavel Begunkov	a5edbe2	2023-01-14 09:14:03 -0700	[diff] [blame]	2496	struct io_uring_cqe *cqe;
				2497	unsigned cflags;
				2498
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	2499	req = list_first_entry(done, struct io_kiocb, inflight_entry);
				2500	list_del(&req->inflight_entry);
Pavel Begunkov	a5edbe2	2023-01-14 09:14:03 -0700	[diff] [blame]	2501	cflags = io_put_rw_kbuf(req);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	2502	(*nr_events)++;
				2503
Pavel Begunkov	a5edbe2	2023-01-14 09:14:03 -0700	[diff] [blame]	2504	cqe = io_get_cqe(ctx);
				2505	if (cqe) {
				2506	WRITE_ONCE(cqe->user_data, req->user_data);
				2507	WRITE_ONCE(cqe->res, req->result);
				2508	WRITE_ONCE(cqe->flags, cflags);
				2509	} else {
				2510	spin_lock(&ctx->completion_lock);
				2511	io_cqring_event_overflow(ctx, req->user_data,
				2512	req->result, cflags);
				2513	spin_unlock(&ctx->completion_lock);
				2514	}
				2515
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	2516	if (req_ref_put_and_test(req))
				2517	io_req_free_batch(&rb, req, &ctx->submit_state);
				2518	}
				2519
				2520	io_commit_cqring(ctx);
				2521	io_cqring_ev_posted_iopoll(ctx);
				2522	io_req_free_batch_finish(ctx, &rb);
				2523	}
				2524
				2525	static int io_do_iopoll(struct io_ring_ctx ctx, unsigned int nr_events,
				2526	long min)
				2527	{
				2528	struct io_kiocb req, tmp;
				2529	LIST_HEAD(done);
				2530	bool spin;
				2531
				2532	/*
				2533	* Only spin for completions if we don't have multiple devices hanging
				2534	* off our complete list, and we're under the requested amount.
				2535	*/
				2536	spin = !ctx->poll_multi_queue && *nr_events < min;
				2537
				2538	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
				2539	struct kiocb *kiocb = &req->rw.kiocb;
				2540	int ret;
				2541
				2542	/*
				2543	* Move completed and retryable entries to our local lists.
				2544	* If we find a request that requires polling, break out
				2545	* and complete those lists first, if we have entries there.
				2546	*/
				2547	if (READ_ONCE(req->iopoll_completed)) {
				2548	list_move_tail(&req->inflight_entry, &done);
				2549	continue;
				2550	}
				2551	if (!list_empty(&done))
				2552	break;
				2553
				2554	ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
				2555	if (unlikely(ret < 0))
				2556	return ret;
				2557	else if (ret)
				2558	spin = false;
				2559
				2560	/* iopoll may have completed current req */
				2561	if (READ_ONCE(req->iopoll_completed))
				2562	list_move_tail(&req->inflight_entry, &done);
				2563	}
				2564
				2565	if (!list_empty(&done))
				2566	io_iopoll_complete(ctx, nr_events, &done);
				2567
				2568	return 0;
				2569	}
				2570
				2571	/*
				2572	* We can't just wait for polled events to come to us, we have to actively
				2573	* find and complete them.
				2574	*/
				2575	static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
				2576	{
				2577	if (!(ctx->flags & IORING_SETUP_IOPOLL))
				2578	return;
				2579
				2580	mutex_lock(&ctx->uring_lock);
				2581	while (!list_empty(&ctx->iopoll_list)) {
				2582	unsigned int nr_events = 0;
				2583
				2584	io_do_iopoll(ctx, &nr_events, 0);
				2585
				2586	/* let it sleep and repeat later if can't complete a request */
				2587	if (nr_events == 0)
				2588	break;
				2589	/*
				2590	* Ensure we allow local-to-the-cpu processing to take place,
				2591	* in this case we need to ensure that we reap all events.
				2592	* Also let task_work, etc. to progress by releasing the mutex
				2593	*/
				2594	if (need_resched()) {
				2595	mutex_unlock(&ctx->uring_lock);
				2596	cond_resched();
				2597	mutex_lock(&ctx->uring_lock);
				2598	}
				2599	}
				2600	mutex_unlock(&ctx->uring_lock);
				2601	}
				2602
				2603	static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
				2604	{
				2605	unsigned int nr_events = 0;
				2606	int ret = 0;
				2607
				2608	/*
				2609	* We disallow the app entering submit/complete with polling, but we
				2610	* still need to lock the ring to prevent racing with polled issue
				2611	* that got punted to a workqueue.
				2612	*/
				2613	mutex_lock(&ctx->uring_lock);
				2614	/*
				2615	* Don't enter poll loop if we already have events pending.
				2616	* If we do, we can potentially be spinning for commands that
				2617	* already triggered a CQE (eg in error).
				2618	*/
				2619	if (test_bit(0, &ctx->check_cq_overflow))
				2620	__io_cqring_overflow_flush(ctx, false);
				2621	if (io_cqring_events(ctx))
				2622	goto out;
				2623	do {
				2624	/*
				2625	* If a submit got punted to a workqueue, we can have the
				2626	* application entering polling for a command before it gets
				2627	* issued. That app will hold the uring_lock for the duration
				2628	* of the poll right here, so we need to take a breather every
				2629	* now and then to ensure that the issue has a chance to add
				2630	* the poll to the issued list. Otherwise we can spin here
				2631	* forever, while the workqueue is stuck trying to acquire the
				2632	* very same mutex.
				2633	*/
				2634	if (list_empty(&ctx->iopoll_list)) {
				2635	u32 tail = ctx->cached_cq_tail;
				2636
				2637	mutex_unlock(&ctx->uring_lock);
				2638	io_run_task_work();
				2639	mutex_lock(&ctx->uring_lock);
				2640
				2641	/* some requests don't go through iopoll_list */
				2642	if (tail != ctx->cached_cq_tail \|\|
				2643	list_empty(&ctx->iopoll_list))
				2644	break;
				2645	}
				2646	ret = io_do_iopoll(ctx, &nr_events, min);
				2647	} while (!ret && nr_events < min && !need_resched());
				2648	out:
				2649	mutex_unlock(&ctx->uring_lock);
				2650	return ret;
				2651	}
				2652
				2653	static void kiocb_end_write(struct io_kiocb *req)
				2654	{
				2655	/*
				2656	* Tell lockdep we inherited freeze protection from submission
				2657	* thread.
				2658	*/
				2659	if (req->flags & REQ_F_ISREG) {
				2660	struct super_block *sb = file_inode(req->file)->i_sb;
				2661
				2662	__sb_writers_acquired(sb, SB_FREEZE_WRITE);
				2663	sb_end_write(sb);
				2664	}
				2665	}
				2666
				2667	#ifdef CONFIG_BLOCK
				2668	static bool io_resubmit_prep(struct io_kiocb *req)
				2669	{
				2670	struct io_async_rw *rw = req->async_data;
				2671
				2672	if (!rw)
				2673	return !io_req_prep_async(req);
				2674	iov_iter_restore(&rw->iter, &rw->iter_state);
				2675	return true;
				2676	}
				2677
				2678	static bool io_rw_should_reissue(struct io_kiocb *req)
				2679	{
				2680	umode_t mode = file_inode(req->file)->i_mode;
				2681	struct io_ring_ctx *ctx = req->ctx;
				2682
				2683	if (!S_ISBLK(mode) && !S_ISREG(mode))
				2684	return false;
				2685	if ((req->flags & REQ_F_NOWAIT) \|\| (io_wq_current_is_worker() &&
				2686	!(ctx->flags & IORING_SETUP_IOPOLL)))
				2687	return false;
				2688	/*
				2689	* If ref is dying, we might be running poll reap from the exit work.
				2690	* Don't attempt to reissue from that path, just let it fail with
				2691	* -EAGAIN.
				2692	*/
				2693	if (percpu_ref_is_dying(&ctx->refs))
				2694	return false;
				2695	/*
				2696	* Play it safe and assume not safe to re-import and reissue if we're
				2697	* not in the original thread group (or in task context).
				2698	*/
				2699	if (!same_thread_group(req->task, current) \|\| !in_task())
				2700	return false;
				2701	return true;
				2702	}
				2703	#else
				2704	static bool io_resubmit_prep(struct io_kiocb *req)
				2705	{
				2706	return false;
				2707	}
				2708	static bool io_rw_should_reissue(struct io_kiocb *req)
				2709	{
				2710	return false;
				2711	}
				2712	#endif
				2713
Jens Axboe	8b76b0d	2023-01-22 10:36:37 -0700	[diff] [blame]	2714	/*
				2715	* Trigger the notifications after having done some IO, and finish the write
				2716	* accounting, if any.
				2717	*/
				2718	static void io_req_io_end(struct io_kiocb *req)
				2719	{
				2720	struct io_rw *rw = &req->rw;
				2721
Jens Axboe	8b76b0d	2023-01-22 10:36:37 -0700	[diff] [blame]	2722	if (rw->kiocb.ki_flags & IOCB_WRITE) {
				2723	kiocb_end_write(req);
				2724	fsnotify_modify(req->file);
				2725	} else {
				2726	fsnotify_access(req->file);
				2727	}
				2728	}
				2729
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	2730	static bool __io_complete_rw_common(struct io_kiocb *req, long res)
				2731	{
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	2732	if (res != req->result) {
				2733	if ((res == -EAGAIN \|\| res == -EOPNOTSUPP) &&
				2734	io_rw_should_reissue(req)) {
Jens Axboe	8b76b0d	2023-01-22 10:36:37 -0700	[diff] [blame]	2735	/*
				2736	* Reissue will start accounting again, finish the
				2737	* current cycle.
				2738	*/
				2739	io_req_io_end(req);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	2740	req->flags \|= REQ_F_REISSUE;
				2741	return true;
				2742	}
				2743	req_set_fail(req);
				2744	req->result = res;
				2745	}
				2746	return false;
				2747	}
				2748
Harshit Mogalapalli	947583e	2023-01-10 08:46:47 -0800	[diff] [blame]	2749	static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	2750	{
				2751	struct io_async_rw *io = req->async_data;
				2752
				2753	/* add previously done IO, if any */
				2754	if (io && io->bytes_done > 0) {
				2755	if (res < 0)
				2756	res = io->bytes_done;
				2757	else
				2758	res += io->bytes_done;
				2759	}
				2760	return res;
				2761	}
				2762
				2763	static void io_req_task_complete(struct io_kiocb req, bool locked)
				2764	{
				2765	unsigned int cflags = io_put_rw_kbuf(req);
				2766	int res = req->result;
				2767
				2768	if (*locked) {
				2769	struct io_ring_ctx *ctx = req->ctx;
				2770	struct io_submit_state *state = &ctx->submit_state;
				2771
				2772	io_req_complete_state(req, res, cflags);
				2773	state->compl_reqs[state->compl_nr++] = req;
				2774	if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
				2775	io_submit_flush_completions(ctx);
				2776	} else {
				2777	io_req_complete_post(req, res, cflags);
				2778	}
				2779	}
				2780
Jens Axboe	9c50578	2023-01-21 13:38:51 -0700	[diff] [blame]	2781	static void io_req_rw_complete(struct io_kiocb req, bool locked)
				2782	{
Jens Axboe	8b76b0d	2023-01-22 10:36:37 -0700	[diff] [blame]	2783	io_req_io_end(req);
Jens Axboe	9c50578	2023-01-21 13:38:51 -0700	[diff] [blame]	2784	io_req_task_complete(req, locked);
				2785	}
				2786
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	2787	static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
				2788	{
				2789	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
				2790
				2791	if (__io_complete_rw_common(req, res))
				2792	return;
				2793	req->result = io_fixup_rw_res(req, res);
Jens Axboe	9c50578	2023-01-21 13:38:51 -0700	[diff] [blame]	2794	req->io_task_work.func = io_req_rw_complete;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	2795	io_req_task_work_add(req);
				2796	}
				2797
				2798	static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
				2799	{
				2800	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
				2801
				2802	if (kiocb->ki_flags & IOCB_WRITE)
				2803	kiocb_end_write(req);
				2804	if (unlikely(res != req->result)) {
				2805	if (res == -EAGAIN && io_rw_should_reissue(req)) {
				2806	req->flags \|= REQ_F_REISSUE;
				2807	return;
				2808	}
				2809	}
				2810
				2811	WRITE_ONCE(req->result, res);
				2812	/* order with io_iopoll_complete() checking ->result */
				2813	smp_wmb();
				2814	WRITE_ONCE(req->iopoll_completed, 1);
				2815	}
				2816
				2817	/*
				2818	* After the iocb has been issued, it's safe to be found on the poll list.
				2819	* Adding the kiocb to the list AFTER submission ensures that we don't
				2820	* find it from a io_do_iopoll() thread before the issuer is done
				2821	* accessing the kiocb cookie.
				2822	*/
				2823	static void io_iopoll_req_issued(struct io_kiocb *req)
				2824	{
				2825	struct io_ring_ctx *ctx = req->ctx;
				2826	const bool in_async = io_wq_current_is_worker();
				2827
				2828	/* workqueue context doesn't hold uring_lock, grab it now */
				2829	if (unlikely(in_async))
				2830	mutex_lock(&ctx->uring_lock);
				2831
				2832	/*
				2833	* Track whether we have multiple files in our lists. This will impact
				2834	* how we do polling eventually, not spinning if we're on potentially
				2835	* different devices.
				2836	*/
				2837	if (list_empty(&ctx->iopoll_list)) {
				2838	ctx->poll_multi_queue = false;
				2839	} else if (!ctx->poll_multi_queue) {
				2840	struct io_kiocb *list_req;
				2841	unsigned int queue_num0, queue_num1;
				2842
				2843	list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
				2844	inflight_entry);
				2845
				2846	if (list_req->file != req->file) {
				2847	ctx->poll_multi_queue = true;
				2848	} else {
				2849	queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
				2850	queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
				2851	if (queue_num0 != queue_num1)
				2852	ctx->poll_multi_queue = true;
				2853	}
				2854	}
				2855
				2856	/*
				2857	* For fast devices, IO may have already completed. If it has, add
				2858	* it to the front so we find it first.
				2859	*/
				2860	if (READ_ONCE(req->iopoll_completed))
				2861	list_add(&req->inflight_entry, &ctx->iopoll_list);
				2862	else
				2863	list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
				2864
				2865	if (unlikely(in_async)) {
				2866	/*
				2867	* If IORING_SETUP_SQPOLL is enabled, sqes are either handle
				2868	* in sq thread task context or in io worker task context. If
				2869	* current task context is sq thread, we don't need to check
				2870	* whether should wake up sq thread.
				2871	*/
				2872	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
				2873	wq_has_sleeper(&ctx->sq_data->wait))
				2874	wake_up(&ctx->sq_data->wait);
				2875
				2876	mutex_unlock(&ctx->uring_lock);
				2877	}
				2878	}
				2879
				2880	static bool io_bdev_nowait(struct block_device *bdev)
				2881	{
				2882	return !bdev \|\| blk_queue_nowait(bdev_get_queue(bdev));
				2883	}
				2884
				2885	/*
				2886	* If we tracked the file through the SCM inflight mechanism, we could support
				2887	* any file. For now, just ensure that anything potentially problematic is done
				2888	* inline.
				2889	*/
				2890	static bool __io_file_supports_nowait(struct file *file, int rw)
				2891	{
				2892	umode_t mode = file_inode(file)->i_mode;
				2893
				2894	if (S_ISBLK(mode)) {
				2895	if (IS_ENABLED(CONFIG_BLOCK) &&
				2896	io_bdev_nowait(I_BDEV(file->f_mapping->host)))
				2897	return true;
				2898	return false;
				2899	}
				2900	if (S_ISSOCK(mode))
				2901	return true;
				2902	if (S_ISREG(mode)) {
				2903	if (IS_ENABLED(CONFIG_BLOCK) &&
				2904	io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
				2905	file->f_op != &io_uring_fops)
				2906	return true;
				2907	return false;
				2908	}
				2909
				2910	/* any ->read/write should understand O_NONBLOCK */
				2911	if (file->f_flags & O_NONBLOCK)
				2912	return true;
				2913
				2914	if (!(file->f_mode & FMODE_NOWAIT))
				2915	return false;
				2916
				2917	if (rw == READ)
				2918	return file->f_op->read_iter != NULL;
				2919
				2920	return file->f_op->write_iter != NULL;
				2921	}
				2922
				2923	static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
				2924	{
				2925	if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
				2926	return true;
				2927	else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
				2928	return true;
				2929
				2930	return __io_file_supports_nowait(req->file, rw);
				2931	}
				2932
				2933	static int io_prep_rw(struct io_kiocb req, const struct io_uring_sqe sqe,
				2934	int rw)
				2935	{
				2936	struct io_ring_ctx *ctx = req->ctx;
				2937	struct kiocb *kiocb = &req->rw.kiocb;
				2938	struct file *file = req->file;
				2939	unsigned ioprio;
				2940	int ret;
				2941
				2942	if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
				2943	req->flags \|= REQ_F_ISREG;
				2944
				2945	kiocb->ki_pos = READ_ONCE(sqe->off);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	2946	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
				2947	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
				2948	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
				2949	if (unlikely(ret))
				2950	return ret;
				2951
				2952	/*
				2953	* If the file is marked O_NONBLOCK, still allow retry for it if it
				2954	* supports async. Otherwise it's impossible to use O_NONBLOCK files
				2955	* reliably. If not, or it IOCB_NOWAIT is set, don't retry.
				2956	*/
				2957	if ((kiocb->ki_flags & IOCB_NOWAIT) \|\|
				2958	((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
				2959	req->flags \|= REQ_F_NOWAIT;
				2960
				2961	ioprio = READ_ONCE(sqe->ioprio);
				2962	if (ioprio) {
				2963	ret = ioprio_check_cap(ioprio);
				2964	if (ret)
				2965	return ret;
				2966
				2967	kiocb->ki_ioprio = ioprio;
				2968	} else
				2969	kiocb->ki_ioprio = get_current_ioprio();
				2970
				2971	if (ctx->flags & IORING_SETUP_IOPOLL) {
				2972	if (!(kiocb->ki_flags & IOCB_DIRECT) \|\|
				2973	!kiocb->ki_filp->f_op->iopoll)
				2974	return -EOPNOTSUPP;
				2975
				2976	kiocb->ki_flags \|= IOCB_HIPRI;
				2977	kiocb->ki_complete = io_complete_rw_iopoll;
				2978	req->iopoll_completed = 0;
				2979	} else {
				2980	if (kiocb->ki_flags & IOCB_HIPRI)
				2981	return -EINVAL;
				2982	kiocb->ki_complete = io_complete_rw;
				2983	}
				2984
				2985	/* used for fixed read/write too - just read unconditionally */
				2986	req->buf_index = READ_ONCE(sqe->buf_index);
				2987	req->imu = NULL;
				2988
				2989	if (req->opcode == IORING_OP_READ_FIXED \|\|
				2990	req->opcode == IORING_OP_WRITE_FIXED) {
				2991	struct io_ring_ctx *ctx = req->ctx;
				2992	u16 index;
				2993
				2994	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
				2995	return -EFAULT;
				2996	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
				2997	req->imu = ctx->user_bufs[index];
				2998	io_req_set_rsrc_node(req);
				2999	}
				3000
				3001	req->rw.addr = READ_ONCE(sqe->addr);
				3002	req->rw.len = READ_ONCE(sqe->len);
				3003	return 0;
				3004	}
				3005
				3006	static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
				3007	{
				3008	switch (ret) {
				3009	case -EIOCBQUEUED:
				3010	break;
				3011	case -ERESTARTSYS:
				3012	case -ERESTARTNOINTR:
				3013	case -ERESTARTNOHAND:
				3014	case -ERESTART_RESTARTBLOCK:
				3015	/*
				3016	* We can't just restart the syscall, since previously
				3017	* submitted sqes may already be in progress. Just fail this
				3018	* IO with EINTR.
				3019	*/
				3020	ret = -EINTR;
				3021	fallthrough;
				3022	default:
				3023	kiocb->ki_complete(kiocb, ret, 0);
				3024	}
				3025	}
				3026
Dylan Yudaken	3ad7306	2022-02-22 02:55:03 -0800	[diff] [blame]	3027	static inline loff_t io_kiocb_update_pos(struct io_kiocb req)
Dylan Yudaken	26b7575	2022-02-22 02:55:02 -0800	[diff] [blame]	3028	{
				3029	struct kiocb *kiocb = &req->rw.kiocb;
				3030
Jens Axboe	ef9f28e	2022-04-11 09:48:30 -0600	[diff] [blame]	3031	if (kiocb->ki_pos != -1)
				3032	return &kiocb->ki_pos;
				3033
				3034	if (!(req->file->f_mode & FMODE_STREAM)) {
				3035	req->flags \|= REQ_F_CUR_POS;
				3036	kiocb->ki_pos = req->file->f_pos;
				3037	return &kiocb->ki_pos;
Dylan Yudaken	26b7575	2022-02-22 02:55:02 -0800	[diff] [blame]	3038	}
Jens Axboe	ef9f28e	2022-04-11 09:48:30 -0600	[diff] [blame]	3039
				3040	kiocb->ki_pos = 0;
				3041	return NULL;
Dylan Yudaken	26b7575	2022-02-22 02:55:02 -0800	[diff] [blame]	3042	}
				3043
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3044	static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
				3045	unsigned int issue_flags)
				3046	{
				3047	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
				3048
				3049	if (req->flags & REQ_F_CUR_POS)
				3050	req->file->f_pos = kiocb->ki_pos;
Jens Axboe	8b76b0d	2023-01-22 10:36:37 -0700	[diff] [blame]	3051	if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) {
				3052	if (!__io_complete_rw_common(req, ret)) {
				3053	/*
				3054	* Safe to call io_end from here as we're inline
				3055	* from the submission path.
				3056	*/
				3057	io_req_io_end(req);
				3058	__io_req_complete(req, issue_flags,
				3059	io_fixup_rw_res(req, ret),
				3060	io_put_rw_kbuf(req));
				3061	}
				3062	} else {
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3063	io_rw_done(kiocb, ret);
Jens Axboe	8b76b0d	2023-01-22 10:36:37 -0700	[diff] [blame]	3064	}
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3065
				3066	if (req->flags & REQ_F_REISSUE) {
				3067	req->flags &= ~REQ_F_REISSUE;
				3068	if (io_resubmit_prep(req)) {
				3069	io_req_task_queue_reissue(req);
				3070	} else {
				3071	unsigned int cflags = io_put_rw_kbuf(req);
				3072	struct io_ring_ctx *ctx = req->ctx;
				3073
				3074	ret = io_fixup_rw_res(req, ret);
				3075	req_set_fail(req);
				3076	if (!(issue_flags & IO_URING_F_NONBLOCK)) {
				3077	mutex_lock(&ctx->uring_lock);
				3078	__io_req_complete(req, issue_flags, ret, cflags);
				3079	mutex_unlock(&ctx->uring_lock);
				3080	} else {
				3081	__io_req_complete(req, issue_flags, ret, cflags);
				3082	}
				3083	}
				3084	}
				3085	}
				3086
				3087	static int __io_import_fixed(struct io_kiocb req, int rw, struct iov_iter iter,
				3088	struct io_mapped_ubuf *imu)
				3089	{
				3090	size_t len = req->rw.len;
				3091	u64 buf_end, buf_addr = req->rw.addr;
				3092	size_t offset;
				3093
				3094	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
				3095	return -EFAULT;
				3096	/* not inside the mapped region */
				3097	if (unlikely(buf_addr < imu->ubuf \|\| buf_end > imu->ubuf_end))
				3098	return -EFAULT;
				3099
				3100	/*
				3101	* May not be a start of buffer, set size appropriately
				3102	* and advance us to the beginning.
				3103	*/
				3104	offset = buf_addr - imu->ubuf;
				3105	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
				3106
				3107	if (offset) {
				3108	/*
				3109	* Don't use iov_iter_advance() here, as it's really slow for
				3110	* using the latter parts of a big fixed buffer - it iterates
				3111	* over each segment manually. We can cheat a bit here, because
				3112	* we know that:
				3113	*
				3114	* 1) it's a BVEC iter, we set it up
				3115	* 2) all bvecs are PAGE_SIZE in size, except potentially the
				3116	* first and last bvec
				3117	*
				3118	* So just find our index, and adjust the iterator afterwards.
				3119	* If the offset is within the first bvec (or the whole first
				3120	* bvec, just use iov_iter_advance(). This makes it easier
				3121	* since we can just skip the first segment, which may not
				3122	* be PAGE_SIZE aligned.
				3123	*/
				3124	const struct bio_vec *bvec = imu->bvec;
				3125
				3126	if (offset <= bvec->bv_len) {
				3127	iov_iter_advance(iter, offset);
				3128	} else {
				3129	unsigned long seg_skip;
				3130
				3131	/* skip first vec */
				3132	offset -= bvec->bv_len;
				3133	seg_skip = 1 + (offset >> PAGE_SHIFT);
				3134
				3135	iter->bvec = bvec + seg_skip;
				3136	iter->nr_segs -= seg_skip;
				3137	iter->count -= bvec->bv_len + offset;
				3138	iter->iov_offset = offset & ~PAGE_MASK;
				3139	}
				3140	}
				3141
				3142	return 0;
				3143	}
				3144
				3145	static int io_import_fixed(struct io_kiocb req, int rw, struct iov_iter iter)
				3146	{
				3147	if (WARN_ON_ONCE(!req->imu))
				3148	return -EFAULT;
				3149	return __io_import_fixed(req, rw, iter, req->imu);
				3150	}
				3151
				3152	static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
				3153	{
				3154	if (needs_lock)
				3155	mutex_unlock(&ctx->uring_lock);
				3156	}
				3157
				3158	static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
				3159	{
				3160	/*
				3161	* "Normal" inline submissions always hold the uring_lock, since we
				3162	* grab it from the system call. Same is true for the SQPOLL offload.
				3163	* The only exception is when we've detached the request and issue it
				3164	* from an async worker thread, grab the lock for that case.
				3165	*/
				3166	if (needs_lock)
				3167	mutex_lock(&ctx->uring_lock);
				3168	}
				3169
				3170	static struct io_buffer io_buffer_select(struct io_kiocb req, size_t *len,
				3171	int bgid, struct io_buffer *kbuf,
				3172	bool needs_lock)
				3173	{
				3174	struct io_buffer *head;
				3175
				3176	if (req->flags & REQ_F_BUFFER_SELECTED)
				3177	return kbuf;
				3178
				3179	io_ring_submit_lock(req->ctx, needs_lock);
				3180
				3181	lockdep_assert_held(&req->ctx->uring_lock);
				3182
				3183	head = xa_load(&req->ctx->io_buffers, bgid);
				3184	if (head) {
				3185	if (!list_empty(&head->list)) {
				3186	kbuf = list_last_entry(&head->list, struct io_buffer,
				3187	list);
				3188	list_del(&kbuf->list);
				3189	} else {
				3190	kbuf = head;
				3191	xa_erase(&req->ctx->io_buffers, bgid);
				3192	}
				3193	if (*len > kbuf->len)
				3194	*len = kbuf->len;
				3195	} else {
				3196	kbuf = ERR_PTR(-ENOBUFS);
				3197	}
				3198
				3199	io_ring_submit_unlock(req->ctx, needs_lock);
				3200
				3201	return kbuf;
				3202	}
				3203
				3204	static void __user io_rw_buffer_select(struct io_kiocb req, size_t *len,
				3205	bool needs_lock)
				3206	{
				3207	struct io_buffer *kbuf;
				3208	u16 bgid;
				3209
				3210	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
				3211	bgid = req->buf_index;
				3212	kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
				3213	if (IS_ERR(kbuf))
				3214	return kbuf;
				3215	req->rw.addr = (u64) (unsigned long) kbuf;
				3216	req->flags \|= REQ_F_BUFFER_SELECTED;
				3217	return u64_to_user_ptr(kbuf->addr);
				3218	}
				3219
				3220	#ifdef CONFIG_COMPAT
				3221	static ssize_t io_compat_import(struct io_kiocb req, struct iovec iov,
				3222	bool needs_lock)
				3223	{
				3224	struct compat_iovec __user *uiov;
				3225	compat_ssize_t clen;
				3226	void __user *buf;
				3227	ssize_t len;
				3228
				3229	uiov = u64_to_user_ptr(req->rw.addr);
				3230	if (!access_ok(uiov, sizeof(*uiov)))
				3231	return -EFAULT;
				3232	if (__get_user(clen, &uiov->iov_len))
				3233	return -EFAULT;
				3234	if (clen < 0)
				3235	return -EINVAL;
				3236
				3237	len = clen;
				3238	buf = io_rw_buffer_select(req, &len, needs_lock);
				3239	if (IS_ERR(buf))
				3240	return PTR_ERR(buf);
				3241	iov[0].iov_base = buf;
				3242	iov[0].iov_len = (compat_size_t) len;
				3243	return 0;
				3244	}
				3245	#endif
				3246
				3247	static ssize_t __io_iov_buffer_select(struct io_kiocb req, struct iovec iov,
				3248	bool needs_lock)
				3249	{
				3250	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
				3251	void __user *buf;
				3252	ssize_t len;
				3253
				3254	if (copy_from_user(iov, uiov, sizeof(*uiov)))
				3255	return -EFAULT;
				3256
				3257	len = iov[0].iov_len;
				3258	if (len < 0)
				3259	return -EINVAL;
				3260	buf = io_rw_buffer_select(req, &len, needs_lock);
				3261	if (IS_ERR(buf))
				3262	return PTR_ERR(buf);
				3263	iov[0].iov_base = buf;
				3264	iov[0].iov_len = len;
				3265	return 0;
				3266	}
				3267
				3268	static ssize_t io_iov_buffer_select(struct io_kiocb req, struct iovec iov,
				3269	bool needs_lock)
				3270	{
				3271	if (req->flags & REQ_F_BUFFER_SELECTED) {
				3272	struct io_buffer *kbuf;
				3273
				3274	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
				3275	iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
				3276	iov[0].iov_len = kbuf->len;
				3277	return 0;
				3278	}
				3279	if (req->rw.len != 1)
				3280	return -EINVAL;
				3281
				3282	#ifdef CONFIG_COMPAT
				3283	if (req->ctx->compat)
				3284	return io_compat_import(req, iov, needs_lock);
				3285	#endif
				3286
				3287	return __io_iov_buffer_select(req, iov, needs_lock);
				3288	}
				3289
				3290	static int io_import_iovec(int rw, struct io_kiocb req, struct iovec *iovec,
				3291	struct iov_iter *iter, bool needs_lock)
				3292	{
				3293	void __user *buf = u64_to_user_ptr(req->rw.addr);
				3294	size_t sqe_len = req->rw.len;
				3295	u8 opcode = req->opcode;
				3296	ssize_t ret;
				3297
				3298	if (opcode == IORING_OP_READ_FIXED \|\| opcode == IORING_OP_WRITE_FIXED) {
				3299	*iovec = NULL;
				3300	return io_import_fixed(req, rw, iter);
				3301	}
				3302
				3303	/* buffer index only valid with fixed read/write, or buffer select */
				3304	if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
				3305	return -EINVAL;
				3306
				3307	if (opcode == IORING_OP_READ \|\| opcode == IORING_OP_WRITE) {
				3308	if (req->flags & REQ_F_BUFFER_SELECT) {
				3309	buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
				3310	if (IS_ERR(buf))
				3311	return PTR_ERR(buf);
				3312	req->rw.len = sqe_len;
				3313	}
				3314
				3315	ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
				3316	*iovec = NULL;
				3317	return ret;
				3318	}
				3319
				3320	if (req->flags & REQ_F_BUFFER_SELECT) {
				3321	ret = io_iov_buffer_select(req, *iovec, needs_lock);
				3322	if (!ret)
				3323	iov_iter_init(iter, rw, iovec, 1, (iovec)->iov_len);
				3324	*iovec = NULL;
				3325	return ret;
				3326	}
				3327
				3328	return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
				3329	req->ctx->compat);
				3330	}
				3331
				3332	static inline loff_t io_kiocb_ppos(struct kiocb kiocb)
				3333	{
				3334	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
				3335	}
				3336
				3337	/*
				3338	* For files that don't have ->read_iter() and ->write_iter(), handle them
				3339	* by looping over ->read() or ->write() manually.
				3340	*/
				3341	static ssize_t loop_rw_iter(int rw, struct io_kiocb req, struct iov_iter iter)
				3342	{
				3343	struct kiocb *kiocb = &req->rw.kiocb;
				3344	struct file *file = req->file;
				3345	ssize_t ret = 0;
Dylan Yudaken	b495537	2022-02-22 02:55:01 -0800	[diff] [blame]	3346	loff_t *ppos;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3347
				3348	/*
				3349	* Don't support polled IO through this interface, and we can't
				3350	* support non-blocking either. For the latter, this just causes
				3351	* the kiocb to be handled from an async context.
				3352	*/
				3353	if (kiocb->ki_flags & IOCB_HIPRI)
				3354	return -EOPNOTSUPP;
				3355	if (kiocb->ki_flags & IOCB_NOWAIT)
				3356	return -EAGAIN;
				3357
Dylan Yudaken	b495537	2022-02-22 02:55:01 -0800	[diff] [blame]	3358	ppos = io_kiocb_ppos(kiocb);
				3359
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3360	while (iov_iter_count(iter)) {
				3361	struct iovec iovec;
				3362	ssize_t nr;
				3363
				3364	if (!iov_iter_is_bvec(iter)) {
				3365	iovec = iov_iter_iovec(iter);
				3366	} else {
				3367	iovec.iov_base = u64_to_user_ptr(req->rw.addr);
				3368	iovec.iov_len = req->rw.len;
				3369	}
				3370
				3371	if (rw == READ) {
				3372	nr = file->f_op->read(file, iovec.iov_base,
Dylan Yudaken	b495537	2022-02-22 02:55:01 -0800	[diff] [blame]	3373	iovec.iov_len, ppos);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3374	} else {
				3375	nr = file->f_op->write(file, iovec.iov_base,
Dylan Yudaken	b495537	2022-02-22 02:55:01 -0800	[diff] [blame]	3376	iovec.iov_len, ppos);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3377	}
				3378
				3379	if (nr < 0) {
				3380	if (!ret)
				3381	ret = nr;
				3382	break;
				3383	}
				3384	ret += nr;
				3385	if (!iov_iter_is_bvec(iter)) {
				3386	iov_iter_advance(iter, nr);
				3387	} else {
				3388	req->rw.addr += nr;
				3389	req->rw.len -= nr;
				3390	if (!req->rw.len)
				3391	break;
				3392	}
				3393	if (nr != iovec.iov_len)
				3394	break;
				3395	}
				3396
				3397	return ret;
				3398	}
				3399
				3400	static void io_req_map_rw(struct io_kiocb req, const struct iovec iovec,
				3401	const struct iovec fast_iov, struct iov_iter iter)
				3402	{
				3403	struct io_async_rw *rw = req->async_data;
				3404
				3405	memcpy(&rw->iter, iter, sizeof(*iter));
				3406	rw->free_iovec = iovec;
				3407	rw->bytes_done = 0;
				3408	/* can only be fixed buffers, no need to do anything */
				3409	if (iov_iter_is_bvec(iter))
				3410	return;
				3411	if (!iovec) {
				3412	unsigned iov_off = 0;
				3413
				3414	rw->iter.iov = rw->fast_iov;
				3415	if (iter->iov != fast_iov) {
				3416	iov_off = iter->iov - fast_iov;
				3417	rw->iter.iov += iov_off;
				3418	}
				3419	if (rw->fast_iov != fast_iov)
				3420	memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
				3421	sizeof(struct iovec) * iter->nr_segs);
				3422	} else {
				3423	req->flags \|= REQ_F_NEED_CLEANUP;
				3424	}
				3425	}
				3426
				3427	static inline int io_alloc_async_data(struct io_kiocb *req)
				3428	{
				3429	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
				3430	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
				3431	return req->async_data == NULL;
				3432	}
				3433
				3434	static int io_setup_async_rw(struct io_kiocb req, const struct iovec iovec,
				3435	const struct iovec *fast_iov,
				3436	struct iov_iter *iter, bool force)
				3437	{
				3438	if (!force && !io_op_defs[req->opcode].needs_async_setup)
				3439	return 0;
				3440	if (!req->async_data) {
				3441	struct io_async_rw *iorw;
				3442
				3443	if (io_alloc_async_data(req)) {
				3444	kfree(iovec);
				3445	return -ENOMEM;
				3446	}
				3447
				3448	io_req_map_rw(req, iovec, fast_iov, iter);
				3449	iorw = req->async_data;
				3450	/* we've copied and mapped the iter, ensure state is saved */
				3451	iov_iter_save_state(&iorw->iter, &iorw->iter_state);
				3452	}
				3453	return 0;
				3454	}
				3455
				3456	static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
				3457	{
				3458	struct io_async_rw *iorw = req->async_data;
				3459	struct iovec *iov = iorw->fast_iov;
				3460	int ret;
				3461
				3462	ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
				3463	if (unlikely(ret < 0))
				3464	return ret;
				3465
				3466	iorw->bytes_done = 0;
				3467	iorw->free_iovec = iov;
				3468	if (iov)
				3469	req->flags \|= REQ_F_NEED_CLEANUP;
				3470	iov_iter_save_state(&iorw->iter, &iorw->iter_state);
				3471	return 0;
				3472	}
				3473
				3474	static int io_read_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				3475	{
				3476	if (unlikely(!(req->file->f_mode & FMODE_READ)))
				3477	return -EBADF;
				3478	return io_prep_rw(req, sqe, READ);
				3479	}
				3480
				3481	/*
				3482	* This is our waitqueue callback handler, registered through lock_page_async()
				3483	* when we initially tried to do the IO with the iocb armed our waitqueue.
				3484	* This gets called when the page is unlocked, and we generally expect that to
				3485	* happen when the page IO is completed and the page is now uptodate. This will
				3486	* queue a task_work based retry of the operation, attempting to copy the data
				3487	* again. If the latter fails because the page was NOT uptodate, then we will
				3488	* do a thread based blocking retry of the operation. That's the unexpected
				3489	* slow path.
				3490	*/
				3491	static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
				3492	int sync, void *arg)
				3493	{
				3494	struct wait_page_queue *wpq;
				3495	struct io_kiocb *req = wait->private;
				3496	struct wait_page_key *key = arg;
				3497
				3498	wpq = container_of(wait, struct wait_page_queue, wait);
				3499
				3500	if (!wake_page_match(wpq, key))
				3501	return 0;
				3502
				3503	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
				3504	list_del_init(&wait->entry);
				3505	io_req_task_queue(req);
				3506	return 1;
				3507	}
				3508
				3509	/*
				3510	* This controls whether a given IO request should be armed for async page
				3511	* based retry. If we return false here, the request is handed to the async
				3512	* worker threads for retry. If we're doing buffered reads on a regular file,
				3513	* we prepare a private wait_page_queue entry and retry the operation. This
				3514	* will either succeed because the page is now uptodate and unlocked, or it
				3515	* will register a callback when the page is unlocked at IO completion. Through
				3516	* that callback, io_uring uses task_work to setup a retry of the operation.
				3517	* That retry will attempt the buffered read again. The retry will generally
				3518	* succeed, or in rare cases where it fails, we then fall back to using the
				3519	* async worker threads for a blocking retry.
				3520	*/
				3521	static bool io_rw_should_retry(struct io_kiocb *req)
				3522	{
				3523	struct io_async_rw *rw = req->async_data;
				3524	struct wait_page_queue *wait = &rw->wpq;
				3525	struct kiocb *kiocb = &req->rw.kiocb;
				3526
				3527	/* never retry for NOWAIT, we just complete with -EAGAIN */
				3528	if (req->flags & REQ_F_NOWAIT)
				3529	return false;
				3530
				3531	/* Only for buffered IO */
				3532	if (kiocb->ki_flags & (IOCB_DIRECT \| IOCB_HIPRI))
				3533	return false;
				3534
				3535	/*
				3536	* just use poll if we can, and don't attempt if the fs doesn't
				3537	* support callback based unlocks
				3538	*/
				3539	if (file_can_poll(req->file) \|\| !(req->file->f_mode & FMODE_BUF_RASYNC))
				3540	return false;
				3541
				3542	wait->wait.func = io_async_buf_func;
				3543	wait->wait.private = req;
				3544	wait->wait.flags = 0;
				3545	INIT_LIST_HEAD(&wait->wait.entry);
				3546	kiocb->ki_flags \|= IOCB_WAITQ;
				3547	kiocb->ki_flags &= ~IOCB_NOWAIT;
				3548	kiocb->ki_waitq = wait;
				3549	return true;
				3550	}
				3551
				3552	static inline int io_iter_do_read(struct io_kiocb req, struct iov_iter iter)
				3553	{
				3554	if (req->file->f_op->read_iter)
				3555	return call_read_iter(req->file, &req->rw.kiocb, iter);
				3556	else if (req->file->f_op->read)
				3557	return loop_rw_iter(READ, req, iter);
				3558	else
				3559	return -EINVAL;
				3560	}
				3561
				3562	static bool need_read_all(struct io_kiocb *req)
				3563	{
				3564	return req->flags & REQ_F_ISREG \|\|
				3565	S_ISBLK(file_inode(req->file)->i_mode);
				3566	}
				3567
				3568	static int io_read(struct io_kiocb *req, unsigned int issue_flags)
				3569	{
				3570	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
				3571	struct kiocb *kiocb = &req->rw.kiocb;
				3572	struct iov_iter __iter, *iter = &__iter;
				3573	struct io_async_rw *rw = req->async_data;
				3574	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
				3575	struct iov_iter_state __state, *state;
				3576	ssize_t ret, ret2;
Dylan Yudaken	3ad7306	2022-02-22 02:55:03 -0800	[diff] [blame]	3577	loff_t *ppos;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3578
				3579	if (rw) {
				3580	iter = &rw->iter;
				3581	state = &rw->iter_state;
				3582	/*
				3583	* We come here from an earlier attempt, restore our state to
				3584	* match in case it doesn't. It's cheap enough that we don't
				3585	* need to make this conditional.
				3586	*/
				3587	iov_iter_restore(iter, state);
				3588	iovec = NULL;
				3589	} else {
				3590	ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
				3591	if (ret < 0)
				3592	return ret;
				3593	state = &__state;
				3594	iov_iter_save_state(iter, state);
				3595	}
				3596	req->result = iov_iter_count(iter);
				3597
				3598	/* Ensure we clear previously set non-block flag */
				3599	if (!force_nonblock)
				3600	kiocb->ki_flags &= ~IOCB_NOWAIT;
				3601	else
				3602	kiocb->ki_flags \|= IOCB_NOWAIT;
				3603
				3604	/* If the file doesn't support async, just async punt */
				3605	if (force_nonblock && !io_file_supports_nowait(req, READ)) {
				3606	ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
				3607	return ret ?: -EAGAIN;
				3608	}
				3609
Dylan Yudaken	3ad7306	2022-02-22 02:55:03 -0800	[diff] [blame]	3610	ppos = io_kiocb_update_pos(req);
Dylan Yudaken	26b7575	2022-02-22 02:55:02 -0800	[diff] [blame]	3611
Dylan Yudaken	3ad7306	2022-02-22 02:55:03 -0800	[diff] [blame]	3612	ret = rw_verify_area(READ, req->file, ppos, req->result);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3613	if (unlikely(ret)) {
				3614	kfree(iovec);
				3615	return ret;
				3616	}
				3617
				3618	ret = io_iter_do_read(req, iter);
				3619
				3620	if (ret == -EAGAIN \|\| (req->flags & REQ_F_REISSUE)) {
				3621	req->flags &= ~REQ_F_REISSUE;
				3622	/* IOPOLL retry should happen for io-wq threads */
				3623	if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
				3624	goto done;
				3625	/* no retry on NONBLOCK nor RWF_NOWAIT */
				3626	if (req->flags & REQ_F_NOWAIT)
				3627	goto done;
				3628	ret = 0;
				3629	} else if (ret == -EIOCBQUEUED) {
				3630	goto out_free;
				3631	} else if (ret <= 0 \|\| ret == req->result \|\| !force_nonblock \|\|
				3632	(req->flags & REQ_F_NOWAIT) \|\| !need_read_all(req)) {
				3633	/* read all, failed, already did sync or don't want to retry */
				3634	goto done;
				3635	}
				3636
				3637	/*
				3638	* Don't depend on the iter state matching what was consumed, or being
				3639	* untouched in case of error. Restore it and we'll advance it
				3640	* manually if we need to.
				3641	*/
				3642	iov_iter_restore(iter, state);
				3643
				3644	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
				3645	if (ret2)
				3646	return ret2;
				3647
				3648	iovec = NULL;
				3649	rw = req->async_data;
				3650	/*
				3651	* Now use our persistent iterator and state, if we aren't already.
				3652	* We've restored and mapped the iter to match.
				3653	*/
				3654	if (iter != &rw->iter) {
				3655	iter = &rw->iter;
				3656	state = &rw->iter_state;
				3657	}
				3658
				3659	do {
				3660	/*
				3661	* We end up here because of a partial read, either from
				3662	* above or inside this loop. Advance the iter by the bytes
				3663	* that were consumed.
				3664	*/
				3665	iov_iter_advance(iter, ret);
				3666	if (!iov_iter_count(iter))
				3667	break;
				3668	rw->bytes_done += ret;
				3669	iov_iter_save_state(iter, state);
				3670
				3671	/* if we can retry, do so with the callbacks armed */
				3672	if (!io_rw_should_retry(req)) {
				3673	kiocb->ki_flags &= ~IOCB_WAITQ;
				3674	return -EAGAIN;
				3675	}
				3676
				3677	req->result = iov_iter_count(iter);
				3678	/*
				3679	* Now retry read with the IOCB_WAITQ parts set in the iocb. If
				3680	* we get -EIOCBQUEUED, then we'll get a notification when the
				3681	* desired page gets unlocked. We can also get a partial read
				3682	* here, and if we do, then just retry at the new offset.
				3683	*/
				3684	ret = io_iter_do_read(req, iter);
				3685	if (ret == -EIOCBQUEUED)
				3686	return 0;
				3687	/* we got some bytes, but not all. retry. */
				3688	kiocb->ki_flags &= ~IOCB_WAITQ;
				3689	iov_iter_restore(iter, state);
				3690	} while (ret > 0);
				3691	done:
				3692	kiocb_done(kiocb, ret, issue_flags);
				3693	out_free:
				3694	/* it's faster to check here then delegate to kfree */
				3695	if (iovec)
				3696	kfree(iovec);
				3697	return 0;
				3698	}
				3699
				3700	static int io_write_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				3701	{
				3702	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
				3703	return -EBADF;
				3704	return io_prep_rw(req, sqe, WRITE);
				3705	}
				3706
				3707	static int io_write(struct io_kiocb *req, unsigned int issue_flags)
				3708	{
				3709	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
				3710	struct kiocb *kiocb = &req->rw.kiocb;
				3711	struct iov_iter __iter, *iter = &__iter;
				3712	struct io_async_rw *rw = req->async_data;
				3713	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
				3714	struct iov_iter_state __state, *state;
				3715	ssize_t ret, ret2;
Dylan Yudaken	3ad7306	2022-02-22 02:55:03 -0800	[diff] [blame]	3716	loff_t *ppos;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3717
				3718	if (rw) {
				3719	iter = &rw->iter;
				3720	state = &rw->iter_state;
				3721	iov_iter_restore(iter, state);
				3722	iovec = NULL;
				3723	} else {
				3724	ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
				3725	if (ret < 0)
				3726	return ret;
				3727	state = &__state;
				3728	iov_iter_save_state(iter, state);
				3729	}
				3730	req->result = iov_iter_count(iter);
				3731
				3732	/* Ensure we clear previously set non-block flag */
				3733	if (!force_nonblock)
				3734	kiocb->ki_flags &= ~IOCB_NOWAIT;
				3735	else
				3736	kiocb->ki_flags \|= IOCB_NOWAIT;
				3737
				3738	/* If the file doesn't support async, just async punt */
				3739	if (force_nonblock && !io_file_supports_nowait(req, WRITE))
				3740	goto copy_iov;
				3741
				3742	/* file path doesn't support NOWAIT for non-direct_IO */
				3743	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
				3744	(req->flags & REQ_F_ISREG))
				3745	goto copy_iov;
				3746
Dylan Yudaken	3ad7306	2022-02-22 02:55:03 -0800	[diff] [blame]	3747	ppos = io_kiocb_update_pos(req);
Dylan Yudaken	26b7575	2022-02-22 02:55:02 -0800	[diff] [blame]	3748
Dylan Yudaken	3ad7306	2022-02-22 02:55:03 -0800	[diff] [blame]	3749	ret = rw_verify_area(WRITE, req->file, ppos, req->result);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3750	if (unlikely(ret))
				3751	goto out_free;
				3752
				3753	/*
				3754	* Open-code file_start_write here to grab freeze protection,
				3755	* which will be released by another thread in
				3756	* io_complete_rw(). Fool lockdep by telling it the lock got
				3757	* released so that it doesn't complain about the held lock when
				3758	* we return to userspace.
				3759	*/
				3760	if (req->flags & REQ_F_ISREG) {
				3761	sb_start_write(file_inode(req->file)->i_sb);
				3762	__sb_writers_release(file_inode(req->file)->i_sb,
				3763	SB_FREEZE_WRITE);
				3764	}
				3765	kiocb->ki_flags \|= IOCB_WRITE;
				3766
				3767	if (req->file->f_op->write_iter)
				3768	ret2 = call_write_iter(req->file, kiocb, iter);
				3769	else if (req->file->f_op->write)
				3770	ret2 = loop_rw_iter(WRITE, req, iter);
				3771	else
				3772	ret2 = -EINVAL;
				3773
				3774	if (req->flags & REQ_F_REISSUE) {
				3775	req->flags &= ~REQ_F_REISSUE;
				3776	ret2 = -EAGAIN;
				3777	}
				3778
				3779	/*
				3780	* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
				3781	* retry them without IOCB_NOWAIT.
				3782	*/
				3783	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
				3784	ret2 = -EAGAIN;
				3785	/* no retry on NONBLOCK nor RWF_NOWAIT */
				3786	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
				3787	goto done;
				3788	if (!force_nonblock \|\| ret2 != -EAGAIN) {
				3789	/* IOPOLL retry should happen for io-wq threads */
				3790	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
				3791	goto copy_iov;
				3792	done:
				3793	kiocb_done(kiocb, ret2, issue_flags);
				3794	} else {
				3795	copy_iov:
				3796	iov_iter_restore(iter, state);
				3797	ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
				3798	if (!ret) {
				3799	if (kiocb->ki_flags & IOCB_WRITE)
				3800	kiocb_end_write(req);
				3801	return -EAGAIN;
				3802	}
				3803	return ret;
				3804	}
				3805	out_free:
				3806	/* it's reportedly faster than delegating the null check to kfree() */
				3807	if (iovec)
				3808	kfree(iovec);
				3809	return ret;
				3810	}
				3811
				3812	static int io_renameat_prep(struct io_kiocb *req,
				3813	const struct io_uring_sqe *sqe)
				3814	{
				3815	struct io_rename *ren = &req->rename;
				3816	const char __user oldf, newf;
				3817
				3818	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				3819	return -EINVAL;
				3820	if (sqe->ioprio \|\| sqe->buf_index \|\| sqe->splice_fd_in)
				3821	return -EINVAL;
				3822	if (unlikely(req->flags & REQ_F_FIXED_FILE))
				3823	return -EBADF;
				3824
				3825	ren->old_dfd = READ_ONCE(sqe->fd);
				3826	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
				3827	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
				3828	ren->new_dfd = READ_ONCE(sqe->len);
				3829	ren->flags = READ_ONCE(sqe->rename_flags);
				3830
				3831	ren->oldpath = getname(oldf);
				3832	if (IS_ERR(ren->oldpath))
				3833	return PTR_ERR(ren->oldpath);
				3834
				3835	ren->newpath = getname(newf);
				3836	if (IS_ERR(ren->newpath)) {
				3837	putname(ren->oldpath);
				3838	return PTR_ERR(ren->newpath);
				3839	}
				3840
				3841	req->flags \|= REQ_F_NEED_CLEANUP;
				3842	return 0;
				3843	}
				3844
				3845	static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
				3846	{
				3847	struct io_rename *ren = &req->rename;
				3848	int ret;
				3849
				3850	if (issue_flags & IO_URING_F_NONBLOCK)
				3851	return -EAGAIN;
				3852
				3853	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
				3854	ren->newpath, ren->flags);
				3855
				3856	req->flags &= ~REQ_F_NEED_CLEANUP;
				3857	if (ret < 0)
				3858	req_set_fail(req);
				3859	io_req_complete(req, ret);
				3860	return 0;
				3861	}
				3862
				3863	static int io_unlinkat_prep(struct io_kiocb *req,
				3864	const struct io_uring_sqe *sqe)
				3865	{
				3866	struct io_unlink *un = &req->unlink;
				3867	const char __user *fname;
				3868
				3869	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				3870	return -EINVAL;
				3871	if (sqe->ioprio \|\| sqe->off \|\| sqe->len \|\| sqe->buf_index \|\|
				3872	sqe->splice_fd_in)
				3873	return -EINVAL;
				3874	if (unlikely(req->flags & REQ_F_FIXED_FILE))
				3875	return -EBADF;
				3876
				3877	un->dfd = READ_ONCE(sqe->fd);
				3878
				3879	un->flags = READ_ONCE(sqe->unlink_flags);
				3880	if (un->flags & ~AT_REMOVEDIR)
				3881	return -EINVAL;
				3882
				3883	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
				3884	un->filename = getname(fname);
				3885	if (IS_ERR(un->filename))
				3886	return PTR_ERR(un->filename);
				3887
				3888	req->flags \|= REQ_F_NEED_CLEANUP;
				3889	return 0;
				3890	}
				3891
				3892	static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
				3893	{
				3894	struct io_unlink *un = &req->unlink;
				3895	int ret;
				3896
				3897	if (issue_flags & IO_URING_F_NONBLOCK)
				3898	return -EAGAIN;
				3899
				3900	if (un->flags & AT_REMOVEDIR)
				3901	ret = do_rmdir(un->dfd, un->filename);
				3902	else
				3903	ret = do_unlinkat(un->dfd, un->filename);
				3904
				3905	req->flags &= ~REQ_F_NEED_CLEANUP;
				3906	if (ret < 0)
				3907	req_set_fail(req);
				3908	io_req_complete(req, ret);
				3909	return 0;
				3910	}
				3911
				3912	static int io_shutdown_prep(struct io_kiocb *req,
				3913	const struct io_uring_sqe *sqe)
				3914	{
				3915	#if defined(CONFIG_NET)
				3916	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				3917	return -EINVAL;
				3918	if (unlikely(sqe->ioprio \|\| sqe->off \|\| sqe->addr \|\| sqe->rw_flags \|\|
				3919	sqe->buf_index \|\| sqe->splice_fd_in))
				3920	return -EINVAL;
				3921
				3922	req->shutdown.how = READ_ONCE(sqe->len);
				3923	return 0;
				3924	#else
				3925	return -EOPNOTSUPP;
				3926	#endif
				3927	}
				3928
				3929	static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
				3930	{
				3931	#if defined(CONFIG_NET)
				3932	struct socket *sock;
				3933	int ret;
				3934
				3935	if (issue_flags & IO_URING_F_NONBLOCK)
				3936	return -EAGAIN;
				3937
				3938	sock = sock_from_file(req->file, &ret);
				3939	if (unlikely(!sock))
				3940	return ret;
				3941
				3942	ret = __sys_shutdown_sock(sock, req->shutdown.how);
				3943	if (ret < 0)
				3944	req_set_fail(req);
				3945	io_req_complete(req, ret);
				3946	return 0;
				3947	#else
				3948	return -EOPNOTSUPP;
				3949	#endif
				3950	}
				3951
				3952	static int __io_splice_prep(struct io_kiocb *req,
				3953	const struct io_uring_sqe *sqe)
				3954	{
				3955	struct io_splice *sp = &req->splice;
				3956	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED \| SPLICE_F_ALL;
				3957
				3958	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				3959	return -EINVAL;
				3960
				3961	sp->len = READ_ONCE(sqe->len);
				3962	sp->flags = READ_ONCE(sqe->splice_flags);
				3963	if (unlikely(sp->flags & ~valid_flags))
				3964	return -EINVAL;
				3965	sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
				3966	return 0;
				3967	}
				3968
				3969	static int io_tee_prep(struct io_kiocb *req,
				3970	const struct io_uring_sqe *sqe)
				3971	{
				3972	if (READ_ONCE(sqe->splice_off_in) \|\| READ_ONCE(sqe->off))
				3973	return -EINVAL;
				3974	return __io_splice_prep(req, sqe);
				3975	}
				3976
				3977	static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
				3978	{
				3979	struct io_splice *sp = &req->splice;
				3980	struct file *out = sp->file_out;
				3981	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
				3982	struct file *in;
				3983	long ret = 0;
				3984
				3985	if (issue_flags & IO_URING_F_NONBLOCK)
				3986	return -EAGAIN;
				3987
				3988	in = io_file_get(req->ctx, req, sp->splice_fd_in,
Bing-Jhong Billy Jheng	be56ff5	2023-03-02 21:00:06 +0800	[diff] [blame]	3989	(sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	3990	if (!in) {
				3991	ret = -EBADF;
				3992	goto done;
				3993	}
				3994
				3995	if (sp->len)
				3996	ret = do_tee(in, out, sp->len, flags);
				3997
				3998	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
				3999	io_put_file(in);
				4000	done:
				4001	if (ret != sp->len)
				4002	req_set_fail(req);
				4003	io_req_complete(req, ret);
				4004	return 0;
				4005	}
				4006
				4007	static int io_splice_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4008	{
				4009	struct io_splice *sp = &req->splice;
				4010
				4011	sp->off_in = READ_ONCE(sqe->splice_off_in);
				4012	sp->off_out = READ_ONCE(sqe->off);
				4013	return __io_splice_prep(req, sqe);
				4014	}
				4015
				4016	static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
				4017	{
				4018	struct io_splice *sp = &req->splice;
				4019	struct file *out = sp->file_out;
				4020	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
				4021	loff_t poff_in, poff_out;
				4022	struct file *in;
				4023	long ret = 0;
				4024
				4025	if (issue_flags & IO_URING_F_NONBLOCK)
				4026	return -EAGAIN;
				4027
				4028	in = io_file_get(req->ctx, req, sp->splice_fd_in,
Bing-Jhong Billy Jheng	be56ff5	2023-03-02 21:00:06 +0800	[diff] [blame]	4029	(sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	4030	if (!in) {
				4031	ret = -EBADF;
				4032	goto done;
				4033	}
				4034
				4035	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
				4036	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
				4037
				4038	if (sp->len)
				4039	ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
				4040
				4041	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
				4042	io_put_file(in);
				4043	done:
				4044	if (ret != sp->len)
				4045	req_set_fail(req);
				4046	io_req_complete(req, ret);
				4047	return 0;
				4048	}
				4049
				4050	/*
				4051	* IORING_OP_NOP just posts a completion event, nothing else.
				4052	*/
				4053	static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
				4054	{
				4055	struct io_ring_ctx *ctx = req->ctx;
				4056
				4057	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
				4058	return -EINVAL;
				4059
				4060	__io_req_complete(req, issue_flags, 0, 0);
				4061	return 0;
				4062	}
				4063
				4064	static int io_fsync_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4065	{
				4066	struct io_ring_ctx *ctx = req->ctx;
				4067
				4068	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
				4069	return -EINVAL;
				4070	if (unlikely(sqe->addr \|\| sqe->ioprio \|\| sqe->buf_index \|\|
				4071	sqe->splice_fd_in))
				4072	return -EINVAL;
				4073
				4074	req->sync.flags = READ_ONCE(sqe->fsync_flags);
				4075	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
				4076	return -EINVAL;
				4077
				4078	req->sync.off = READ_ONCE(sqe->off);
				4079	req->sync.len = READ_ONCE(sqe->len);
				4080	return 0;
				4081	}
				4082
				4083	static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
				4084	{
				4085	loff_t end = req->sync.off + req->sync.len;
				4086	int ret;
				4087
				4088	/* fsync always requires a blocking context */
				4089	if (issue_flags & IO_URING_F_NONBLOCK)
				4090	return -EAGAIN;
				4091
				4092	ret = vfs_fsync_range(req->file, req->sync.off,
				4093	end > 0 ? end : LLONG_MAX,
				4094	req->sync.flags & IORING_FSYNC_DATASYNC);
				4095	if (ret < 0)
				4096	req_set_fail(req);
				4097	io_req_complete(req, ret);
				4098	return 0;
				4099	}
				4100
				4101	static int io_fallocate_prep(struct io_kiocb *req,
				4102	const struct io_uring_sqe *sqe)
				4103	{
				4104	if (sqe->ioprio \|\| sqe->buf_index \|\| sqe->rw_flags \|\|
				4105	sqe->splice_fd_in)
				4106	return -EINVAL;
				4107	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				4108	return -EINVAL;
				4109
				4110	req->sync.off = READ_ONCE(sqe->off);
				4111	req->sync.len = READ_ONCE(sqe->addr);
				4112	req->sync.mode = READ_ONCE(sqe->len);
				4113	return 0;
				4114	}
				4115
				4116	static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
				4117	{
				4118	int ret;
				4119
				4120	/* fallocate always requiring blocking context */
				4121	if (issue_flags & IO_URING_F_NONBLOCK)
				4122	return -EAGAIN;
				4123	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
				4124	req->sync.len);
				4125	if (ret < 0)
				4126	req_set_fail(req);
				4127	else
				4128	fsnotify_modify(req->file);
				4129	io_req_complete(req, ret);
				4130	return 0;
				4131	}
				4132
				4133	static int __io_openat_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4134	{
				4135	const char __user *fname;
				4136	int ret;
				4137
				4138	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				4139	return -EINVAL;
				4140	if (unlikely(sqe->ioprio \|\| sqe->buf_index))
				4141	return -EINVAL;
				4142	if (unlikely(req->flags & REQ_F_FIXED_FILE))
				4143	return -EBADF;
				4144
				4145	/* open.how should be already initialised */
				4146	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
				4147	req->open.how.flags \|= O_LARGEFILE;
				4148
				4149	req->open.dfd = READ_ONCE(sqe->fd);
				4150	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
				4151	req->open.filename = getname(fname);
				4152	if (IS_ERR(req->open.filename)) {
				4153	ret = PTR_ERR(req->open.filename);
				4154	req->open.filename = NULL;
				4155	return ret;
				4156	}
				4157
				4158	req->open.file_slot = READ_ONCE(sqe->file_index);
				4159	if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
				4160	return -EINVAL;
				4161
				4162	req->open.nofile = rlimit(RLIMIT_NOFILE);
				4163	req->flags \|= REQ_F_NEED_CLEANUP;
				4164	return 0;
				4165	}
				4166
				4167	static int io_openat_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4168	{
				4169	u64 mode = READ_ONCE(sqe->len);
				4170	u64 flags = READ_ONCE(sqe->open_flags);
				4171
				4172	req->open.how = build_open_how(flags, mode);
				4173	return __io_openat_prep(req, sqe);
				4174	}
				4175
				4176	static int io_openat2_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4177	{
				4178	struct open_how __user *how;
				4179	size_t len;
				4180	int ret;
				4181
				4182	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
				4183	len = READ_ONCE(sqe->len);
				4184	if (len < OPEN_HOW_SIZE_VER0)
				4185	return -EINVAL;
				4186
				4187	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
				4188	len);
				4189	if (ret)
				4190	return ret;
				4191
				4192	return __io_openat_prep(req, sqe);
				4193	}
				4194
				4195	static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
				4196	{
				4197	struct open_flags op;
				4198	struct file *file;
				4199	bool resolve_nonblock, nonblock_set;
				4200	bool fixed = !!req->open.file_slot;
				4201	int ret;
				4202
				4203	ret = build_open_flags(&req->open.how, &op);
				4204	if (ret)
				4205	goto err;
				4206	nonblock_set = op.open_flag & O_NONBLOCK;
				4207	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
				4208	if (issue_flags & IO_URING_F_NONBLOCK) {
				4209	/*
				4210	* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
				4211	* it'll always -EAGAIN
				4212	*/
				4213	if (req->open.how.flags & (O_TRUNC \| O_CREAT \| O_TMPFILE))
				4214	return -EAGAIN;
				4215	op.lookup_flags \|= LOOKUP_CACHED;
				4216	op.open_flag \|= O_NONBLOCK;
				4217	}
				4218
				4219	if (!fixed) {
				4220	ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
				4221	if (ret < 0)
				4222	goto err;
				4223	}
				4224
				4225	file = do_filp_open(req->open.dfd, req->open.filename, &op);
				4226	if (IS_ERR(file)) {
				4227	/*
				4228	* We could hang on to this 'fd' on retrying, but seems like
				4229	* marginal gain for something that is now known to be a slower
				4230	* path. So just put it, and we'll get a new one when we retry.
				4231	*/
				4232	if (!fixed)
				4233	put_unused_fd(ret);
				4234
				4235	ret = PTR_ERR(file);
				4236	/* only retry if RESOLVE_CACHED wasn't already set by application */
				4237	if (ret == -EAGAIN &&
				4238	(!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
				4239	return -EAGAIN;
				4240	goto err;
				4241	}
				4242
				4243	if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
				4244	file->f_flags &= ~O_NONBLOCK;
				4245	fsnotify_open(file);
				4246
				4247	if (!fixed)
				4248	fd_install(ret, file);
				4249	else
				4250	ret = io_install_fixed_file(req, file, issue_flags,
				4251	req->open.file_slot - 1);
				4252	err:
				4253	putname(req->open.filename);
				4254	req->flags &= ~REQ_F_NEED_CLEANUP;
				4255	if (ret < 0)
				4256	req_set_fail(req);
				4257	__io_req_complete(req, issue_flags, ret, 0);
				4258	return 0;
				4259	}
				4260
				4261	static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
				4262	{
				4263	return io_openat2(req, issue_flags);
				4264	}
				4265
				4266	static int io_remove_buffers_prep(struct io_kiocb *req,
				4267	const struct io_uring_sqe *sqe)
				4268	{
				4269	struct io_provide_buf *p = &req->pbuf;
				4270	u64 tmp;
				4271
				4272	if (sqe->ioprio \|\| sqe->rw_flags \|\| sqe->addr \|\| sqe->len \|\| sqe->off \|\|
				4273	sqe->splice_fd_in)
				4274	return -EINVAL;
				4275
				4276	tmp = READ_ONCE(sqe->fd);
				4277	if (!tmp \|\| tmp > USHRT_MAX)
				4278	return -EINVAL;
				4279
				4280	memset(p, 0, sizeof(*p));
				4281	p->nbufs = tmp;
				4282	p->bgid = READ_ONCE(sqe->buf_group);
				4283	return 0;
				4284	}
				4285
				4286	static int __io_remove_buffers(struct io_ring_ctx ctx, struct io_buffer buf,
				4287	int bgid, unsigned nbufs)
				4288	{
				4289	unsigned i = 0;
				4290
				4291	/* shouldn't happen */
				4292	if (!nbufs)
				4293	return 0;
				4294
				4295	/* the head kbuf is the list itself */
				4296	while (!list_empty(&buf->list)) {
				4297	struct io_buffer *nxt;
				4298
				4299	nxt = list_first_entry(&buf->list, struct io_buffer, list);
				4300	list_del(&nxt->list);
				4301	kfree(nxt);
				4302	if (++i == nbufs)
				4303	return i;
				4304	cond_resched();
				4305	}
				4306	i++;
				4307	kfree(buf);
				4308	xa_erase(&ctx->io_buffers, bgid);
				4309
				4310	return i;
				4311	}
				4312
				4313	static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
				4314	{
				4315	struct io_provide_buf *p = &req->pbuf;
				4316	struct io_ring_ctx *ctx = req->ctx;
				4317	struct io_buffer *head;
				4318	int ret = 0;
				4319	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
				4320
				4321	io_ring_submit_lock(ctx, !force_nonblock);
				4322
				4323	lockdep_assert_held(&ctx->uring_lock);
				4324
				4325	ret = -ENOENT;
				4326	head = xa_load(&ctx->io_buffers, p->bgid);
				4327	if (head)
				4328	ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
				4329	if (ret < 0)
				4330	req_set_fail(req);
				4331
				4332	/* complete before unlock, IOPOLL may need the lock */
				4333	__io_req_complete(req, issue_flags, ret, 0);
				4334	io_ring_submit_unlock(ctx, !force_nonblock);
				4335	return 0;
				4336	}
				4337
				4338	static int io_provide_buffers_prep(struct io_kiocb *req,
				4339	const struct io_uring_sqe *sqe)
				4340	{
				4341	unsigned long size, tmp_check;
				4342	struct io_provide_buf *p = &req->pbuf;
				4343	u64 tmp;
				4344
				4345	if (sqe->ioprio \|\| sqe->rw_flags \|\| sqe->splice_fd_in)
				4346	return -EINVAL;
				4347
				4348	tmp = READ_ONCE(sqe->fd);
				4349	if (!tmp \|\| tmp > USHRT_MAX)
				4350	return -E2BIG;
				4351	p->nbufs = tmp;
				4352	p->addr = READ_ONCE(sqe->addr);
				4353	p->len = READ_ONCE(sqe->len);
				4354
				4355	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
				4356	&size))
				4357	return -EOVERFLOW;
				4358	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
				4359	return -EOVERFLOW;
				4360
				4361	size = (unsigned long)p->len * p->nbufs;
				4362	if (!access_ok(u64_to_user_ptr(p->addr), size))
				4363	return -EFAULT;
				4364
				4365	p->bgid = READ_ONCE(sqe->buf_group);
				4366	tmp = READ_ONCE(sqe->off);
				4367	if (tmp > USHRT_MAX)
				4368	return -E2BIG;
				4369	p->bid = tmp;
				4370	return 0;
				4371	}
				4372
				4373	static int io_add_buffers(struct io_provide_buf pbuf, struct io_buffer *head)
				4374	{
				4375	struct io_buffer *buf;
				4376	u64 addr = pbuf->addr;
				4377	int i, bid = pbuf->bid;
				4378
				4379	for (i = 0; i < pbuf->nbufs; i++) {
				4380	buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
				4381	if (!buf)
				4382	break;
				4383
				4384	buf->addr = addr;
				4385	buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
				4386	buf->bid = bid;
				4387	addr += pbuf->len;
				4388	bid++;
				4389	if (!*head) {
				4390	INIT_LIST_HEAD(&buf->list);
				4391	*head = buf;
				4392	} else {
				4393	list_add_tail(&buf->list, &(*head)->list);
				4394	}
				4395	cond_resched();
				4396	}
				4397
				4398	return i ? i : -ENOMEM;
				4399	}
				4400
				4401	static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
				4402	{
				4403	struct io_provide_buf *p = &req->pbuf;
				4404	struct io_ring_ctx *ctx = req->ctx;
				4405	struct io_buffer head, list;
				4406	int ret = 0;
				4407	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
				4408
				4409	io_ring_submit_lock(ctx, !force_nonblock);
				4410
				4411	lockdep_assert_held(&ctx->uring_lock);
				4412
				4413	list = head = xa_load(&ctx->io_buffers, p->bgid);
				4414
				4415	ret = io_add_buffers(p, &head);
				4416	if (ret >= 0 && !list) {
				4417	ret = xa_insert(&ctx->io_buffers, p->bgid, head,
				4418	GFP_KERNEL_ACCOUNT);
				4419	if (ret < 0)
				4420	__io_remove_buffers(ctx, head, p->bgid, -1U);
				4421	}
				4422	if (ret < 0)
				4423	req_set_fail(req);
				4424	/* complete before unlock, IOPOLL may need the lock */
				4425	__io_req_complete(req, issue_flags, ret, 0);
				4426	io_ring_submit_unlock(ctx, !force_nonblock);
				4427	return 0;
				4428	}
				4429
				4430	static int io_epoll_ctl_prep(struct io_kiocb *req,
				4431	const struct io_uring_sqe *sqe)
				4432	{
				4433	#if defined(CONFIG_EPOLL)
				4434	if (sqe->ioprio \|\| sqe->buf_index \|\| sqe->splice_fd_in)
				4435	return -EINVAL;
				4436	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				4437	return -EINVAL;
				4438
				4439	req->epoll.epfd = READ_ONCE(sqe->fd);
				4440	req->epoll.op = READ_ONCE(sqe->len);
				4441	req->epoll.fd = READ_ONCE(sqe->off);
				4442
				4443	if (ep_op_has_event(req->epoll.op)) {
				4444	struct epoll_event __user *ev;
				4445
				4446	ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
				4447	if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
				4448	return -EFAULT;
				4449	}
				4450
				4451	return 0;
				4452	#else
				4453	return -EOPNOTSUPP;
				4454	#endif
				4455	}
				4456
				4457	static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
				4458	{
				4459	#if defined(CONFIG_EPOLL)
				4460	struct io_epoll *ie = &req->epoll;
				4461	int ret;
				4462	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
				4463
				4464	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
				4465	if (force_nonblock && ret == -EAGAIN)
				4466	return -EAGAIN;
				4467
				4468	if (ret < 0)
				4469	req_set_fail(req);
				4470	__io_req_complete(req, issue_flags, ret, 0);
				4471	return 0;
				4472	#else
				4473	return -EOPNOTSUPP;
				4474	#endif
				4475	}
				4476
				4477	static int io_madvise_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4478	{
				4479	#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
				4480	if (sqe->ioprio \|\| sqe->buf_index \|\| sqe->off \|\| sqe->splice_fd_in)
				4481	return -EINVAL;
				4482	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				4483	return -EINVAL;
				4484
				4485	req->madvise.addr = READ_ONCE(sqe->addr);
				4486	req->madvise.len = READ_ONCE(sqe->len);
				4487	req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
				4488	return 0;
				4489	#else
				4490	return -EOPNOTSUPP;
				4491	#endif
				4492	}
				4493
				4494	static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
				4495	{
				4496	#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
				4497	struct io_madvise *ma = &req->madvise;
				4498	int ret;
				4499
				4500	if (issue_flags & IO_URING_F_NONBLOCK)
				4501	return -EAGAIN;
				4502
				4503	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
				4504	if (ret < 0)
				4505	req_set_fail(req);
				4506	io_req_complete(req, ret);
				4507	return 0;
				4508	#else
				4509	return -EOPNOTSUPP;
				4510	#endif
				4511	}
				4512
				4513	static int io_fadvise_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4514	{
				4515	if (sqe->ioprio \|\| sqe->buf_index \|\| sqe->addr \|\| sqe->splice_fd_in)
				4516	return -EINVAL;
				4517	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				4518	return -EINVAL;
				4519
				4520	req->fadvise.offset = READ_ONCE(sqe->off);
				4521	req->fadvise.len = READ_ONCE(sqe->len);
				4522	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
				4523	return 0;
				4524	}
				4525
				4526	static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
				4527	{
				4528	struct io_fadvise *fa = &req->fadvise;
				4529	int ret;
				4530
				4531	if (issue_flags & IO_URING_F_NONBLOCK) {
				4532	switch (fa->advice) {
				4533	case POSIX_FADV_NORMAL:
				4534	case POSIX_FADV_RANDOM:
				4535	case POSIX_FADV_SEQUENTIAL:
				4536	break;
				4537	default:
				4538	return -EAGAIN;
				4539	}
				4540	}
				4541
				4542	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
				4543	if (ret < 0)
				4544	req_set_fail(req);
				4545	__io_req_complete(req, issue_flags, ret, 0);
				4546	return 0;
				4547	}
				4548
				4549	static int io_statx_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4550	{
				4551	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				4552	return -EINVAL;
				4553	if (sqe->ioprio \|\| sqe->buf_index \|\| sqe->splice_fd_in)
				4554	return -EINVAL;
				4555	if (req->flags & REQ_F_FIXED_FILE)
				4556	return -EBADF;
				4557
				4558	req->statx.dfd = READ_ONCE(sqe->fd);
				4559	req->statx.mask = READ_ONCE(sqe->len);
				4560	req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
				4561	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
				4562	req->statx.flags = READ_ONCE(sqe->statx_flags);
				4563
				4564	return 0;
				4565	}
				4566
				4567	static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
				4568	{
				4569	struct io_statx *ctx = &req->statx;
				4570	int ret;
				4571
				4572	if (issue_flags & IO_URING_F_NONBLOCK)
				4573	return -EAGAIN;
				4574
				4575	ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
				4576	ctx->buffer);
				4577
				4578	if (ret < 0)
				4579	req_set_fail(req);
				4580	io_req_complete(req, ret);
				4581	return 0;
				4582	}
				4583
				4584	static int io_close_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4585	{
				4586	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				4587	return -EINVAL;
				4588	if (sqe->ioprio \|\| sqe->off \|\| sqe->addr \|\| sqe->len \|\|
				4589	sqe->rw_flags \|\| sqe->buf_index)
				4590	return -EINVAL;
				4591	if (req->flags & REQ_F_FIXED_FILE)
				4592	return -EBADF;
				4593
				4594	req->close.fd = READ_ONCE(sqe->fd);
				4595	req->close.file_slot = READ_ONCE(sqe->file_index);
				4596	if (req->close.file_slot && req->close.fd)
				4597	return -EINVAL;
				4598
				4599	return 0;
				4600	}
				4601
				4602	static int io_close(struct io_kiocb *req, unsigned int issue_flags)
				4603	{
				4604	struct files_struct *files = current->files;
				4605	struct io_close *close = &req->close;
				4606	struct fdtable *fdt;
				4607	struct file *file = NULL;
				4608	int ret = -EBADF;
				4609
				4610	if (req->close.file_slot) {
				4611	ret = io_close_fixed(req, issue_flags);
				4612	goto err;
				4613	}
				4614
				4615	spin_lock(&files->file_lock);
				4616	fdt = files_fdtable(files);
				4617	if (close->fd >= fdt->max_fds) {
				4618	spin_unlock(&files->file_lock);
				4619	goto err;
				4620	}
				4621	file = fdt->fd[close->fd];
				4622	if (!file \|\| file->f_op == &io_uring_fops) {
				4623	spin_unlock(&files->file_lock);
				4624	file = NULL;
				4625	goto err;
				4626	}
				4627
				4628	/* if the file has a flush method, be safe and punt to async */
				4629	if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
				4630	spin_unlock(&files->file_lock);
				4631	return -EAGAIN;
				4632	}
				4633
				4634	ret = __close_fd_get_file(close->fd, &file);
				4635	spin_unlock(&files->file_lock);
				4636	if (ret < 0) {
				4637	if (ret == -ENOENT)
				4638	ret = -EBADF;
				4639	goto err;
				4640	}
				4641
				4642	/* No ->flush() or already async, safely close from here */
				4643	ret = filp_close(file, current->files);
				4644	err:
				4645	if (ret < 0)
				4646	req_set_fail(req);
				4647	if (file)
				4648	fput(file);
				4649	__io_req_complete(req, issue_flags, ret, 0);
				4650	return 0;
				4651	}
				4652
				4653	static int io_sfr_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4654	{
				4655	struct io_ring_ctx *ctx = req->ctx;
				4656
				4657	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
				4658	return -EINVAL;
				4659	if (unlikely(sqe->addr \|\| sqe->ioprio \|\| sqe->buf_index \|\|
				4660	sqe->splice_fd_in))
				4661	return -EINVAL;
				4662
				4663	req->sync.off = READ_ONCE(sqe->off);
				4664	req->sync.len = READ_ONCE(sqe->len);
				4665	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
				4666	return 0;
				4667	}
				4668
				4669	static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
				4670	{
				4671	int ret;
				4672
				4673	/* sync_file_range always requires a blocking context */
				4674	if (issue_flags & IO_URING_F_NONBLOCK)
				4675	return -EAGAIN;
				4676
				4677	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
				4678	req->sync.flags);
				4679	if (ret < 0)
				4680	req_set_fail(req);
				4681	io_req_complete(req, ret);
				4682	return 0;
				4683	}
				4684
				4685	#if defined(CONFIG_NET)
Jens Axboe	5e76649	2022-04-20 19:21:36 -0600	[diff] [blame]	4686	static bool io_net_retry(struct socket *sock, int flags)
				4687	{
				4688	if (!(flags & MSG_WAITALL))
				4689	return false;
				4690	return sock->type == SOCK_STREAM \|\| sock->type == SOCK_SEQPACKET;
				4691	}
				4692
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	4693	static int io_setup_async_msg(struct io_kiocb *req,
				4694	struct io_async_msghdr *kmsg)
				4695	{
				4696	struct io_async_msghdr *async_msg = req->async_data;
				4697
				4698	if (async_msg)
				4699	return -EAGAIN;
				4700	if (io_alloc_async_data(req)) {
				4701	kfree(kmsg->free_iov);
				4702	return -ENOMEM;
				4703	}
				4704	async_msg = req->async_data;
				4705	req->flags \|= REQ_F_NEED_CLEANUP;
				4706	memcpy(async_msg, kmsg, sizeof(*kmsg));
				4707	if (async_msg->msg.msg_name)
				4708	async_msg->msg.msg_name = &async_msg->addr;
				4709	/* if were using fast_iov, set it to the new one */
Stefan Metzmacher	53880f9	2022-09-29 09:39:10 +0200	[diff] [blame]	4710	if (!kmsg->free_iov) {
				4711	size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
				4712	async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
				4713	}
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	4714
				4715	return -EAGAIN;
				4716	}
				4717
				4718	static int io_sendmsg_copy_hdr(struct io_kiocb *req,
				4719	struct io_async_msghdr *iomsg)
				4720	{
				4721	iomsg->msg.msg_name = &iomsg->addr;
				4722	iomsg->free_iov = iomsg->fast_iov;
				4723	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
				4724	req->sr_msg.msg_flags, &iomsg->free_iov);
				4725	}
				4726
				4727	static int io_sendmsg_prep_async(struct io_kiocb *req)
				4728	{
				4729	int ret;
				4730
				4731	ret = io_sendmsg_copy_hdr(req, req->async_data);
				4732	if (!ret)
				4733	req->flags \|= REQ_F_NEED_CLEANUP;
				4734	return ret;
				4735	}
				4736
				4737	static int io_sendmsg_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4738	{
				4739	struct io_sr_msg *sr = &req->sr_msg;
				4740
				4741	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				4742	return -EINVAL;
				4743	if (unlikely(sqe->addr2 \|\| sqe->file_index))
				4744	return -EINVAL;
				4745	if (unlikely(sqe->addr2 \|\| sqe->file_index \|\| sqe->ioprio))
				4746	return -EINVAL;
				4747
				4748	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
				4749	sr->len = READ_ONCE(sqe->len);
				4750	sr->msg_flags = READ_ONCE(sqe->msg_flags) \| MSG_NOSIGNAL;
				4751	if (sr->msg_flags & MSG_DONTWAIT)
				4752	req->flags \|= REQ_F_NOWAIT;
				4753
				4754	#ifdef CONFIG_COMPAT
				4755	if (req->ctx->compat)
				4756	sr->msg_flags \|= MSG_CMSG_COMPAT;
				4757	#endif
Jens Axboe	5e76649	2022-04-20 19:21:36 -0600	[diff] [blame]	4758	sr->done_io = 0;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	4759	return 0;
				4760	}
				4761
				4762	static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
				4763	{
				4764	struct io_async_msghdr iomsg, *kmsg;
Jens Axboe	5e76649	2022-04-20 19:21:36 -0600	[diff] [blame]	4765	struct io_sr_msg *sr = &req->sr_msg;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	4766	struct socket *sock;
				4767	unsigned flags;
				4768	int min_ret = 0;
				4769	int ret;
				4770
				4771	sock = sock_from_file(req->file, &ret);
				4772	if (unlikely(!sock))
				4773	return ret;
				4774
				4775	kmsg = req->async_data;
				4776	if (!kmsg) {
				4777	ret = io_sendmsg_copy_hdr(req, &iomsg);
				4778	if (ret)
				4779	return ret;
				4780	kmsg = &iomsg;
				4781	}
				4782
				4783	flags = req->sr_msg.msg_flags;
				4784	if (issue_flags & IO_URING_F_NONBLOCK)
				4785	flags \|= MSG_DONTWAIT;
				4786	if (flags & MSG_WAITALL)
				4787	min_ret = iov_iter_count(&kmsg->msg.msg_iter);
				4788
				4789	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	4790
Pavel Begunkov	6ee6efe	2021-11-23 00:07:47 +0000	[diff] [blame]	4791	if (ret < min_ret) {
				4792	if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
				4793	return io_setup_async_msg(req, kmsg);
				4794	if (ret == -ERESTARTSYS)
				4795	ret = -EINTR;
Jens Axboe	5e76649	2022-04-20 19:21:36 -0600	[diff] [blame]	4796	if (ret > 0 && io_net_retry(sock, flags)) {
				4797	sr->done_io += ret;
				4798	req->flags \|= REQ_F_PARTIAL_IO;
				4799	return io_setup_async_msg(req, kmsg);
				4800	}
Pavel Begunkov	6ee6efe	2021-11-23 00:07:47 +0000	[diff] [blame]	4801	req_set_fail(req);
				4802	}
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	4803	/* fast path, check for non-NULL to avoid function call */
				4804	if (kmsg->free_iov)
				4805	kfree(kmsg->free_iov);
				4806	req->flags &= ~REQ_F_NEED_CLEANUP;
Jens Axboe	5e76649	2022-04-20 19:21:36 -0600	[diff] [blame]	4807	if (ret >= 0)
				4808	ret += sr->done_io;
				4809	else if (sr->done_io)
				4810	ret = sr->done_io;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	4811	__io_req_complete(req, issue_flags, ret, 0);
				4812	return 0;
				4813	}
				4814
				4815	static int io_send(struct io_kiocb *req, unsigned int issue_flags)
				4816	{
				4817	struct io_sr_msg *sr = &req->sr_msg;
				4818	struct msghdr msg;
				4819	struct iovec iov;
				4820	struct socket *sock;
				4821	unsigned flags;
				4822	int min_ret = 0;
				4823	int ret;
				4824
				4825	sock = sock_from_file(req->file, &ret);
				4826	if (unlikely(!sock))
				4827	return ret;
				4828
				4829	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
				4830	if (unlikely(ret))
				4831	return ret;
				4832
				4833	msg.msg_name = NULL;
				4834	msg.msg_control = NULL;
				4835	msg.msg_controllen = 0;
				4836	msg.msg_namelen = 0;
				4837
				4838	flags = req->sr_msg.msg_flags;
				4839	if (issue_flags & IO_URING_F_NONBLOCK)
				4840	flags \|= MSG_DONTWAIT;
				4841	if (flags & MSG_WAITALL)
				4842	min_ret = iov_iter_count(&msg.msg_iter);
				4843
				4844	msg.msg_flags = flags;
				4845	ret = sock_sendmsg(sock, &msg);
Pavel Begunkov	6ee6efe	2021-11-23 00:07:47 +0000	[diff] [blame]	4846	if (ret < min_ret) {
				4847	if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
				4848	return -EAGAIN;
				4849	if (ret == -ERESTARTSYS)
				4850	ret = -EINTR;
Jens Axboe	5e76649	2022-04-20 19:21:36 -0600	[diff] [blame]	4851	if (ret > 0 && io_net_retry(sock, flags)) {
				4852	sr->len -= ret;
				4853	sr->buf += ret;
				4854	sr->done_io += ret;
				4855	req->flags \|= REQ_F_PARTIAL_IO;
				4856	return -EAGAIN;
				4857	}
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	4858	req_set_fail(req);
Pavel Begunkov	6ee6efe	2021-11-23 00:07:47 +0000	[diff] [blame]	4859	}
Jens Axboe	5e76649	2022-04-20 19:21:36 -0600	[diff] [blame]	4860	if (ret >= 0)
				4861	ret += sr->done_io;
				4862	else if (sr->done_io)
				4863	ret = sr->done_io;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	4864	__io_req_complete(req, issue_flags, ret, 0);
				4865	return 0;
				4866	}
				4867
				4868	static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
				4869	struct io_async_msghdr *iomsg)
				4870	{
				4871	struct io_sr_msg *sr = &req->sr_msg;
				4872	struct iovec __user *uiov;
				4873	size_t iov_len;
				4874	int ret;
				4875
				4876	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
				4877	&iomsg->uaddr, &uiov, &iov_len);
				4878	if (ret)
				4879	return ret;
				4880
				4881	if (req->flags & REQ_F_BUFFER_SELECT) {
				4882	if (iov_len > 1)
				4883	return -EINVAL;
				4884	if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
				4885	return -EFAULT;
				4886	sr->len = iomsg->fast_iov[0].iov_len;
				4887	iomsg->free_iov = NULL;
				4888	} else {
				4889	iomsg->free_iov = iomsg->fast_iov;
				4890	ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
				4891	&iomsg->free_iov, &iomsg->msg.msg_iter,
				4892	false);
				4893	if (ret > 0)
				4894	ret = 0;
				4895	}
				4896
				4897	return ret;
				4898	}
				4899
				4900	#ifdef CONFIG_COMPAT
				4901	static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
				4902	struct io_async_msghdr *iomsg)
				4903	{
				4904	struct io_sr_msg *sr = &req->sr_msg;
				4905	struct compat_iovec __user *uiov;
				4906	compat_uptr_t ptr;
				4907	compat_size_t len;
				4908	int ret;
				4909
				4910	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
				4911	&ptr, &len);
				4912	if (ret)
				4913	return ret;
				4914
				4915	uiov = compat_ptr(ptr);
				4916	if (req->flags & REQ_F_BUFFER_SELECT) {
				4917	compat_ssize_t clen;
				4918
				4919	if (len > 1)
				4920	return -EINVAL;
				4921	if (!access_ok(uiov, sizeof(*uiov)))
				4922	return -EFAULT;
				4923	if (__get_user(clen, &uiov->iov_len))
				4924	return -EFAULT;
				4925	if (clen < 0)
				4926	return -EINVAL;
				4927	sr->len = clen;
				4928	iomsg->free_iov = NULL;
				4929	} else {
				4930	iomsg->free_iov = iomsg->fast_iov;
				4931	ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
				4932	UIO_FASTIOV, &iomsg->free_iov,
				4933	&iomsg->msg.msg_iter, true);
				4934	if (ret < 0)
				4935	return ret;
				4936	}
				4937
				4938	return 0;
				4939	}
				4940	#endif
				4941
				4942	static int io_recvmsg_copy_hdr(struct io_kiocb *req,
				4943	struct io_async_msghdr *iomsg)
				4944	{
				4945	iomsg->msg.msg_name = &iomsg->addr;
				4946
				4947	#ifdef CONFIG_COMPAT
				4948	if (req->ctx->compat)
				4949	return __io_compat_recvmsg_copy_hdr(req, iomsg);
				4950	#endif
				4951
				4952	return __io_recvmsg_copy_hdr(req, iomsg);
				4953	}
				4954
				4955	static struct io_buffer io_recv_buffer_select(struct io_kiocb req,
				4956	bool needs_lock)
				4957	{
				4958	struct io_sr_msg *sr = &req->sr_msg;
				4959	struct io_buffer *kbuf;
				4960
				4961	kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
				4962	if (IS_ERR(kbuf))
				4963	return kbuf;
				4964
				4965	sr->kbuf = kbuf;
				4966	req->flags \|= REQ_F_BUFFER_SELECTED;
				4967	return kbuf;
				4968	}
				4969
				4970	static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
				4971	{
				4972	return io_put_kbuf(req, req->sr_msg.kbuf);
				4973	}
				4974
				4975	static int io_recvmsg_prep_async(struct io_kiocb *req)
				4976	{
				4977	int ret;
				4978
				4979	ret = io_recvmsg_copy_hdr(req, req->async_data);
				4980	if (!ret)
				4981	req->flags \|= REQ_F_NEED_CLEANUP;
				4982	return ret;
				4983	}
				4984
				4985	static int io_recvmsg_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				4986	{
				4987	struct io_sr_msg *sr = &req->sr_msg;
				4988
				4989	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				4990	return -EINVAL;
				4991	if (unlikely(sqe->addr2 \|\| sqe->file_index))
				4992	return -EINVAL;
				4993	if (unlikely(sqe->addr2 \|\| sqe->file_index \|\| sqe->ioprio))
				4994	return -EINVAL;
				4995
				4996	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
				4997	sr->len = READ_ONCE(sqe->len);
				4998	sr->bgid = READ_ONCE(sqe->buf_group);
				4999	sr->msg_flags = READ_ONCE(sqe->msg_flags) \| MSG_NOSIGNAL;
				5000	if (sr->msg_flags & MSG_DONTWAIT)
				5001	req->flags \|= REQ_F_NOWAIT;
				5002
				5003	#ifdef CONFIG_COMPAT
				5004	if (req->ctx->compat)
				5005	sr->msg_flags \|= MSG_CMSG_COMPAT;
				5006	#endif
Jens Axboe	82826a6	2023-01-21 10:21:22 -0700	[diff] [blame]	5007	sr->done_io = 0;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5008	return 0;
				5009	}
				5010
				5011	static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
				5012	{
				5013	struct io_async_msghdr iomsg, *kmsg;
Jens Axboe	82826a6	2023-01-21 10:21:22 -0700	[diff] [blame]	5014	struct io_sr_msg *sr = &req->sr_msg;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5015	struct socket *sock;
				5016	struct io_buffer *kbuf;
				5017	unsigned flags;
				5018	int min_ret = 0;
				5019	int ret, cflags = 0;
				5020	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
				5021
				5022	sock = sock_from_file(req->file, &ret);
				5023	if (unlikely(!sock))
				5024	return ret;
				5025
				5026	kmsg = req->async_data;
				5027	if (!kmsg) {
				5028	ret = io_recvmsg_copy_hdr(req, &iomsg);
				5029	if (ret)
				5030	return ret;
				5031	kmsg = &iomsg;
				5032	}
				5033
				5034	if (req->flags & REQ_F_BUFFER_SELECT) {
				5035	kbuf = io_recv_buffer_select(req, !force_nonblock);
				5036	if (IS_ERR(kbuf))
				5037	return PTR_ERR(kbuf);
				5038	kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
				5039	kmsg->fast_iov[0].iov_len = req->sr_msg.len;
				5040	iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
				5041	1, req->sr_msg.len);
				5042	}
				5043
				5044	flags = req->sr_msg.msg_flags;
				5045	if (force_nonblock)
				5046	flags \|= MSG_DONTWAIT;
				5047	if (flags & MSG_WAITALL)
				5048	min_ret = iov_iter_count(&kmsg->msg.msg_iter);
				5049
				5050	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
				5051	kmsg->uaddr, flags);
Pavel Begunkov	6ee6efe	2021-11-23 00:07:47 +0000	[diff] [blame]	5052	if (ret < min_ret) {
				5053	if (ret == -EAGAIN && force_nonblock)
				5054	return io_setup_async_msg(req, kmsg);
				5055	if (ret == -ERESTARTSYS)
				5056	ret = -EINTR;
Jens Axboe	82826a6	2023-01-21 10:21:22 -0700	[diff] [blame]	5057	if (ret > 0 && io_net_retry(sock, flags)) {
				5058	sr->done_io += ret;
Jens Axboe	c7d8511	2022-03-23 09:30:05 -0600	[diff] [blame]	5059	req->flags \|= REQ_F_PARTIAL_IO;
Jens Axboe	82826a6	2023-01-21 10:21:22 -0700	[diff] [blame]	5060	return io_setup_async_msg(req, kmsg);
				5061	}
Pavel Begunkov	6ee6efe	2021-11-23 00:07:47 +0000	[diff] [blame]	5062	req_set_fail(req);
				5063	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC \| MSG_CTRUNC))) {
				5064	req_set_fail(req);
				5065	}
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5066
				5067	if (req->flags & REQ_F_BUFFER_SELECTED)
				5068	cflags = io_put_recv_kbuf(req);
				5069	/* fast path, check for non-NULL to avoid function call */
				5070	if (kmsg->free_iov)
				5071	kfree(kmsg->free_iov);
				5072	req->flags &= ~REQ_F_NEED_CLEANUP;
Jens Axboe	82826a6	2023-01-21 10:21:22 -0700	[diff] [blame]	5073	if (ret >= 0)
				5074	ret += sr->done_io;
				5075	else if (sr->done_io)
				5076	ret = sr->done_io;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5077	__io_req_complete(req, issue_flags, ret, cflags);
				5078	return 0;
				5079	}
				5080
				5081	static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
				5082	{
				5083	struct io_buffer *kbuf;
				5084	struct io_sr_msg *sr = &req->sr_msg;
				5085	struct msghdr msg;
				5086	void __user *buf = sr->buf;
				5087	struct socket *sock;
				5088	struct iovec iov;
				5089	unsigned flags;
				5090	int min_ret = 0;
				5091	int ret, cflags = 0;
				5092	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
				5093
				5094	sock = sock_from_file(req->file, &ret);
				5095	if (unlikely(!sock))
				5096	return ret;
				5097
				5098	if (req->flags & REQ_F_BUFFER_SELECT) {
				5099	kbuf = io_recv_buffer_select(req, !force_nonblock);
				5100	if (IS_ERR(kbuf))
				5101	return PTR_ERR(kbuf);
				5102	buf = u64_to_user_ptr(kbuf->addr);
				5103	}
				5104
				5105	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
				5106	if (unlikely(ret))
				5107	goto out_free;
				5108
				5109	msg.msg_name = NULL;
				5110	msg.msg_control = NULL;
				5111	msg.msg_controllen = 0;
				5112	msg.msg_namelen = 0;
				5113	msg.msg_iocb = NULL;
				5114	msg.msg_flags = 0;
				5115
				5116	flags = req->sr_msg.msg_flags;
				5117	if (force_nonblock)
				5118	flags \|= MSG_DONTWAIT;
				5119	if (flags & MSG_WAITALL)
				5120	min_ret = iov_iter_count(&msg.msg_iter);
				5121
				5122	ret = sock_recvmsg(sock, &msg, flags);
Pavel Begunkov	6ee6efe	2021-11-23 00:07:47 +0000	[diff] [blame]	5123	if (ret < min_ret) {
				5124	if (ret == -EAGAIN && force_nonblock)
				5125	return -EAGAIN;
				5126	if (ret == -ERESTARTSYS)
				5127	ret = -EINTR;
Jens Axboe	82826a6	2023-01-21 10:21:22 -0700	[diff] [blame]	5128	if (ret > 0 && io_net_retry(sock, flags)) {
				5129	sr->len -= ret;
				5130	sr->buf += ret;
				5131	sr->done_io += ret;
Jens Axboe	c7d8511	2022-03-23 09:30:05 -0600	[diff] [blame]	5132	req->flags \|= REQ_F_PARTIAL_IO;
Jens Axboe	82826a6	2023-01-21 10:21:22 -0700	[diff] [blame]	5133	return -EAGAIN;
				5134	}
Pavel Begunkov	6ee6efe	2021-11-23 00:07:47 +0000	[diff] [blame]	5135	req_set_fail(req);
				5136	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC \| MSG_CTRUNC))) {
Alviro Iskandar Setiawan	c4b25ae	2022-02-07 21:05:33 +0700	[diff] [blame]	5137	out_free:
Pavel Begunkov	6ee6efe	2021-11-23 00:07:47 +0000	[diff] [blame]	5138	req_set_fail(req);
				5139	}
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5140	if (req->flags & REQ_F_BUFFER_SELECTED)
				5141	cflags = io_put_recv_kbuf(req);
Jens Axboe	82826a6	2023-01-21 10:21:22 -0700	[diff] [blame]	5142	if (ret >= 0)
				5143	ret += sr->done_io;
				5144	else if (sr->done_io)
				5145	ret = sr->done_io;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5146	__io_req_complete(req, issue_flags, ret, cflags);
				5147	return 0;
				5148	}
				5149
				5150	static int io_accept_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				5151	{
				5152	struct io_accept *accept = &req->accept;
				5153
				5154	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				5155	return -EINVAL;
				5156	if (sqe->ioprio \|\| sqe->len \|\| sqe->buf_index)
				5157	return -EINVAL;
				5158
				5159	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
				5160	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
				5161	accept->flags = READ_ONCE(sqe->accept_flags);
				5162	accept->nofile = rlimit(RLIMIT_NOFILE);
				5163
				5164	accept->file_slot = READ_ONCE(sqe->file_index);
				5165	if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
				5166	return -EINVAL;
				5167	if (accept->flags & ~(SOCK_CLOEXEC \| SOCK_NONBLOCK))
				5168	return -EINVAL;
				5169	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
				5170	accept->flags = (accept->flags & ~SOCK_NONBLOCK) \| O_NONBLOCK;
				5171	return 0;
				5172	}
				5173
				5174	static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
				5175	{
				5176	struct io_accept *accept = &req->accept;
				5177	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
				5178	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
				5179	bool fixed = !!accept->file_slot;
				5180	struct file *file;
				5181	int ret, fd;
				5182
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5183	if (!fixed) {
				5184	fd = __get_unused_fd_flags(accept->flags, accept->nofile);
				5185	if (unlikely(fd < 0))
				5186	return fd;
				5187	}
				5188	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
				5189	accept->flags);
				5190
				5191	if (IS_ERR(file)) {
				5192	if (!fixed)
				5193	put_unused_fd(fd);
				5194	ret = PTR_ERR(file);
Dylan Yudaken	bb135266	2023-01-21 09:13:12 -0700	[diff] [blame]	5195	/* safe to retry */
				5196	req->flags \|= REQ_F_PARTIAL_IO;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5197	if (ret == -EAGAIN && force_nonblock)
				5198	return -EAGAIN;
				5199	if (ret == -ERESTARTSYS)
				5200	ret = -EINTR;
				5201	req_set_fail(req);
				5202	} else if (!fixed) {
				5203	fd_install(fd, file);
				5204	ret = fd;
				5205	} else {
				5206	ret = io_install_fixed_file(req, file, issue_flags,
				5207	accept->file_slot - 1);
				5208	}
				5209	__io_req_complete(req, issue_flags, ret, 0);
				5210	return 0;
				5211	}
				5212
				5213	static int io_connect_prep_async(struct io_kiocb *req)
				5214	{
				5215	struct io_async_connect *io = req->async_data;
				5216	struct io_connect *conn = &req->connect;
				5217
				5218	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
				5219	}
				5220
				5221	static int io_connect_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				5222	{
				5223	struct io_connect *conn = &req->connect;
				5224
				5225	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				5226	return -EINVAL;
				5227	if (sqe->ioprio \|\| sqe->len \|\| sqe->buf_index \|\| sqe->rw_flags \|\|
				5228	sqe->splice_fd_in)
				5229	return -EINVAL;
				5230
				5231	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
				5232	conn->addr_len = READ_ONCE(sqe->addr2);
				5233	return 0;
				5234	}
				5235
				5236	static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
				5237	{
				5238	struct io_async_connect __io, *io;
				5239	unsigned file_flags;
				5240	int ret;
				5241	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
				5242
				5243	if (req->async_data) {
				5244	io = req->async_data;
				5245	} else {
				5246	ret = move_addr_to_kernel(req->connect.addr,
				5247	req->connect.addr_len,
				5248	&__io.address);
				5249	if (ret)
				5250	goto out;
				5251	io = &__io;
				5252	}
				5253
				5254	file_flags = force_nonblock ? O_NONBLOCK : 0;
				5255
				5256	ret = __sys_connect_file(req->file, &io->address,
				5257	req->connect.addr_len, file_flags);
				5258	if ((ret == -EAGAIN \|\| ret == -EINPROGRESS) && force_nonblock) {
				5259	if (req->async_data)
				5260	return -EAGAIN;
				5261	if (io_alloc_async_data(req)) {
				5262	ret = -ENOMEM;
				5263	goto out;
				5264	}
				5265	memcpy(req->async_data, &__io, sizeof(__io));
				5266	return -EAGAIN;
				5267	}
				5268	if (ret == -ERESTARTSYS)
				5269	ret = -EINTR;
				5270	out:
				5271	if (ret < 0)
				5272	req_set_fail(req);
				5273	__io_req_complete(req, issue_flags, ret, 0);
				5274	return 0;
				5275	}
				5276	#else /* !CONFIG_NET */
				5277	#define IO_NETOP_FN(op) \
				5278	static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
				5279	{ \
				5280	return -EOPNOTSUPP; \
				5281	}
				5282
				5283	#define IO_NETOP_PREP(op) \
				5284	IO_NETOP_FN(op) \
				5285	static int io_##op##_prep(struct io_kiocb req, const struct io_uring_sqe sqe) \
				5286	{ \
				5287	return -EOPNOTSUPP; \
				5288	} \
				5289
				5290	#define IO_NETOP_PREP_ASYNC(op) \
				5291	IO_NETOP_PREP(op) \
				5292	static int io_##op##_prep_async(struct io_kiocb *req) \
				5293	{ \
				5294	return -EOPNOTSUPP; \
				5295	}
				5296
				5297	IO_NETOP_PREP_ASYNC(sendmsg);
				5298	IO_NETOP_PREP_ASYNC(recvmsg);
				5299	IO_NETOP_PREP_ASYNC(connect);
				5300	IO_NETOP_PREP(accept);
				5301	IO_NETOP_FN(send);
				5302	IO_NETOP_FN(recv);
				5303	#endif /* CONFIG_NET */
				5304
				5305	struct io_poll_table {
				5306	struct poll_table_struct pt;
				5307	struct io_kiocb *req;
				5308	int nr_entries;
				5309	int error;
				5310	};
				5311
				5312	#define IO_POLL_CANCEL_FLAG BIT(31)
				5313	#define IO_POLL_RETRY_FLAG BIT(30)
				5314	#define IO_POLL_REF_MASK GENMASK(29, 0)
				5315
				5316	/*
				5317	* We usually have 1-2 refs taken, 128 is more than enough and we want to
				5318	* maximise the margin between this amount and the moment when it overflows.
				5319	*/
				5320	#define IO_POLL_REF_BIAS 128
				5321
				5322	static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
				5323	{
				5324	int v;
				5325
				5326	/*
				5327	* poll_refs are already elevated and we don't have much hope for
				5328	* grabbing the ownership. Instead of incrementing set a retry flag
				5329	* to notify the loop that there might have been some change.
				5330	*/
				5331	v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
				5332	if (v & IO_POLL_REF_MASK)
				5333	return false;
				5334	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
				5335	}
				5336
				5337	/*
				5338	* If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
				5339	* bump it and acquire ownership. It's disallowed to modify requests while not
				5340	* owning it, that prevents from races for enqueueing task_work's and b/w
				5341	* arming poll and wakeups.
				5342	*/
				5343	static inline bool io_poll_get_ownership(struct io_kiocb *req)
				5344	{
				5345	if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
				5346	return io_poll_get_ownership_slowpath(req);
				5347	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
				5348	}
				5349
				5350	static void io_poll_mark_cancelled(struct io_kiocb *req)
				5351	{
				5352	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
				5353	}
				5354
				5355	static struct io_poll_iocb io_poll_get_double(struct io_kiocb req)
				5356	{
				5357	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
				5358	if (req->opcode == IORING_OP_POLL_ADD)
				5359	return req->async_data;
				5360	return req->apoll->double_poll;
				5361	}
				5362
				5363	static struct io_poll_iocb io_poll_get_single(struct io_kiocb req)
				5364	{
				5365	if (req->opcode == IORING_OP_POLL_ADD)
				5366	return &req->poll;
				5367	return &req->apoll->poll;
				5368	}
				5369
				5370	static void io_poll_req_insert(struct io_kiocb *req)
				5371	{
				5372	struct io_ring_ctx *ctx = req->ctx;
				5373	struct hlist_head *list;
				5374
				5375	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
				5376	hlist_add_head(&req->hash_node, list);
				5377	}
				5378
				5379	static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
				5380	wait_queue_func_t wake_func)
				5381	{
				5382	poll->head = NULL;
				5383	#define IO_POLL_UNMASK (EPOLLERR\|EPOLLHUP\|EPOLLNVAL\|EPOLLRDHUP)
				5384	/* mask in events that we always want/need */
				5385	poll->events = events \| IO_POLL_UNMASK;
				5386	INIT_LIST_HEAD(&poll->wait.entry);
				5387	init_waitqueue_func_entry(&poll->wait, wake_func);
				5388	}
				5389
				5390	static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
				5391	{
				5392	struct wait_queue_head *head = smp_load_acquire(&poll->head);
				5393
				5394	if (head) {
				5395	spin_lock_irq(&head->lock);
				5396	list_del_init(&poll->wait.entry);
				5397	poll->head = NULL;
				5398	spin_unlock_irq(&head->lock);
				5399	}
				5400	}
				5401
				5402	static void io_poll_remove_entries(struct io_kiocb *req)
				5403	{
				5404	struct io_poll_iocb *poll = io_poll_get_single(req);
				5405	struct io_poll_iocb *poll_double = io_poll_get_double(req);
				5406
				5407	/*
				5408	* While we hold the waitqueue lock and the waitqueue is nonempty,
				5409	* wake_up_pollfree() will wait for us. However, taking the waitqueue
				5410	* lock in the first place can race with the waitqueue being freed.
				5411	*
				5412	* We solve this as eventpoll does: by taking advantage of the fact that
				5413	* all users of wake_up_pollfree() will RCU-delay the actual free. If
				5414	* we enter rcu_read_lock() and see that the pointer to the queue is
				5415	* non-NULL, we can then lock it without the memory being freed out from
				5416	* under us.
				5417	*
				5418	* Keep holding rcu_read_lock() as long as we hold the queue lock, in
				5419	* case the caller deletes the entry from the queue, leaving it empty.
				5420	* In that case, only RCU prevents the queue memory from being freed.
				5421	*/
				5422	rcu_read_lock();
				5423	io_poll_remove_entry(poll);
				5424	if (poll_double)
				5425	io_poll_remove_entry(poll_double);
				5426	rcu_read_unlock();
				5427	}
				5428
				5429	/*
				5430	* All poll tw should go through this. Checks for poll events, manages
				5431	* references, does rewait, etc.
				5432	*
				5433	* Returns a negative error on failure. >0 when no action require, which is
				5434	* either spurious wakeup or multishot CQE is served. 0 when it's done with
				5435	* the request, then the mask is stored in req->result.
				5436	*/
				5437	static int io_poll_check_events(struct io_kiocb *req)
				5438	{
				5439	struct io_ring_ctx *ctx = req->ctx;
				5440	struct io_poll_iocb *poll = io_poll_get_single(req);
				5441	int v;
				5442
				5443	/* req->task == current here, checking PF_EXITING is safe */
				5444	if (unlikely(req->task->flags & PF_EXITING))
				5445	io_poll_mark_cancelled(req);
				5446
				5447	do {
				5448	v = atomic_read(&req->poll_refs);
				5449
				5450	/* tw handler should be the owner, and so have some references */
				5451	if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
				5452	return 0;
				5453	if (v & IO_POLL_CANCEL_FLAG)
				5454	return -ECANCELED;
				5455	/*
				5456	* cqe.res contains only events of the first wake up
				5457	* and all others are be lost. Redo vfs_poll() to get
				5458	* up to date state.
				5459	*/
				5460	if ((v & IO_POLL_REF_MASK) != 1)
				5461	req->result = 0;
				5462	if (v & IO_POLL_RETRY_FLAG) {
				5463	req->result = 0;
				5464	/*
				5465	* We won't find new events that came in between
				5466	* vfs_poll and the ref put unless we clear the
				5467	* flag in advance.
				5468	*/
				5469	atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
				5470	v &= ~IO_POLL_RETRY_FLAG;
				5471	}
				5472
				5473	if (!req->result) {
				5474	struct poll_table_struct pt = { ._key = poll->events };
				5475
				5476	req->result = vfs_poll(req->file, &pt) & poll->events;
				5477	}
				5478
				5479	/* multishot, just fill an CQE and proceed */
				5480	if (req->result && !(poll->events & EPOLLONESHOT)) {
				5481	__poll_t mask = mangle_poll(req->result & poll->events);
				5482	bool filled;
				5483
				5484	spin_lock(&ctx->completion_lock);
				5485	filled = io_fill_cqe_aux(ctx, req->user_data, mask,
				5486	IORING_CQE_F_MORE);
				5487	io_commit_cqring(ctx);
				5488	spin_unlock(&ctx->completion_lock);
				5489	if (unlikely(!filled))
				5490	return -ECANCELED;
				5491	io_cqring_ev_posted(ctx);
				5492	} else if (req->result) {
				5493	return 0;
				5494	}
				5495
				5496	/* force the next iteration to vfs_poll() */
				5497	req->result = 0;
				5498
				5499	/*
				5500	* Release all references, retry if someone tried to restart
				5501	* task_work while we were executing it.
				5502	*/
				5503	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
				5504	IO_POLL_REF_MASK);
				5505
				5506	return 1;
				5507	}
				5508
				5509	static void io_poll_task_func(struct io_kiocb req, bool locked)
				5510	{
				5511	struct io_ring_ctx *ctx = req->ctx;
				5512	int ret;
				5513
				5514	ret = io_poll_check_events(req);
				5515	if (ret > 0)
				5516	return;
				5517
				5518	if (!ret) {
				5519	req->result = mangle_poll(req->result & req->poll.events);
				5520	} else {
				5521	req->result = ret;
				5522	req_set_fail(req);
				5523	}
				5524
				5525	io_poll_remove_entries(req);
				5526	spin_lock(&ctx->completion_lock);
				5527	hash_del(&req->hash_node);
				5528	spin_unlock(&ctx->completion_lock);
				5529	io_req_complete_post(req, req->result, 0);
				5530	}
				5531
				5532	static void io_apoll_task_func(struct io_kiocb req, bool locked)
				5533	{
				5534	struct io_ring_ctx *ctx = req->ctx;
				5535	int ret;
				5536
				5537	ret = io_poll_check_events(req);
				5538	if (ret > 0)
				5539	return;
				5540
				5541	io_poll_remove_entries(req);
				5542	spin_lock(&ctx->completion_lock);
				5543	hash_del(&req->hash_node);
				5544	spin_unlock(&ctx->completion_lock);
				5545
				5546	if (!ret)
				5547	io_req_task_submit(req, locked);
				5548	else
				5549	io_req_complete_failed(req, ret);
				5550	}
				5551
				5552	static void __io_poll_execute(struct io_kiocb *req, int mask)
				5553	{
				5554	req->result = mask;
				5555	if (req->opcode == IORING_OP_POLL_ADD)
				5556	req->io_task_work.func = io_poll_task_func;
				5557	else
				5558	req->io_task_work.func = io_apoll_task_func;
				5559
				5560	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
				5561	io_req_task_work_add(req);
				5562	}
				5563
				5564	static inline void io_poll_execute(struct io_kiocb *req, int res)
				5565	{
				5566	if (io_poll_get_ownership(req))
				5567	__io_poll_execute(req, res);
				5568	}
				5569
				5570	static void io_poll_cancel_req(struct io_kiocb *req)
				5571	{
				5572	io_poll_mark_cancelled(req);
				5573	/* kick tw, which should complete the request */
				5574	io_poll_execute(req, 0);
				5575	}
				5576
				5577	static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
				5578	void *key)
				5579	{
				5580	struct io_kiocb *req = wait->private;
				5581	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
				5582	wait);
				5583	__poll_t mask = key_to_poll(key);
				5584
				5585	if (unlikely(mask & POLLFREE)) {
				5586	io_poll_mark_cancelled(req);
				5587	/* we have to kick tw in case it's not already */
				5588	io_poll_execute(req, 0);
				5589
				5590	/*
				5591	* If the waitqueue is being freed early but someone is already
				5592	* holds ownership over it, we have to tear down the request as
				5593	* best we can. That means immediately removing the request from
				5594	* its waitqueue and preventing all further accesses to the
				5595	* waitqueue via the request.
				5596	*/
				5597	list_del_init(&poll->wait.entry);
				5598
				5599	/*
				5600	* Careful: this must be the last step, since as soon
				5601	* as req->head is NULL'ed out, the request can be
				5602	* completed and freed, since aio_poll_complete_work()
				5603	* will no longer need to take the waitqueue lock.
				5604	*/
				5605	smp_store_release(&poll->head, NULL);
				5606	return 1;
				5607	}
				5608
				5609	/* for instances that support it check for an event match first */
				5610	if (mask && !(mask & poll->events))
				5611	return 0;
				5612
Jens Axboe	b52fdbc	2022-12-23 07:04:49 -0700	[diff] [blame]	5613	if (io_poll_get_ownership(req)) {
				5614	/*
				5615	* If we trigger a multishot poll off our own wakeup path,
				5616	* disable multishot as there is a circular dependency between
				5617	* CQ posting and triggering the event.
				5618	*/
				5619	if (mask & EPOLL_URING_WAKE)
				5620	poll->events \|= EPOLLONESHOT;
				5621
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5622	__io_poll_execute(req, mask);
Jens Axboe	b52fdbc	2022-12-23 07:04:49 -0700	[diff] [blame]	5623	}
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5624	return 1;
				5625	}
				5626
				5627	static void __io_queue_proc(struct io_poll_iocb poll, struct io_poll_table pt,
				5628	struct wait_queue_head *head,
				5629	struct io_poll_iocb **poll_ptr)
				5630	{
				5631	struct io_kiocb *req = pt->req;
				5632
				5633	/*
				5634	* The file being polled uses multiple waitqueues for poll handling
				5635	* (e.g. one for read, one for write). Setup a separate io_poll_iocb
				5636	* if this happens.
				5637	*/
				5638	if (unlikely(pt->nr_entries)) {
				5639	struct io_poll_iocb *first = poll;
				5640
				5641	/* double add on the same waitqueue head, ignore */
				5642	if (first->head == head)
				5643	return;
				5644	/* already have a 2nd entry, fail a third attempt */
				5645	if (*poll_ptr) {
				5646	if ((*poll_ptr)->head == head)
				5647	return;
				5648	pt->error = -EINVAL;
				5649	return;
				5650	}
				5651
				5652	poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
				5653	if (!poll) {
				5654	pt->error = -ENOMEM;
				5655	return;
				5656	}
				5657	io_init_poll_iocb(poll, first->events, first->wait.func);
				5658	*poll_ptr = poll;
				5659	}
				5660
				5661	pt->nr_entries++;
				5662	poll->head = head;
				5663	poll->wait.private = req;
				5664
				5665	if (poll->events & EPOLLEXCLUSIVE)
				5666	add_wait_queue_exclusive(head, &poll->wait);
				5667	else
				5668	add_wait_queue(head, &poll->wait);
				5669	}
				5670
				5671	static void io_poll_queue_proc(struct file file, struct wait_queue_head head,
				5672	struct poll_table_struct *p)
				5673	{
				5674	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
				5675
				5676	__io_queue_proc(&pt->req->poll, pt, head,
				5677	(struct io_poll_iocb **) &pt->req->async_data);
				5678	}
				5679
				5680	static int __io_arm_poll_handler(struct io_kiocb *req,
				5681	struct io_poll_iocb *poll,
				5682	struct io_poll_table *ipt, __poll_t mask)
				5683	{
				5684	struct io_ring_ctx *ctx = req->ctx;
				5685
				5686	INIT_HLIST_NODE(&req->hash_node);
				5687	io_init_poll_iocb(poll, mask, io_poll_wake);
				5688	poll->file = req->file;
				5689	poll->wait.private = req;
				5690
				5691	ipt->pt._key = mask;
				5692	ipt->req = req;
				5693	ipt->error = 0;
				5694	ipt->nr_entries = 0;
				5695
				5696	/*
				5697	* Take the ownership to delay any tw execution up until we're done
				5698	* with poll arming. see io_poll_get_ownership().
				5699	*/
				5700	atomic_set(&req->poll_refs, 1);
				5701	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
				5702
				5703	if (mask && (poll->events & EPOLLONESHOT)) {
				5704	io_poll_remove_entries(req);
				5705	/* no one else has access to the req, forget about the ref */
				5706	return mask;
				5707	}
				5708	if (!mask && unlikely(ipt->error \|\| !ipt->nr_entries)) {
				5709	io_poll_remove_entries(req);
				5710	if (!ipt->error)
				5711	ipt->error = -EINVAL;
				5712	return 0;
				5713	}
				5714
				5715	spin_lock(&ctx->completion_lock);
				5716	io_poll_req_insert(req);
				5717	spin_unlock(&ctx->completion_lock);
				5718
				5719	if (mask) {
				5720	/* can't multishot if failed, just queue the event we've got */
				5721	if (unlikely(ipt->error \|\| !ipt->nr_entries)) {
				5722	poll->events \|= EPOLLONESHOT;
				5723	ipt->error = 0;
				5724	}
				5725	__io_poll_execute(req, mask);
				5726	return 0;
				5727	}
				5728
				5729	/*
				5730	* Try to release ownership. If we see a change of state, e.g.
				5731	* poll was waken up, queue up a tw, it'll deal with it.
				5732	*/
				5733	if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
				5734	__io_poll_execute(req, 0);
				5735	return 0;
				5736	}
				5737
				5738	static void io_async_queue_proc(struct file file, struct wait_queue_head head,
				5739	struct poll_table_struct *p)
				5740	{
				5741	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
				5742	struct async_poll *apoll = pt->req->apoll;
				5743
				5744	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
				5745	}
				5746
				/* Result codes returned by io_arm_poll_handler(). */
				enum {
					IO_APOLL_OK		= 0,	/* poll armed */
					IO_APOLL_ABORTED	= 1,	/* poll could not be armed */
					IO_APOLL_READY		= 2,	/* events were already pending */
				};
				5752
				5753	static int io_arm_poll_handler(struct io_kiocb *req)
				5754	{
				5755	const struct io_op_def *def = &io_op_defs[req->opcode];
				5756	struct io_ring_ctx *ctx = req->ctx;
				5757	struct async_poll *apoll;
				5758	struct io_poll_table ipt;
				5759	__poll_t mask = EPOLLONESHOT \| POLLERR \| POLLPRI;
				5760	int ret;
				5761
				5762	if (!req->file \|\| !file_can_poll(req->file))
				5763	return IO_APOLL_ABORTED;
Jens Axboe	7ac6f09	2023-01-21 10:39:22 -0700	[diff] [blame]	5764	if ((req->flags & (REQ_F_POLLED\|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5765	return IO_APOLL_ABORTED;
				5766	if (!def->pollin && !def->pollout)
				5767	return IO_APOLL_ABORTED;
				5768
				5769	if (def->pollin) {
				5770	mask \|= POLLIN \| POLLRDNORM;
				5771
				5772	/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
				5773	if ((req->opcode == IORING_OP_RECVMSG) &&
				5774	(req->sr_msg.msg_flags & MSG_ERRQUEUE))
				5775	mask &= ~POLLIN;
				5776	} else {
				5777	mask \|= POLLOUT \| POLLWRNORM;
				5778	}
				5779
Pavel Begunkov	321383f	2023-01-22 10:24:20 -0700	[diff] [blame]	5780	if (req->flags & REQ_F_POLLED) {
Jens Axboe	7ac6f09	2023-01-21 10:39:22 -0700	[diff] [blame]	5781	apoll = req->apoll;
Pavel Begunkov	321383f	2023-01-22 10:24:20 -0700	[diff] [blame]	5782	kfree(apoll->double_poll);
				5783	} else {
Jens Axboe	7ac6f09	2023-01-21 10:39:22 -0700	[diff] [blame]	5784	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
Pavel Begunkov	321383f	2023-01-22 10:24:20 -0700	[diff] [blame]	5785	}
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	5786	if (unlikely(!apoll))
				5787	return IO_APOLL_ABORTED;
				5788	apoll->double_poll = NULL;
				5789	req->apoll = apoll;
				5790	req->flags \|= REQ_F_POLLED;
				5791	ipt.pt._qproc = io_async_queue_proc;
				5792
				5793	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
				5794	if (ret \|\| ipt.error)
				5795	return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
				5796
				5797	trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
				5798	mask, apoll->poll.events);
				5799	return IO_APOLL_OK;
				5800	}
				5801
				5802	/*
				5803	* Returns true if we found and killed one or more poll requests
				5804	*/
				5805	static bool io_poll_remove_all(struct io_ring_ctx ctx, struct task_struct tsk,
				5806	bool cancel_all)
				5807	{
				5808	struct hlist_node *tmp;
				5809	struct io_kiocb *req;
				5810	bool found = false;
				5811	int i;
				5812
				5813	spin_lock(&ctx->completion_lock);
				5814	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
				5815	struct hlist_head *list;
				5816
				5817	list = &ctx->cancel_hash[i];
				5818	hlist_for_each_entry_safe(req, tmp, list, hash_node) {
				5819	if (io_match_task_safe(req, tsk, cancel_all)) {
				5820	hlist_del_init(&req->hash_node);
				5821	io_poll_cancel_req(req);
				5822	found = true;
				5823	}
				5824	}
				5825	}
				5826	spin_unlock(&ctx->completion_lock);
				5827	return found;
				5828	}
				5829
				5830	static struct io_kiocb io_poll_find(struct io_ring_ctx ctx, __u64 sqe_addr,
				5831	bool poll_only)
				5832	__must_hold(&ctx->completion_lock)
				5833	{
				5834	struct hlist_head *list;
				5835	struct io_kiocb *req;
				5836
				5837	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
				5838	hlist_for_each_entry(req, list, hash_node) {
				5839	if (sqe_addr != req->user_data)
				5840	continue;
				5841	if (poll_only && req->opcode != IORING_OP_POLL_ADD)
				5842	continue;
				5843	return req;
				5844	}
				5845	return NULL;
				5846	}
				5847
				5848	static bool io_poll_disarm(struct io_kiocb *req)
				5849	__must_hold(&ctx->completion_lock)
				5850	{
				5851	if (!io_poll_get_ownership(req))
				5852	return false;
				5853	io_poll_remove_entries(req);
				5854	hash_del(&req->hash_node);
				5855	return true;
				5856	}
				5857
				5858	static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
				5859	bool poll_only)
				5860	__must_hold(&ctx->completion_lock)
				5861	{
				5862	struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
				5863
				5864	if (!req)
				5865	return -ENOENT;
				5866	io_poll_cancel_req(req);
				5867	return 0;
				5868	}
				5869
				5870	static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
				5871	unsigned int flags)
				5872	{
				5873	u32 events;
				5874
				5875	events = READ_ONCE(sqe->poll32_events);
				5876	#ifdef __BIG_ENDIAN
				5877	events = swahw32(events);
				5878	#endif
				5879	if (!(flags & IORING_POLL_ADD_MULTI))
				5880	events \|= EPOLLONESHOT;
				5881	return demangle_poll(events) \| (events & (EPOLLEXCLUSIVE\|EPOLLONESHOT));
				5882	}
				5883
				5884	static int io_poll_update_prep(struct io_kiocb *req,
				5885	const struct io_uring_sqe *sqe)
				5886	{
				5887	struct io_poll_update *upd = &req->poll_update;
				5888	u32 flags;
				5889
				5890	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				5891	return -EINVAL;
				5892	if (sqe->ioprio \|\| sqe->buf_index \|\| sqe->splice_fd_in)
				5893	return -EINVAL;
				5894	flags = READ_ONCE(sqe->len);
				5895	if (flags & ~(IORING_POLL_UPDATE_EVENTS \| IORING_POLL_UPDATE_USER_DATA \|
				5896	IORING_POLL_ADD_MULTI))
				5897	return -EINVAL;
				5898	/* meaningless without update */
				5899	if (flags == IORING_POLL_ADD_MULTI)
				5900	return -EINVAL;
				5901
				5902	upd->old_user_data = READ_ONCE(sqe->addr);
				5903	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
				5904	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
				5905
				5906	upd->new_user_data = READ_ONCE(sqe->off);
				5907	if (!upd->update_user_data && upd->new_user_data)
				5908	return -EINVAL;
				5909	if (upd->update_events)
				5910	upd->events = io_poll_parse_events(sqe, flags);
				5911	else if (sqe->poll32_events)
				5912	return -EINVAL;
				5913
				5914	return 0;
				5915	}
				5916
				5917	static int io_poll_add_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				5918	{
				5919	struct io_poll_iocb *poll = &req->poll;
				5920	u32 flags;
				5921
				5922	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				5923	return -EINVAL;
				5924	if (sqe->ioprio \|\| sqe->buf_index \|\| sqe->off \|\| sqe->addr)
				5925	return -EINVAL;
				5926	flags = READ_ONCE(sqe->len);
				5927	if (flags & ~IORING_POLL_ADD_MULTI)
				5928	return -EINVAL;
				5929
				5930	io_req_set_refcount(req);
				5931	poll->events = io_poll_parse_events(sqe, flags);
				5932	return 0;
				5933	}
				5934
				5935	static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
				5936	{
				5937	struct io_poll_iocb *poll = &req->poll;
				5938	struct io_poll_table ipt;
				5939	int ret;
				5940
				5941	ipt.pt._qproc = io_poll_queue_proc;
				5942
				5943	ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
				5944	if (!ret && ipt.error)
				5945	req_set_fail(req);
				5946	ret = ret ?: ipt.error;
				5947	if (ret)
				5948	__io_req_complete(req, issue_flags, ret, 0);
				5949	return 0;
				5950	}
				5951
				5952	static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
				5953	{
				5954	struct io_ring_ctx *ctx = req->ctx;
				5955	struct io_kiocb *preq;
				5956	int ret2, ret = 0;
				5957
				5958	spin_lock(&ctx->completion_lock);
				5959	preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
				5960	if (!preq \|\| !io_poll_disarm(preq)) {
				5961	spin_unlock(&ctx->completion_lock);
				5962	ret = preq ? -EALREADY : -ENOENT;
				5963	goto out;
				5964	}
				5965	spin_unlock(&ctx->completion_lock);
				5966
				5967	if (req->poll_update.update_events \|\| req->poll_update.update_user_data) {
				5968	/* only mask one event flags, keep behavior flags */
				5969	if (req->poll_update.update_events) {
				5970	preq->poll.events &= ~0xffff;
				5971	preq->poll.events \|= req->poll_update.events & 0xffff;
				5972	preq->poll.events \|= IO_POLL_UNMASK;
				5973	}
				5974	if (req->poll_update.update_user_data)
				5975	preq->user_data = req->poll_update.new_user_data;
				5976
				5977	ret2 = io_poll_add(preq, issue_flags);
				5978	/* successfully updated, don't complete poll request */
				5979	if (!ret2)
				5980	goto out;
				5981	}
				5982	req_set_fail(preq);
				5983	io_req_complete(preq, -ECANCELED);
				5984	out:
				5985	if (ret < 0)
				5986	req_set_fail(req);
				5987	/* complete update request, we're done with it */
				5988	io_req_complete(req, ret);
				5989	return 0;
				5990	}
				5991
				5992	static void io_req_task_timeout(struct io_kiocb req, bool locked)
				5993	{
				5994	req_set_fail(req);
				5995	io_req_complete_post(req, -ETIME, 0);
				5996	}
				5997
				5998	static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
				5999	{
				6000	struct io_timeout_data *data = container_of(timer,
				6001	struct io_timeout_data, timer);
				6002	struct io_kiocb *req = data->req;
				6003	struct io_ring_ctx *ctx = req->ctx;
				6004	unsigned long flags;
				6005
				6006	spin_lock_irqsave(&ctx->timeout_lock, flags);
				6007	list_del_init(&req->timeout.list);
				6008	atomic_set(&req->ctx->cq_timeouts,
				6009	atomic_read(&req->ctx->cq_timeouts) + 1);
				6010	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
				6011
				6012	req->io_task_work.func = io_req_task_timeout;
				6013	io_req_task_work_add(req);
				6014	return HRTIMER_NORESTART;
				6015	}
				6016
				6017	static struct io_kiocb io_timeout_extract(struct io_ring_ctx ctx,
				6018	__u64 user_data)
				6019	__must_hold(&ctx->timeout_lock)
				6020	{
				6021	struct io_timeout_data *io;
				6022	struct io_kiocb *req;
				6023	bool found = false;
				6024
				6025	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
				6026	found = user_data == req->user_data;
				6027	if (found)
				6028	break;
				6029	}
				6030	if (!found)
				6031	return ERR_PTR(-ENOENT);
				6032
				6033	io = req->async_data;
				6034	if (hrtimer_try_to_cancel(&io->timer) == -1)
				6035	return ERR_PTR(-EALREADY);
				6036	list_del_init(&req->timeout.list);
				6037	return req;
				6038	}
				6039
				6040	static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
				6041	__must_hold(&ctx->completion_lock)
				6042	__must_hold(&ctx->timeout_lock)
				6043	{
				6044	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
				6045
				6046	if (IS_ERR(req))
				6047	return PTR_ERR(req);
				6048
				6049	req_set_fail(req);
				6050	io_fill_cqe_req(req, -ECANCELED, 0);
				6051	io_put_req_deferred(req);
				6052	return 0;
				6053	}
				6054
				6055	static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
				6056	{
				6057	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
				6058	case IORING_TIMEOUT_BOOTTIME:
				6059	return CLOCK_BOOTTIME;
				6060	case IORING_TIMEOUT_REALTIME:
				6061	return CLOCK_REALTIME;
				6062	default:
				6063	/* can't happen, vetted at prep time */
				6064	WARN_ON_ONCE(1);
				6065	fallthrough;
				6066	case 0:
				6067	return CLOCK_MONOTONIC;
				6068	}
				6069	}
				6070
				6071	static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
				6072	struct timespec64 *ts, enum hrtimer_mode mode)
				6073	__must_hold(&ctx->timeout_lock)
				6074	{
				6075	struct io_timeout_data *io;
				6076	struct io_kiocb *req;
				6077	bool found = false;
				6078
				6079	list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
				6080	found = user_data == req->user_data;
				6081	if (found)
				6082	break;
				6083	}
				6084	if (!found)
				6085	return -ENOENT;
				6086
				6087	io = req->async_data;
				6088	if (hrtimer_try_to_cancel(&io->timer) == -1)
				6089	return -EALREADY;
				6090	hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
				6091	io->timer.function = io_link_timeout_fn;
				6092	hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
				6093	return 0;
				6094	}
				6095
				6096	static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
				6097	struct timespec64 *ts, enum hrtimer_mode mode)
				6098	__must_hold(&ctx->timeout_lock)
				6099	{
				6100	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
				6101	struct io_timeout_data *data;
				6102
				6103	if (IS_ERR(req))
				6104	return PTR_ERR(req);
				6105
				6106	req->timeout.off = 0; /* noseq */
				6107	data = req->async_data;
				6108	list_add_tail(&req->timeout.list, &ctx->timeout_list);
				6109	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
				6110	data->timer.function = io_timeout_fn;
				6111	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
				6112	return 0;
				6113	}
				6114
				6115	static int io_timeout_remove_prep(struct io_kiocb *req,
				6116	const struct io_uring_sqe *sqe)
				6117	{
				6118	struct io_timeout_rem *tr = &req->timeout_rem;
				6119
				6120	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				6121	return -EINVAL;
				6122	if (unlikely(req->flags & (REQ_F_FIXED_FILE \| REQ_F_BUFFER_SELECT)))
				6123	return -EINVAL;
				6124	if (sqe->ioprio \|\| sqe->buf_index \|\| sqe->len \|\| sqe->splice_fd_in)
				6125	return -EINVAL;
				6126
				6127	tr->ltimeout = false;
				6128	tr->addr = READ_ONCE(sqe->addr);
				6129	tr->flags = READ_ONCE(sqe->timeout_flags);
				6130	if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
				6131	if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
				6132	return -EINVAL;
				6133	if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
				6134	tr->ltimeout = true;
				6135	if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK\|IORING_TIMEOUT_ABS))
				6136	return -EINVAL;
				6137	if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
				6138	return -EFAULT;
				6139	} else if (tr->flags) {
				6140	/* timeout removal doesn't support flags */
				6141	return -EINVAL;
				6142	}
				6143
				6144	return 0;
				6145	}
				6146
				6147	static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
				6148	{
				6149	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
				6150	: HRTIMER_MODE_REL;
				6151	}
				6152
				6153	/*
				6154	* Remove or update an existing timeout command
				6155	*/
				6156	static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
				6157	{
				6158	struct io_timeout_rem *tr = &req->timeout_rem;
				6159	struct io_ring_ctx *ctx = req->ctx;
				6160	int ret;
				6161
				6162	if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
				6163	spin_lock(&ctx->completion_lock);
				6164	spin_lock_irq(&ctx->timeout_lock);
				6165	ret = io_timeout_cancel(ctx, tr->addr);
				6166	spin_unlock_irq(&ctx->timeout_lock);
				6167	spin_unlock(&ctx->completion_lock);
				6168	} else {
				6169	enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
				6170
				6171	spin_lock_irq(&ctx->timeout_lock);
				6172	if (tr->ltimeout)
				6173	ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
				6174	else
				6175	ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
				6176	spin_unlock_irq(&ctx->timeout_lock);
				6177	}
				6178
				6179	if (ret < 0)
				6180	req_set_fail(req);
				6181	io_req_complete_post(req, ret, 0);
				6182	return 0;
				6183	}
				6184
				6185	static int io_timeout_prep(struct io_kiocb req, const struct io_uring_sqe sqe,
				6186	bool is_timeout_link)
				6187	{
				6188	struct io_timeout_data *data;
				6189	unsigned flags;
				6190	u32 off = READ_ONCE(sqe->off);
				6191
				6192	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				6193	return -EINVAL;
				6194	if (sqe->ioprio \|\| sqe->buf_index \|\| sqe->len != 1 \|\|
				6195	sqe->splice_fd_in)
				6196	return -EINVAL;
				6197	if (off && is_timeout_link)
				6198	return -EINVAL;
				6199	flags = READ_ONCE(sqe->timeout_flags);
				6200	if (flags & ~(IORING_TIMEOUT_ABS \| IORING_TIMEOUT_CLOCK_MASK))
				6201	return -EINVAL;
				6202	/* more than one clock specified is invalid, obviously */
				6203	if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
				6204	return -EINVAL;
				6205
				6206	INIT_LIST_HEAD(&req->timeout.list);
				6207	req->timeout.off = off;
				6208	if (unlikely(off && !req->ctx->off_timeout_used))
				6209	req->ctx->off_timeout_used = true;
				6210
				6211	if (!req->async_data && io_alloc_async_data(req))
				6212	return -ENOMEM;
				6213
				6214	data = req->async_data;
				6215	data->req = req;
				6216	data->flags = flags;
				6217
				6218	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
				6219	return -EFAULT;
				6220
				6221	INIT_LIST_HEAD(&req->timeout.list);
				6222	data->mode = io_translate_timeout_mode(flags);
				6223	hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
				6224
				6225	if (is_timeout_link) {
				6226	struct io_submit_link *link = &req->ctx->submit_state.link;
				6227
				6228	if (!link->head)
				6229	return -EINVAL;
				6230	if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
				6231	return -EINVAL;
				6232	req->timeout.head = link->last;
				6233	link->last->flags \|= REQ_F_ARM_LTIMEOUT;
				6234	}
				6235	return 0;
				6236	}
				6237
				6238	static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
				6239	{
				6240	struct io_ring_ctx *ctx = req->ctx;
				6241	struct io_timeout_data *data = req->async_data;
				6242	struct list_head *entry;
				6243	u32 tail, off = req->timeout.off;
				6244
				6245	spin_lock_irq(&ctx->timeout_lock);
				6246
				6247	/*
				6248	* sqe->off holds how many events that need to occur for this
				6249	* timeout event to be satisfied. If it isn't set, then this is
				6250	* a pure timeout request, sequence isn't used.
				6251	*/
				6252	if (io_is_timeout_noseq(req)) {
				6253	entry = ctx->timeout_list.prev;
				6254	goto add;
				6255	}
				6256
				6257	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
				6258	req->timeout.target_seq = tail + off;
				6259
				6260	/* Update the last seq here in case io_flush_timeouts() hasn't.
				6261	* This is safe because ->completion_lock is held, and submissions
				6262	* and completions are never mixed in the same ->completion_lock section.
				6263	*/
				6264	ctx->cq_last_tm_flush = tail;
				6265
				6266	/*
				6267	* Insertion sort, ensuring the first entry in the list is always
				6268	* the one we need first.
				6269	*/
				6270	list_for_each_prev(entry, &ctx->timeout_list) {
				6271	struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
				6272	timeout.list);
				6273
				6274	if (io_is_timeout_noseq(nxt))
				6275	continue;
				6276	/* nxt.seq is behind @tail, otherwise would've been completed */
				6277	if (off >= nxt->timeout.target_seq - tail)
				6278	break;
				6279	}
				6280	add:
				6281	list_add(&req->timeout.list, entry);
				6282	data->timer.function = io_timeout_fn;
				6283	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
				6284	spin_unlock_irq(&ctx->timeout_lock);
				6285	return 0;
				6286	}
				6287
				6288	struct io_cancel_data {
				6289	struct io_ring_ctx *ctx;
				6290	u64 user_data;
				6291	};
				6292
				6293	static bool io_cancel_cb(struct io_wq_work work, void data)
				6294	{
				6295	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
				6296	struct io_cancel_data *cd = data;
				6297
				6298	return req->ctx == cd->ctx && req->user_data == cd->user_data;
				6299	}
				6300
				6301	static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
				6302	struct io_ring_ctx *ctx)
				6303	{
				6304	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
				6305	enum io_wq_cancel cancel_ret;
				6306	int ret = 0;
				6307
				6308	if (!tctx \|\| !tctx->io_wq)
				6309	return -ENOENT;
				6310
				6311	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
				6312	switch (cancel_ret) {
				6313	case IO_WQ_CANCEL_OK:
				6314	ret = 0;
				6315	break;
				6316	case IO_WQ_CANCEL_RUNNING:
				6317	ret = -EALREADY;
				6318	break;
				6319	case IO_WQ_CANCEL_NOTFOUND:
				6320	ret = -ENOENT;
				6321	break;
				6322	}
				6323
				6324	return ret;
				6325	}
				6326
				6327	static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
				6328	{
				6329	struct io_ring_ctx *ctx = req->ctx;
				6330	int ret;
				6331
				6332	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
				6333
				6334	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
				6335	if (ret != -ENOENT)
				6336	return ret;
				6337
				6338	spin_lock(&ctx->completion_lock);
				6339	spin_lock_irq(&ctx->timeout_lock);
				6340	ret = io_timeout_cancel(ctx, sqe_addr);
				6341	spin_unlock_irq(&ctx->timeout_lock);
				6342	if (ret != -ENOENT)
				6343	goto out;
				6344	ret = io_poll_cancel(ctx, sqe_addr, false);
				6345	out:
				6346	spin_unlock(&ctx->completion_lock);
				6347	return ret;
				6348	}
				6349
				6350	static int io_async_cancel_prep(struct io_kiocb *req,
				6351	const struct io_uring_sqe *sqe)
				6352	{
				6353	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
				6354	return -EINVAL;
				6355	if (unlikely(req->flags & (REQ_F_FIXED_FILE \| REQ_F_BUFFER_SELECT)))
				6356	return -EINVAL;
				6357	if (sqe->ioprio \|\| sqe->off \|\| sqe->len \|\| sqe->cancel_flags \|\|
				6358	sqe->splice_fd_in)
				6359	return -EINVAL;
				6360
				6361	req->cancel.addr = READ_ONCE(sqe->addr);
				6362	return 0;
				6363	}
				6364
				6365	static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
				6366	{
				6367	struct io_ring_ctx *ctx = req->ctx;
				6368	u64 sqe_addr = req->cancel.addr;
				6369	struct io_tctx_node *node;
				6370	int ret;
				6371
				6372	ret = io_try_cancel_userdata(req, sqe_addr);
				6373	if (ret != -ENOENT)
				6374	goto done;
				6375
				6376	/* slow path, try all io-wq's */
				6377	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
				6378	ret = -ENOENT;
				6379	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
				6380	struct io_uring_task *tctx = node->task->io_uring;
				6381
				6382	ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
				6383	if (ret != -ENOENT)
				6384	break;
				6385	}
				6386	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
				6387	done:
				6388	if (ret < 0)
				6389	req_set_fail(req);
				6390	io_req_complete_post(req, ret, 0);
				6391	return 0;
				6392	}
				6393
				6394	static int io_rsrc_update_prep(struct io_kiocb *req,
				6395	const struct io_uring_sqe *sqe)
				6396	{
				6397	if (unlikely(req->flags & (REQ_F_FIXED_FILE \| REQ_F_BUFFER_SELECT)))
				6398	return -EINVAL;
				6399	if (sqe->ioprio \|\| sqe->rw_flags \|\| sqe->splice_fd_in)
				6400	return -EINVAL;
				6401
				6402	req->rsrc_update.offset = READ_ONCE(sqe->off);
				6403	req->rsrc_update.nr_args = READ_ONCE(sqe->len);
				6404	if (!req->rsrc_update.nr_args)
				6405	return -EINVAL;
				6406	req->rsrc_update.arg = READ_ONCE(sqe->addr);
				6407	return 0;
				6408	}
				6409
				6410	static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
				6411	{
				6412	struct io_ring_ctx *ctx = req->ctx;
				6413	struct io_uring_rsrc_update2 up;
				6414	int ret;
				6415
				6416	up.offset = req->rsrc_update.offset;
				6417	up.data = req->rsrc_update.arg;
				6418	up.nr = 0;
				6419	up.tags = 0;
				6420	up.resv = 0;
				6421	up.resv2 = 0;
				6422
				6423	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
				6424	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
				6425	&up, req->rsrc_update.nr_args);
				6426	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
				6427
				6428	if (ret < 0)
				6429	req_set_fail(req);
				6430	__io_req_complete(req, issue_flags, ret, 0);
				6431	return 0;
				6432	}
				6433
				6434	static int io_req_prep(struct io_kiocb req, const struct io_uring_sqe sqe)
				6435	{
				6436	switch (req->opcode) {
				6437	case IORING_OP_NOP:
				6438	return 0;
				6439	case IORING_OP_READV:
				6440	case IORING_OP_READ_FIXED:
				6441	case IORING_OP_READ:
				6442	return io_read_prep(req, sqe);
				6443	case IORING_OP_WRITEV:
				6444	case IORING_OP_WRITE_FIXED:
				6445	case IORING_OP_WRITE:
				6446	return io_write_prep(req, sqe);
				6447	case IORING_OP_POLL_ADD:
				6448	return io_poll_add_prep(req, sqe);
				6449	case IORING_OP_POLL_REMOVE:
				6450	return io_poll_update_prep(req, sqe);
				6451	case IORING_OP_FSYNC:
				6452	return io_fsync_prep(req, sqe);
				6453	case IORING_OP_SYNC_FILE_RANGE:
				6454	return io_sfr_prep(req, sqe);
				6455	case IORING_OP_SENDMSG:
				6456	case IORING_OP_SEND:
				6457	return io_sendmsg_prep(req, sqe);
				6458	case IORING_OP_RECVMSG:
				6459	case IORING_OP_RECV:
				6460	return io_recvmsg_prep(req, sqe);
				6461	case IORING_OP_CONNECT:
				6462	return io_connect_prep(req, sqe);
				6463	case IORING_OP_TIMEOUT:
				6464	return io_timeout_prep(req, sqe, false);
				6465	case IORING_OP_TIMEOUT_REMOVE:
				6466	return io_timeout_remove_prep(req, sqe);
				6467	case IORING_OP_ASYNC_CANCEL:
				6468	return io_async_cancel_prep(req, sqe);
				6469	case IORING_OP_LINK_TIMEOUT:
				6470	return io_timeout_prep(req, sqe, true);
				6471	case IORING_OP_ACCEPT:
				6472	return io_accept_prep(req, sqe);
				6473	case IORING_OP_FALLOCATE:
				6474	return io_fallocate_prep(req, sqe);
				6475	case IORING_OP_OPENAT:
				6476	return io_openat_prep(req, sqe);
				6477	case IORING_OP_CLOSE:
				6478	return io_close_prep(req, sqe);
				6479	case IORING_OP_FILES_UPDATE:
				6480	return io_rsrc_update_prep(req, sqe);
				6481	case IORING_OP_STATX:
				6482	return io_statx_prep(req, sqe);
				6483	case IORING_OP_FADVISE:
				6484	return io_fadvise_prep(req, sqe);
				6485	case IORING_OP_MADVISE:
				6486	return io_madvise_prep(req, sqe);
				6487	case IORING_OP_OPENAT2:
				6488	return io_openat2_prep(req, sqe);
				6489	case IORING_OP_EPOLL_CTL:
				6490	return io_epoll_ctl_prep(req, sqe);
				6491	case IORING_OP_SPLICE:
				6492	return io_splice_prep(req, sqe);
				6493	case IORING_OP_PROVIDE_BUFFERS:
				6494	return io_provide_buffers_prep(req, sqe);
				6495	case IORING_OP_REMOVE_BUFFERS:
				6496	return io_remove_buffers_prep(req, sqe);
				6497	case IORING_OP_TEE:
				6498	return io_tee_prep(req, sqe);
				6499	case IORING_OP_SHUTDOWN:
				6500	return io_shutdown_prep(req, sqe);
				6501	case IORING_OP_RENAMEAT:
				6502	return io_renameat_prep(req, sqe);
				6503	case IORING_OP_UNLINKAT:
				6504	return io_unlinkat_prep(req, sqe);
				6505	}
				6506
				6507	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
				6508	req->opcode);
				6509	return -EINVAL;
				6510	}
				6511
				6512	static int io_req_prep_async(struct io_kiocb *req)
				6513	{
				6514	if (!io_op_defs[req->opcode].needs_async_setup)
				6515	return 0;
				6516	if (WARN_ON_ONCE(req->async_data))
				6517	return -EFAULT;
				6518	if (io_alloc_async_data(req))
				6519	return -EAGAIN;
				6520
				6521	switch (req->opcode) {
				6522	case IORING_OP_READV:
				6523	return io_rw_prep_async(req, READ);
				6524	case IORING_OP_WRITEV:
				6525	return io_rw_prep_async(req, WRITE);
				6526	case IORING_OP_SENDMSG:
				6527	return io_sendmsg_prep_async(req);
				6528	case IORING_OP_RECVMSG:
				6529	return io_recvmsg_prep_async(req);
				6530	case IORING_OP_CONNECT:
				6531	return io_connect_prep_async(req);
				6532	}
				6533	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
				6534	req->opcode);
				6535	return -EFAULT;
				6536	}
				6537
				6538	static u32 io_get_sequence(struct io_kiocb *req)
				6539	{
				6540	u32 seq = req->ctx->cached_sq_head;
				6541
				6542	/* need original cached_sq_head, but it was increased for each req */
				6543	io_for_each_link(req, req)
				6544	seq--;
				6545	return seq;
				6546	}
				6547
				6548	static bool io_drain_req(struct io_kiocb *req)
				6549	{
				6550	struct io_kiocb *pos;
				6551	struct io_ring_ctx *ctx = req->ctx;
				6552	struct io_defer_entry *de;
				6553	int ret;
				6554	u32 seq;
				6555
				6556	if (req->flags & REQ_F_FAIL) {
				6557	io_req_complete_fail_submit(req);
				6558	return true;
				6559	}
				6560
				6561	/*
				6562	* If we need to drain a request in the middle of a link, drain the
				6563	* head request and the next request/link after the current link.
				6564	* Considering sequential execution of links, IOSQE_IO_DRAIN will be
				6565	* maintained for every request of our link.
				6566	*/
				6567	if (ctx->drain_next) {
				6568	req->flags \|= REQ_F_IO_DRAIN;
				6569	ctx->drain_next = false;
				6570	}
				6571	/* not interested in head, start from the first linked */
				6572	io_for_each_link(pos, req->link) {
				6573	if (pos->flags & REQ_F_IO_DRAIN) {
				6574	ctx->drain_next = true;
				6575	req->flags \|= REQ_F_IO_DRAIN;
				6576	break;
				6577	}
				6578	}
				6579
				6580	/* Still need defer if there is pending req in defer list. */
				6581	spin_lock(&ctx->completion_lock);
				6582	if (likely(list_empty_careful(&ctx->defer_list) &&
				6583	!(req->flags & REQ_F_IO_DRAIN))) {
				6584	spin_unlock(&ctx->completion_lock);
				6585	ctx->drain_active = false;
				6586	return false;
				6587	}
				6588	spin_unlock(&ctx->completion_lock);
				6589
				6590	seq = io_get_sequence(req);
				6591	/* Still a chance to pass the sequence check */
				6592	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
				6593	return false;
				6594
				6595	ret = io_req_prep_async(req);
				6596	if (ret)
				6597	goto fail;
				6598	io_prep_async_link(req);
				6599	de = kmalloc(sizeof(*de), GFP_KERNEL);
				6600	if (!de) {
				6601	ret = -ENOMEM;
				6602	fail:
				6603	io_req_complete_failed(req, ret);
				6604	return true;
				6605	}
				6606
				6607	spin_lock(&ctx->completion_lock);
				6608	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
				6609	spin_unlock(&ctx->completion_lock);
				6610	kfree(de);
				6611	io_queue_async_work(req, NULL);
				6612	return true;
				6613	}
				6614
				6615	trace_io_uring_defer(ctx, req, req->user_data);
				6616	de->req = req;
				6617	de->seq = seq;
				6618	list_add_tail(&de->list, &ctx->defer_list);
				6619	spin_unlock(&ctx->completion_lock);
				6620	return true;
				6621	}
				6622
				6623	static void io_clean_op(struct io_kiocb *req)
				6624	{
				6625	if (req->flags & REQ_F_BUFFER_SELECTED) {
				6626	switch (req->opcode) {
				6627	case IORING_OP_READV:
				6628	case IORING_OP_READ_FIXED:
				6629	case IORING_OP_READ:
				6630	kfree((void *)(unsigned long)req->rw.addr);
				6631	break;
				6632	case IORING_OP_RECVMSG:
				6633	case IORING_OP_RECV:
				6634	kfree(req->sr_msg.kbuf);
				6635	break;
				6636	}
				6637	}
				6638
				6639	if (req->flags & REQ_F_NEED_CLEANUP) {
				6640	switch (req->opcode) {
				6641	case IORING_OP_READV:
				6642	case IORING_OP_READ_FIXED:
				6643	case IORING_OP_READ:
				6644	case IORING_OP_WRITEV:
				6645	case IORING_OP_WRITE_FIXED:
				6646	case IORING_OP_WRITE: {
				6647	struct io_async_rw *io = req->async_data;
				6648
				6649	kfree(io->free_iovec);
				6650	break;
				6651	}
				6652	case IORING_OP_RECVMSG:
				6653	case IORING_OP_SENDMSG: {
				6654	struct io_async_msghdr *io = req->async_data;
				6655
				6656	kfree(io->free_iov);
				6657	break;
				6658	}
				6659	case IORING_OP_OPENAT:
				6660	case IORING_OP_OPENAT2:
				6661	if (req->open.filename)
				6662	putname(req->open.filename);
				6663	break;
				6664	case IORING_OP_RENAMEAT:
				6665	putname(req->rename.oldpath);
				6666	putname(req->rename.newpath);
				6667	break;
				6668	case IORING_OP_UNLINKAT:
				6669	putname(req->unlink.filename);
				6670	break;
				6671	}
				6672	}
				6673	if ((req->flags & REQ_F_POLLED) && req->apoll) {
				6674	kfree(req->apoll->double_poll);
				6675	kfree(req->apoll);
				6676	req->apoll = NULL;
				6677	}
				6678	if (req->flags & REQ_F_INFLIGHT) {
				6679	struct io_uring_task *tctx = req->task->io_uring;
				6680
				6681	atomic_dec(&tctx->inflight_tracked);
				6682	}
				6683	if (req->flags & REQ_F_CREDS)
				6684	put_cred(req->creds);
				6685
				6686	req->flags &= ~IO_REQ_CLEAN_FLAGS;
				6687	}
				6688
				6689	static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
				6690	{
				6691	struct io_ring_ctx *ctx = req->ctx;
				6692	const struct cred *creds = NULL;
				6693	int ret;
				6694
				6695	if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
				6696	creds = override_creds(req->creds);
				6697
				6698	switch (req->opcode) {
				6699	case IORING_OP_NOP:
				6700	ret = io_nop(req, issue_flags);
				6701	break;
				6702	case IORING_OP_READV:
				6703	case IORING_OP_READ_FIXED:
				6704	case IORING_OP_READ:
				6705	ret = io_read(req, issue_flags);
				6706	break;
				6707	case IORING_OP_WRITEV:
				6708	case IORING_OP_WRITE_FIXED:
				6709	case IORING_OP_WRITE:
				6710	ret = io_write(req, issue_flags);
				6711	break;
				6712	case IORING_OP_FSYNC:
				6713	ret = io_fsync(req, issue_flags);
				6714	break;
				6715	case IORING_OP_POLL_ADD:
				6716	ret = io_poll_add(req, issue_flags);
				6717	break;
				6718	case IORING_OP_POLL_REMOVE:
				6719	ret = io_poll_update(req, issue_flags);
				6720	break;
				6721	case IORING_OP_SYNC_FILE_RANGE:
				6722	ret = io_sync_file_range(req, issue_flags);
				6723	break;
				6724	case IORING_OP_SENDMSG:
				6725	ret = io_sendmsg(req, issue_flags);
				6726	break;
				6727	case IORING_OP_SEND:
				6728	ret = io_send(req, issue_flags);
				6729	break;
				6730	case IORING_OP_RECVMSG:
				6731	ret = io_recvmsg(req, issue_flags);
				6732	break;
				6733	case IORING_OP_RECV:
				6734	ret = io_recv(req, issue_flags);
				6735	break;
				6736	case IORING_OP_TIMEOUT:
				6737	ret = io_timeout(req, issue_flags);
				6738	break;
				6739	case IORING_OP_TIMEOUT_REMOVE:
				6740	ret = io_timeout_remove(req, issue_flags);
				6741	break;
				6742	case IORING_OP_ACCEPT:
				6743	ret = io_accept(req, issue_flags);
				6744	break;
				6745	case IORING_OP_CONNECT:
				6746	ret = io_connect(req, issue_flags);
				6747	break;
				6748	case IORING_OP_ASYNC_CANCEL:
				6749	ret = io_async_cancel(req, issue_flags);
				6750	break;
				6751	case IORING_OP_FALLOCATE:
				6752	ret = io_fallocate(req, issue_flags);
				6753	break;
				6754	case IORING_OP_OPENAT:
				6755	ret = io_openat(req, issue_flags);
				6756	break;
				6757	case IORING_OP_CLOSE:
				6758	ret = io_close(req, issue_flags);
				6759	break;
				6760	case IORING_OP_FILES_UPDATE:
				6761	ret = io_files_update(req, issue_flags);
				6762	break;
				6763	case IORING_OP_STATX:
				6764	ret = io_statx(req, issue_flags);
				6765	break;
				6766	case IORING_OP_FADVISE:
				6767	ret = io_fadvise(req, issue_flags);
				6768	break;
				6769	case IORING_OP_MADVISE:
				6770	ret = io_madvise(req, issue_flags);
				6771	break;
				6772	case IORING_OP_OPENAT2:
				6773	ret = io_openat2(req, issue_flags);
				6774	break;
				6775	case IORING_OP_EPOLL_CTL:
				6776	ret = io_epoll_ctl(req, issue_flags);
				6777	break;
				6778	case IORING_OP_SPLICE:
				6779	ret = io_splice(req, issue_flags);
				6780	break;
				6781	case IORING_OP_PROVIDE_BUFFERS:
				6782	ret = io_provide_buffers(req, issue_flags);
				6783	break;
				6784	case IORING_OP_REMOVE_BUFFERS:
				6785	ret = io_remove_buffers(req, issue_flags);
				6786	break;
				6787	case IORING_OP_TEE:
				6788	ret = io_tee(req, issue_flags);
				6789	break;
				6790	case IORING_OP_SHUTDOWN:
				6791	ret = io_shutdown(req, issue_flags);
				6792	break;
				6793	case IORING_OP_RENAMEAT:
				6794	ret = io_renameat(req, issue_flags);
				6795	break;
				6796	case IORING_OP_UNLINKAT:
				6797	ret = io_unlinkat(req, issue_flags);
				6798	break;
				6799	default:
				6800	ret = -EINVAL;
				6801	break;
				6802	}
				6803
				6804	if (creds)
				6805	revert_creds(creds);
				6806	if (ret)
				6807	return ret;
				6808	/* If the op doesn't have a file, we're not polling for it */
				6809	if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
				6810	io_iopoll_req_issued(req);
				6811
				6812	return 0;
				6813	}
				6814
				6815	static struct io_wq_work io_wq_free_work(struct io_wq_work work)
				6816	{
				6817	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
				6818
				6819	req = io_put_req_find_next(req);
				6820	return req ? &req->work : NULL;
				6821	}
				6822
				6823	static void io_wq_submit_work(struct io_wq_work *work)
				6824	{
				6825	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
				6826	struct io_kiocb *timeout;
				6827	int ret = 0;
				6828
				6829	/* one will be dropped by ->io_free_work() after returning to io-wq */
				6830	if (!(req->flags & REQ_F_REFCOUNT))
				6831	__io_req_set_refcount(req, 2);
				6832	else
				6833	req_ref_get(req);
				6834
				6835	timeout = io_prep_linked_timeout(req);
				6836	if (timeout)
				6837	io_queue_linked_timeout(timeout);
				6838
				6839	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
				6840	if (work->flags & IO_WQ_WORK_CANCEL)
				6841	ret = -ECANCELED;
				6842
				6843	if (!ret) {
				6844	do {
				6845	ret = io_issue_sqe(req, 0);
				6846	/*
				6847	* We can get EAGAIN for polled IO even though we're
				6848	* forcing a sync submission from here, since we can't
				6849	* wait for request slots on the block side.
				6850	*/
				6851	if (ret != -EAGAIN \|\| !(req->ctx->flags & IORING_SETUP_IOPOLL))
				6852	break;
				6853	cond_resched();
				6854	} while (1);
				6855	}
				6856
				6857	/* avoid locking problems by failing it from a clean context */
				6858	if (ret)
				6859	io_req_task_queue_fail(req, ret);
				6860	}
				6861
				6862	static inline struct io_fixed_file io_fixed_file_slot(struct io_file_table table,
				6863	unsigned i)
				6864	{
				6865	return &table->files[i];
				6866	}
				6867
				6868	static inline struct file io_file_from_index(struct io_ring_ctx ctx,
				6869	int index)
				6870	{
				6871	struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
				6872
				6873	return (struct file *) (slot->file_ptr & FFS_MASK);
				6874	}
				6875
				6876	static void io_fixed_file_set(struct io_fixed_file file_slot, struct file file)
				6877	{
				6878	unsigned long file_ptr = (unsigned long) file;
				6879
				6880	if (__io_file_supports_nowait(file, READ))
				6881	file_ptr \|= FFS_ASYNC_READ;
				6882	if (__io_file_supports_nowait(file, WRITE))
				6883	file_ptr \|= FFS_ASYNC_WRITE;
				6884	if (S_ISREG(file_inode(file)->i_mode))
				6885	file_ptr \|= FFS_ISREG;
				6886	file_slot->file_ptr = file_ptr;
				6887	}
				6888
				6889	static inline struct file io_file_get_fixed(struct io_ring_ctx ctx,
Bing-Jhong Billy Jheng	be56ff5	2023-03-02 21:00:06 +0800	[diff] [blame]	6890	struct io_kiocb *req, int fd,
				6891	unsigned int issue_flags)
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	6892	{
Bing-Jhong Billy Jheng	be56ff5	2023-03-02 21:00:06 +0800	[diff] [blame]	6893	struct file *file = NULL;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	6894	unsigned long file_ptr;
				6895
Bing-Jhong Billy Jheng	be56ff5	2023-03-02 21:00:06 +0800	[diff] [blame]	6896	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
				6897
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	6898	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
Bing-Jhong Billy Jheng	be56ff5	2023-03-02 21:00:06 +0800	[diff] [blame]	6899	goto out;
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	6900	fd = array_index_nospec(fd, ctx->nr_user_files);
				6901	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
				6902	file = (struct file *) (file_ptr & FFS_MASK);
				6903	file_ptr &= ~FFS_MASK;
				6904	/* mask in overlapping REQ_F and FFS bits */
				6905	req->flags \|= (file_ptr << REQ_F_NOWAIT_READ_BIT);
				6906	io_req_set_rsrc_node(req);
Bing-Jhong Billy Jheng	be56ff5	2023-03-02 21:00:06 +0800	[diff] [blame]	6907	out:
				6908	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	6909	return file;
				6910	}
				6911
				6912	static struct file io_file_get_normal(struct io_ring_ctx ctx,
				6913	struct io_kiocb *req, int fd)
				6914	{
				6915	struct file *file = fget(fd);
				6916
				6917	trace_io_uring_file_get(ctx, fd);
				6918
				6919	/* we don't allow fixed io_uring files */
				6920	if (file && unlikely(file->f_op == &io_uring_fops))
				6921	io_req_track_inflight(req);
				6922	return file;
				6923	}
				6924
				6925	static inline struct file io_file_get(struct io_ring_ctx ctx,
Bing-Jhong Billy Jheng	be56ff5	2023-03-02 21:00:06 +0800	[diff] [blame]	6926	struct io_kiocb *req, int fd, bool fixed,
				6927	unsigned int issue_flags)
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	6928	{
				6929	if (fixed)
Bing-Jhong Billy Jheng	be56ff5	2023-03-02 21:00:06 +0800	[diff] [blame]	6930	return io_file_get_fixed(ctx, req, fd, issue_flags);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	6931	else
				6932	return io_file_get_normal(ctx, req, fd);
				6933	}
				6934
				6935	static void io_req_task_link_timeout(struct io_kiocb req, bool locked)
				6936	{
				6937	struct io_kiocb *prev = req->timeout.prev;
				6938	int ret = -ENOENT;
				6939
				6940	if (prev) {
				6941	if (!(req->task->flags & PF_EXITING))
				6942	ret = io_try_cancel_userdata(req, prev->user_data);
				6943	io_req_complete_post(req, ret ?: -ETIME, 0);
				6944	io_put_req(prev);
				6945	} else {
				6946	io_req_complete_post(req, -ETIME, 0);
				6947	}
				6948	}
				6949
				6950	static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
				6951	{
				6952	struct io_timeout_data *data = container_of(timer,
				6953	struct io_timeout_data, timer);
				6954	struct io_kiocb prev, req = data->req;
				6955	struct io_ring_ctx *ctx = req->ctx;
				6956	unsigned long flags;
				6957
				6958	spin_lock_irqsave(&ctx->timeout_lock, flags);
				6959	prev = req->timeout.head;
				6960	req->timeout.head = NULL;
				6961
				6962	/*
				6963	* We don't expect the list to be empty, that will only happen if we
				6964	* race with the completion of the linked work.
				6965	*/
				6966	if (prev) {
				6967	io_remove_next_linked(prev);
				6968	if (!req_ref_inc_not_zero(prev))
				6969	prev = NULL;
				6970	}
				6971	list_del(&req->timeout.list);
				6972	req->timeout.prev = prev;
				6973	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
				6974
				6975	req->io_task_work.func = io_req_task_link_timeout;
				6976	io_req_task_work_add(req);
				6977	return HRTIMER_NORESTART;
				6978	}
				6979
				6980	static void io_queue_linked_timeout(struct io_kiocb *req)
				6981	{
				6982	struct io_ring_ctx *ctx = req->ctx;
				6983
				6984	spin_lock_irq(&ctx->timeout_lock);
				6985	/*
				6986	* If the back reference is NULL, then our linked request finished
				6987	* before we got a chance to setup the timer
				6988	*/
				6989	if (req->timeout.head) {
				6990	struct io_timeout_data *data = req->async_data;
				6991
				6992	data->timer.function = io_link_timeout_fn;
				6993	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
				6994	data->mode);
				6995	list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
				6996	}
				6997	spin_unlock_irq(&ctx->timeout_lock);
				6998	/* drop submission reference */
				6999	io_put_req(req);
				7000	}
				7001
				7002	static void __io_queue_sqe(struct io_kiocb *req)
				7003	__must_hold(&req->ctx->uring_lock)
				7004	{
				7005	struct io_kiocb *linked_timeout;
				7006	int ret;
				7007
				7008	issue_sqe:
				7009	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK\|IO_URING_F_COMPLETE_DEFER);
				7010
				7011	/*
				7012	* We async punt it if the file wasn't marked NOWAIT, or if the file
				7013	* doesn't support non-blocking read/write attempts
				7014	*/
				7015	if (likely(!ret)) {
				7016	if (req->flags & REQ_F_COMPLETE_INLINE) {
				7017	struct io_ring_ctx *ctx = req->ctx;
				7018	struct io_submit_state *state = &ctx->submit_state;
				7019
				7020	state->compl_reqs[state->compl_nr++] = req;
				7021	if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
				7022	io_submit_flush_completions(ctx);
				7023	return;
				7024	}
				7025
				7026	linked_timeout = io_prep_linked_timeout(req);
				7027	if (linked_timeout)
				7028	io_queue_linked_timeout(linked_timeout);
				7029	} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
				7030	linked_timeout = io_prep_linked_timeout(req);
				7031
				7032	switch (io_arm_poll_handler(req)) {
				7033	case IO_APOLL_READY:
				7034	if (linked_timeout)
				7035	io_queue_linked_timeout(linked_timeout);
				7036	goto issue_sqe;
				7037	case IO_APOLL_ABORTED:
				7038	/*
				7039	* Queued up for async execution, worker will release
				7040	* submit reference when the iocb is actually submitted.
				7041	*/
				7042	io_queue_async_work(req, NULL);
				7043	break;
				7044	}
				7045
				7046	if (linked_timeout)
				7047	io_queue_linked_timeout(linked_timeout);
				7048	} else {
				7049	io_req_complete_failed(req, ret);
				7050	}
				7051	}
				7052
				7053	static inline void io_queue_sqe(struct io_kiocb *req)
				7054	__must_hold(&req->ctx->uring_lock)
				7055	{
				7056	if (unlikely(req->ctx->drain_active) && io_drain_req(req))
				7057	return;
				7058
				7059	if (likely(!(req->flags & (REQ_F_FORCE_ASYNC \| REQ_F_FAIL)))) {
				7060	__io_queue_sqe(req);
				7061	} else if (req->flags & REQ_F_FAIL) {
				7062	io_req_complete_fail_submit(req);
				7063	} else {
				7064	int ret = io_req_prep_async(req);
				7065
				7066	if (unlikely(ret))
				7067	io_req_complete_failed(req, ret);
				7068	else
				7069	io_queue_async_work(req, NULL);
				7070	}
				7071	}
				7072
				7073	/*
				7074	* Check SQE restrictions (opcode and flags).
				7075	*
				7076	* Returns 'true' if SQE is allowed, 'false' otherwise.
				7077	*/
				7078	static inline bool io_check_restriction(struct io_ring_ctx *ctx,
				7079	struct io_kiocb *req,
				7080	unsigned int sqe_flags)
				7081	{
				7082	if (likely(!ctx->restricted))
				7083	return true;
				7084
				7085	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
				7086	return false;
				7087
				7088	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
				7089	ctx->restrictions.sqe_flags_required)
				7090	return false;
				7091
				7092	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed \|
				7093	ctx->restrictions.sqe_flags_required))
				7094	return false;
				7095
				7096	return true;
				7097	}
				7098
				7099	static int io_init_req(struct io_ring_ctx ctx, struct io_kiocb req,
				7100	const struct io_uring_sqe *sqe)
				7101	__must_hold(&ctx->uring_lock)
				7102	{
				7103	struct io_submit_state *state;
				7104	unsigned int sqe_flags;
				7105	int personality, ret = 0;
				7106
				7107	/* req is partially pre-initialised, see io_preinit_req() */
				7108	req->opcode = READ_ONCE(sqe->opcode);
				7109	/* same numerical values with corresponding REQ_F_, safe to copy /
				7110	req->flags = sqe_flags = READ_ONCE(sqe->flags);
				7111	req->user_data = READ_ONCE(sqe->user_data);
				7112	req->file = NULL;
				7113	req->fixed_rsrc_refs = NULL;
				7114	req->task = current;
				7115
				7116	/* enforce forwards compatibility on users */
				7117	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
				7118	return -EINVAL;
				7119	if (unlikely(req->opcode >= IORING_OP_LAST))
				7120	return -EINVAL;
				7121	if (!io_check_restriction(ctx, req, sqe_flags))
				7122	return -EACCES;
				7123
				7124	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
				7125	!io_op_defs[req->opcode].buffer_select)
				7126	return -EOPNOTSUPP;
				7127	if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
				7128	ctx->drain_active = true;
				7129
				7130	personality = READ_ONCE(sqe->personality);
				7131	if (personality) {
				7132	req->creds = xa_load(&ctx->personalities, personality);
				7133	if (!req->creds)
				7134	return -EINVAL;
				7135	get_cred(req->creds);
				7136	req->flags \|= REQ_F_CREDS;
				7137	}
				7138	state = &ctx->submit_state;
				7139
				7140	/*
				7141	* Plug now if we have more than 1 IO left after this, and the target
				7142	* is potentially a read/write to block based storage.
				7143	*/
				7144	if (!state->plug_started && state->ios_left > 1 &&
				7145	io_op_defs[req->opcode].plug) {
				7146	blk_start_plug(&state->plug);
				7147	state->plug_started = true;
				7148	}
				7149
				7150	if (io_op_defs[req->opcode].needs_file) {
				7151	req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
Jens Axboe	14f61184	2023-03-03 06:49:57 -0700	[diff] [blame]	7152	(sqe_flags & IOSQE_FIXED_FILE),
				7153	IO_URING_F_NONBLOCK);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	7154	if (unlikely(!req->file))
				7155	ret = -EBADF;
				7156	}
				7157
				7158	state->ios_left--;
				7159	return ret;
				7160	}
				7161
				7162	static int io_submit_sqe(struct io_ring_ctx ctx, struct io_kiocb req,
				7163	const struct io_uring_sqe *sqe)
				7164	__must_hold(&ctx->uring_lock)
				7165	{
				7166	struct io_submit_link *link = &ctx->submit_state.link;
				7167	int ret;
				7168
				7169	ret = io_init_req(ctx, req, sqe);
				7170	if (unlikely(ret)) {
				7171	fail_req:
				7172	/* fail even hard links since we don't submit */
				7173	if (link->head) {
				7174	/*
				7175	* we can judge a link req is failed or cancelled by if
				7176	* REQ_F_FAIL is set, but the head is an exception since
				7177	* it may be set REQ_F_FAIL because of other req's failure
				7178	* so let's leverage req->result to distinguish if a head
				7179	* is set REQ_F_FAIL because of its failure or other req's
				7180	* failure so that we can set the correct ret code for it.
				7181	* init result here to avoid affecting the normal path.
				7182	*/
				7183	if (!(link->head->flags & REQ_F_FAIL))
				7184	req_fail_link_node(link->head, -ECANCELED);
				7185	} else if (!(req->flags & (REQ_F_LINK \| REQ_F_HARDLINK))) {
				7186	/*
				7187	* the current req is a normal req, we should return
				7188	* error and thus break the submittion loop.
				7189	*/
				7190	io_req_complete_failed(req, ret);
				7191	return ret;
				7192	}
				7193	req_fail_link_node(req, ret);
				7194	} else {
				7195	ret = io_req_prep(req, sqe);
				7196	if (unlikely(ret))
				7197	goto fail_req;
				7198	}
				7199
				7200	/* don't need @sqe from now on */
				7201	trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
				7202	req->flags, true,
				7203	ctx->flags & IORING_SETUP_SQPOLL);
				7204
				7205	/*
				7206	* If we already have a head request, queue this one for async
				7207	* submittal once the head completes. If we don't have a head but
				7208	* IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
				7209	* submitted sync once the chain is complete. If none of those
				7210	* conditions are true (normal request), then just queue it.
				7211	*/
				7212	if (link->head) {
				7213	struct io_kiocb *head = link->head;
				7214
				7215	if (!(req->flags & REQ_F_FAIL)) {
				7216	ret = io_req_prep_async(req);
				7217	if (unlikely(ret)) {
				7218	req_fail_link_node(req, ret);
				7219	if (!(head->flags & REQ_F_FAIL))
				7220	req_fail_link_node(head, -ECANCELED);
				7221	}
				7222	}
				7223	trace_io_uring_link(ctx, req, head);
				7224	link->last->link = req;
				7225	link->last = req;
				7226
				7227	/* last request of a link, enqueue the link */
				7228	if (!(req->flags & (REQ_F_LINK \| REQ_F_HARDLINK))) {
				7229	link->head = NULL;
				7230	io_queue_sqe(head);
				7231	}
				7232	} else {
				7233	if (req->flags & (REQ_F_LINK \| REQ_F_HARDLINK)) {
				7234	link->head = req;
				7235	link->last = req;
				7236	} else {
				7237	io_queue_sqe(req);
				7238	}
				7239	}
				7240
				7241	return 0;
				7242	}
				7243
				7244	/*
				7245	* Batched submission is done, ensure local IO is flushed out.
				7246	*/
				7247	static void io_submit_state_end(struct io_submit_state *state,
				7248	struct io_ring_ctx *ctx)
				7249	{
				7250	if (state->link.head)
				7251	io_queue_sqe(state->link.head);
				7252	if (state->compl_nr)
				7253	io_submit_flush_completions(ctx);
				7254	if (state->plug_started)
				7255	blk_finish_plug(&state->plug);
				7256	}
				7257
				7258	/*
				7259	* Start submission side cache.
				7260	*/
				7261	static void io_submit_state_start(struct io_submit_state *state,
				7262	unsigned int max_ios)
				7263	{
				7264	state->plug_started = false;
				7265	state->ios_left = max_ios;
				7266	/* set only head, no need to init link_last in advance */
				7267	state->link.head = NULL;
				7268	}
				7269
				7270	static void io_commit_sqring(struct io_ring_ctx *ctx)
				7271	{
				7272	struct io_rings *rings = ctx->rings;
				7273
				7274	/*
				7275	* Ensure any loads from the SQEs are done at this point,
				7276	* since once we write the new head, the application could
				7277	* write new data to them.
				7278	*/
				7279	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
				7280	}
				7281
				7282	/*
				7283	* Fetch an sqe, if one is available. Note this returns a pointer to memory
				7284	* that is mapped by userspace. This means that care needs to be taken to
				7285	* ensure that reads are stable, as we cannot rely on userspace always
				7286	* being a good citizen. If members of the sqe are validated and then later
				7287	* used, it's important that those reads are done through READ_ONCE() to
				7288	* prevent a re-load down the line.
				7289	*/
				7290	static const struct io_uring_sqe io_get_sqe(struct io_ring_ctx ctx)
				7291	{
				7292	unsigned head, mask = ctx->sq_entries - 1;
				7293	unsigned sq_idx = ctx->cached_sq_head++ & mask;
				7294
				7295	/*
				7296	* The cached sq head (or cq tail) serves two purposes:
				7297	*
				7298	* 1) allows us to batch the cost of updating the user visible
				7299	* head updates.
				7300	* 2) allows the kernel side to track the head on its own, even
				7301	* though the application is the one updating it.
				7302	*/
				7303	head = READ_ONCE(ctx->sq_array[sq_idx]);
				7304	if (likely(head < ctx->sq_entries))
				7305	return &ctx->sq_sqes[head];
				7306
				7307	/* drop invalid entries */
				7308	ctx->cq_extra--;
				7309	WRITE_ONCE(ctx->rings->sq_dropped,
				7310	READ_ONCE(ctx->rings->sq_dropped) + 1);
				7311	return NULL;
				7312	}
				7313
				7314	static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
				7315	__must_hold(&ctx->uring_lock)
				7316	{
				7317	int submitted = 0;
				7318
				7319	/* make sure SQ entry isn't read before tail */
				7320	nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
				7321	if (!percpu_ref_tryget_many(&ctx->refs, nr))
				7322	return -EAGAIN;
				7323	io_get_task_refs(nr);
				7324
				7325	io_submit_state_start(&ctx->submit_state, nr);
				7326	while (submitted < nr) {
				7327	const struct io_uring_sqe *sqe;
				7328	struct io_kiocb *req;
				7329
				7330	req = io_alloc_req(ctx);
				7331	if (unlikely(!req)) {
				7332	if (!submitted)
				7333	submitted = -EAGAIN;
				7334	break;
				7335	}
				7336	sqe = io_get_sqe(ctx);
				7337	if (unlikely(!sqe)) {
				7338	list_add(&req->inflight_entry, &ctx->submit_state.free_list);
				7339	break;
				7340	}
				7341	/* will complete beyond this point, count as submitted */
				7342	submitted++;
				7343	if (io_submit_sqe(ctx, req, sqe))
				7344	break;
				7345	}
				7346
				7347	if (unlikely(submitted != nr)) {
				7348	int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
				7349	int unused = nr - ref_used;
				7350
				7351	current->io_uring->cached_refs += unused;
				7352	percpu_ref_put_many(&ctx->refs, unused);
				7353	}
				7354
				7355	io_submit_state_end(&ctx->submit_state, ctx);
				7356	/* Commit SQ ring head once we've consumed and submitted all SQEs */
				7357	io_commit_sqring(ctx);
				7358
				7359	return submitted;
				7360	}
				7361
				7362	static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
				7363	{
				7364	return READ_ONCE(sqd->state);
				7365	}
				7366
				7367	static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
				7368	{
				7369	/* Tell userspace we may need a wakeup call */
				7370	spin_lock(&ctx->completion_lock);
				7371	WRITE_ONCE(ctx->rings->sq_flags,
				7372	ctx->rings->sq_flags \| IORING_SQ_NEED_WAKEUP);
				7373	spin_unlock(&ctx->completion_lock);
				7374	}
				7375
				7376	static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
				7377	{
				7378	spin_lock(&ctx->completion_lock);
				7379	WRITE_ONCE(ctx->rings->sq_flags,
				7380	ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
				7381	spin_unlock(&ctx->completion_lock);
				7382	}
				7383
				7384	static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
				7385	{
				7386	unsigned int to_submit;
				7387	int ret = 0;
				7388
				7389	to_submit = io_sqring_entries(ctx);
				7390	/* if we're handling multiple rings, cap submit size for fairness */
				7391	if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
				7392	to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
				7393
				7394	if (!list_empty(&ctx->iopoll_list) \|\| to_submit) {
				7395	unsigned nr_events = 0;
				7396	const struct cred *creds = NULL;
				7397
				7398	if (ctx->sq_creds != current_cred())
				7399	creds = override_creds(ctx->sq_creds);
				7400
				7401	mutex_lock(&ctx->uring_lock);
				7402	if (!list_empty(&ctx->iopoll_list))
				7403	io_do_iopoll(ctx, &nr_events, 0);
				7404
				7405	/*
				7406	* Don't submit if refs are dying, good for io_uring_register(),
				7407	* but also it is relied upon by io_ring_exit_work()
				7408	*/
				7409	if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
				7410	!(ctx->flags & IORING_SETUP_R_DISABLED))
				7411	ret = io_submit_sqes(ctx, to_submit);
				7412	mutex_unlock(&ctx->uring_lock);
				7413
				7414	if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
				7415	wake_up(&ctx->sqo_sq_wait);
				7416	if (creds)
				7417	revert_creds(creds);
				7418	}
				7419
				7420	return ret;
				7421	}
				7422
				7423	static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
				7424	{
				7425	struct io_ring_ctx *ctx;
				7426	unsigned sq_thread_idle = 0;
				7427
				7428	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				7429	sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
				7430	sqd->sq_thread_idle = sq_thread_idle;
				7431	}
				7432
				7433	static bool io_sqd_handle_event(struct io_sq_data *sqd)
				7434	{
				7435	bool did_sig = false;
				7436	struct ksignal ksig;
				7437
				7438	if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) \|\|
				7439	signal_pending(current)) {
				7440	mutex_unlock(&sqd->lock);
				7441	if (signal_pending(current))
				7442	did_sig = get_signal(&ksig);
				7443	cond_resched();
				7444	mutex_lock(&sqd->lock);
				7445	}
				7446	return did_sig \|\| test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
				7447	}
				7448
				7449	static int io_sq_thread(void *data)
				7450	{
				7451	struct io_sq_data *sqd = data;
				7452	struct io_ring_ctx *ctx;
				7453	unsigned long timeout = 0;
				7454	char buf[TASK_COMM_LEN];
				7455	DEFINE_WAIT(wait);
				7456
				7457	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
				7458	set_task_comm(current, buf);
				7459
				7460	if (sqd->sq_cpu != -1)
				7461	set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
				7462	else
				7463	set_cpus_allowed_ptr(current, cpu_online_mask);
				7464	current->flags \|= PF_NO_SETAFFINITY;
				7465
				7466	mutex_lock(&sqd->lock);
				7467	while (1) {
				7468	bool cap_entries, sqt_spin = false;
				7469
				7470	if (io_sqd_events_pending(sqd) \|\| signal_pending(current)) {
				7471	if (io_sqd_handle_event(sqd))
				7472	break;
				7473	timeout = jiffies + sqd->sq_thread_idle;
				7474	}
				7475
				7476	cap_entries = !list_is_singular(&sqd->ctx_list);
				7477	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
				7478	int ret = __io_sq_thread(ctx, cap_entries);
				7479
				7480	if (!sqt_spin && (ret > 0 \|\| !list_empty(&ctx->iopoll_list)))
				7481	sqt_spin = true;
				7482	}
				7483	if (io_run_task_work())
				7484	sqt_spin = true;
				7485
				7486	if (sqt_spin \|\| !time_after(jiffies, timeout)) {
				7487	cond_resched();
				7488	if (sqt_spin)
				7489	timeout = jiffies + sqd->sq_thread_idle;
				7490	continue;
				7491	}
				7492
				7493	prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
				7494	if (!io_sqd_events_pending(sqd) && !current->task_works) {
				7495	bool needs_sched = true;
				7496
				7497	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
				7498	io_ring_set_wakeup_flag(ctx);
				7499
				7500	if ((ctx->flags & IORING_SETUP_IOPOLL) &&
				7501	!list_empty_careful(&ctx->iopoll_list)) {
				7502	needs_sched = false;
				7503	break;
				7504	}
				7505	if (io_sqring_entries(ctx)) {
				7506	needs_sched = false;
				7507	break;
				7508	}
				7509	}
				7510
				7511	if (needs_sched) {
				7512	mutex_unlock(&sqd->lock);
				7513	schedule();
				7514	mutex_lock(&sqd->lock);
				7515	}
				7516	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				7517	io_ring_clear_wakeup_flag(ctx);
				7518	}
				7519
				7520	finish_wait(&sqd->wait, &wait);
				7521	timeout = jiffies + sqd->sq_thread_idle;
				7522	}
				7523
				7524	io_uring_cancel_generic(true, sqd);
				7525	sqd->thread = NULL;
				7526	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				7527	io_ring_set_wakeup_flag(ctx);
				7528	io_run_task_work();
				7529	mutex_unlock(&sqd->lock);
				7530
				7531	complete(&sqd->exited);
				7532	do_exit(0);
				7533	}
				7534
				7535	struct io_wait_queue {
				7536	struct wait_queue_entry wq;
				7537	struct io_ring_ctx *ctx;
				7538	unsigned cq_tail;
				7539	unsigned nr_timeouts;
				7540	};
				7541
				7542	static inline bool io_should_wake(struct io_wait_queue *iowq)
				7543	{
				7544	struct io_ring_ctx *ctx = iowq->ctx;
				7545	int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
				7546
				7547	/*
				7548	* Wake up if we have enough events, or if a timeout occurred since we
				7549	* started waiting. For timeouts, we always want to return to userspace,
				7550	* regardless of event count.
				7551	*/
				7552	return dist >= 0 \|\| atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
				7553	}
				7554
				7555	static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
				7556	int wake_flags, void *key)
				7557	{
				7558	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
				7559	wq);
				7560
				7561	/*
				7562	* Cannot safely flush overflowed CQEs from here, ensure we wake up
				7563	* the task, and the next invocation will do it.
				7564	*/
				7565	if (io_should_wake(iowq) \|\| test_bit(0, &iowq->ctx->check_cq_overflow))
				7566	return autoremove_wake_function(curr, mode, wake_flags, key);
				7567	return -1;
				7568	}
				7569
				7570	static int io_run_task_work_sig(void)
				7571	{
				7572	if (io_run_task_work())
				7573	return 1;
				7574	if (!signal_pending(current))
				7575	return 0;
				7576	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
				7577	return -ERESTARTSYS;
				7578	return -EINTR;
				7579	}
				7580
				7581	/* when returns >0, the caller should retry */
				7582	static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
				7583	struct io_wait_queue *iowq,
Pavel Begunkov	928a9e5	2023-01-05 10:49:15 +0000	[diff] [blame]	7584	ktime_t *timeout)
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	7585	{
				7586	int ret;
				7587
				7588	/* make sure we run task_work before checking for signals */
				7589	ret = io_run_task_work_sig();
				7590	if (ret \|\| io_should_wake(iowq))
				7591	return ret;
				7592	/* let the caller flush overflows, retry */
				7593	if (test_bit(0, &ctx->check_cq_overflow))
				7594	return 1;
				7595
Pavel Begunkov	928a9e5	2023-01-05 10:49:15 +0000	[diff] [blame]	7596	if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	7597	return -ETIME;
				7598	return 1;
				7599	}
				7600
				7601	/*
				7602	* Wait until events become available, if we don't already have some. The
				7603	* application must reap them itself, as they reside on the shared cq ring.
				7604	*/
				7605	static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
				7606	const sigset_t __user *sig, size_t sigsz,
				7607	struct __kernel_timespec __user *uts)
				7608	{
				7609	struct io_wait_queue iowq;
				7610	struct io_rings *rings = ctx->rings;
				7611	ktime_t timeout = KTIME_MAX;
				7612	int ret;
				7613
				7614	do {
				7615	io_cqring_overflow_flush(ctx);
				7616	if (io_cqring_events(ctx) >= min_events)
				7617	return 0;
				7618	if (!io_run_task_work())
				7619	break;
				7620	} while (1);
				7621
				7622	if (uts) {
				7623	struct timespec64 ts;
				7624
				7625	if (get_timespec64(&ts, uts))
				7626	return -EFAULT;
				7627	timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
				7628	}
				7629
				7630	if (sig) {
				7631	#ifdef CONFIG_COMPAT
				7632	if (in_compat_syscall())
				7633	ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
				7634	sigsz);
				7635	else
				7636	#endif
				7637	ret = set_user_sigmask(sig, sigsz);
				7638
				7639	if (ret)
				7640	return ret;
				7641	}
				7642
				7643	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
				7644	iowq.wq.private = current;
				7645	INIT_LIST_HEAD(&iowq.wq.entry);
				7646	iowq.ctx = ctx;
				7647	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
				7648	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
				7649
				7650	trace_io_uring_cqring_wait(ctx, min_events);
				7651	do {
				7652	/* if we can't even flush overflow, don't wait for more */
				7653	if (!io_cqring_overflow_flush(ctx)) {
				7654	ret = -EBUSY;
				7655	break;
				7656	}
				7657	prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
				7658	TASK_INTERRUPTIBLE);
Pavel Begunkov	928a9e5	2023-01-05 10:49:15 +0000	[diff] [blame]	7659	ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	7660	finish_wait(&ctx->cq_wait, &iowq.wq);
				7661	cond_resched();
				7662	} while (ret > 0);
				7663
				7664	restore_saved_sigmask_unless(ret == -EINTR);
				7665
				7666	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
				7667	}
				7668
				7669	static void io_free_page_table(void **table, size_t size)
				7670	{
				7671	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
				7672
				7673	for (i = 0; i < nr_tables; i++)
				7674	kfree(table[i]);
				7675	kfree(table);
				7676	}
				7677
				7678	static void **io_alloc_page_table(size_t size)
				7679	{
				7680	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
				7681	size_t init_size = size;
				7682	void **table;
				7683
				7684	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
				7685	if (!table)
				7686	return NULL;
				7687
				7688	for (i = 0; i < nr_tables; i++) {
				7689	unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
				7690
				7691	table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
				7692	if (!table[i]) {
				7693	io_free_page_table(table, init_size);
				7694	return NULL;
				7695	}
				7696	size -= this_size;
				7697	}
				7698	return table;
				7699	}
				7700
				7701	static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
				7702	{
				7703	percpu_ref_exit(&ref_node->refs);
				7704	kfree(ref_node);
				7705	}
				7706
				7707	static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
				7708	{
				7709	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
				7710	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
				7711	unsigned long flags;
				7712	bool first_add = false;
				7713	unsigned long delay = HZ;
				7714
				7715	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
				7716	node->done = true;
				7717
				7718	/* if we are mid-quiesce then do not delay */
				7719	if (node->rsrc_data->quiesce)
				7720	delay = 0;
				7721
				7722	while (!list_empty(&ctx->rsrc_ref_list)) {
				7723	node = list_first_entry(&ctx->rsrc_ref_list,
				7724	struct io_rsrc_node, node);
				7725	/* recycle ref nodes in order */
				7726	if (!node->done)
				7727	break;
				7728	list_del(&node->node);
				7729	first_add \|= llist_add(&node->llist, &ctx->rsrc_put_llist);
				7730	}
				7731	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
				7732
				7733	if (first_add)
				7734	mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
				7735	}
				7736
				7737	static struct io_rsrc_node io_rsrc_node_alloc(struct io_ring_ctx ctx)
				7738	{
				7739	struct io_rsrc_node *ref_node;
				7740
				7741	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
				7742	if (!ref_node)
				7743	return NULL;
				7744
				7745	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
				7746	0, GFP_KERNEL)) {
				7747	kfree(ref_node);
				7748	return NULL;
				7749	}
				7750	INIT_LIST_HEAD(&ref_node->node);
				7751	INIT_LIST_HEAD(&ref_node->rsrc_list);
				7752	ref_node->done = false;
				7753	return ref_node;
				7754	}
				7755
				7756	static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
				7757	struct io_rsrc_data *data_to_kill)
				7758	{
				7759	WARN_ON_ONCE(!ctx->rsrc_backup_node);
				7760	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
				7761
				7762	if (data_to_kill) {
				7763	struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
				7764
				7765	rsrc_node->rsrc_data = data_to_kill;
				7766	spin_lock_irq(&ctx->rsrc_ref_lock);
				7767	list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
				7768	spin_unlock_irq(&ctx->rsrc_ref_lock);
				7769
				7770	atomic_inc(&data_to_kill->refs);
				7771	percpu_ref_kill(&rsrc_node->refs);
				7772	ctx->rsrc_node = NULL;
				7773	}
				7774
				7775	if (!ctx->rsrc_node) {
				7776	ctx->rsrc_node = ctx->rsrc_backup_node;
				7777	ctx->rsrc_backup_node = NULL;
				7778	}
				7779	}
				7780
				7781	static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
				7782	{
				7783	if (ctx->rsrc_backup_node)
				7784	return 0;
				7785	ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
				7786	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
				7787	}
				7788
				7789	static int io_rsrc_ref_quiesce(struct io_rsrc_data data, struct io_ring_ctx ctx)
				7790	{
				7791	int ret;
				7792
				7793	/* As we may drop ->uring_lock, other task may have started quiesce */
				7794	if (data->quiesce)
				7795	return -ENXIO;
				7796
				7797	data->quiesce = true;
				7798	do {
				7799	ret = io_rsrc_node_switch_start(ctx);
				7800	if (ret)
				7801	break;
				7802	io_rsrc_node_switch(ctx, data);
				7803
				7804	/* kill initial ref, already quiesced if zero */
				7805	if (atomic_dec_and_test(&data->refs))
				7806	break;
				7807	mutex_unlock(&ctx->uring_lock);
				7808	flush_delayed_work(&ctx->rsrc_put_work);
				7809	ret = wait_for_completion_interruptible(&data->done);
				7810	if (!ret) {
				7811	mutex_lock(&ctx->uring_lock);
				7812	if (atomic_read(&data->refs) > 0) {
				7813	/*
				7814	* it has been revived by another thread while
				7815	* we were unlocked
				7816	*/
				7817	mutex_unlock(&ctx->uring_lock);
				7818	} else {
				7819	break;
				7820	}
				7821	}
				7822
				7823	atomic_inc(&data->refs);
				7824	/* wait for all works potentially completing data->done */
				7825	flush_delayed_work(&ctx->rsrc_put_work);
				7826	reinit_completion(&data->done);
				7827
				7828	ret = io_run_task_work_sig();
				7829	mutex_lock(&ctx->uring_lock);
				7830	} while (ret >= 0);
				7831	data->quiesce = false;
				7832
				7833	return ret;
				7834	}
				7835
				7836	static u64 io_get_tag_slot(struct io_rsrc_data data, unsigned int idx)
				7837	{
				7838	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
				7839	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
				7840
				7841	return &data->tags[table_idx][off];
				7842	}
				7843
				7844	static void io_rsrc_data_free(struct io_rsrc_data *data)
				7845	{
				7846	size_t size = data->nr * sizeof(data->tags[0][0]);
				7847
				7848	if (data->tags)
				7849	io_free_page_table((void **)data->tags, size);
				7850	kfree(data);
				7851	}
				7852
				7853	static int io_rsrc_data_alloc(struct io_ring_ctx ctx, rsrc_put_fn do_put,
				7854	u64 __user *utags, unsigned nr,
				7855	struct io_rsrc_data **pdata)
				7856	{
				7857	struct io_rsrc_data *data;
				7858	int ret = -ENOMEM;
				7859	unsigned i;
				7860
				7861	data = kzalloc(sizeof(*data), GFP_KERNEL);
				7862	if (!data)
				7863	return -ENOMEM;
				7864	data->tags = (u64 *)io_alloc_page_table(nr sizeof(data->tags[0][0]));
				7865	if (!data->tags) {
				7866	kfree(data);
				7867	return -ENOMEM;
				7868	}
				7869
				7870	data->nr = nr;
				7871	data->ctx = ctx;
				7872	data->do_put = do_put;
				7873	if (utags) {
				7874	ret = -EFAULT;
				7875	for (i = 0; i < nr; i++) {
				7876	u64 *tag_slot = io_get_tag_slot(data, i);
				7877
				7878	if (copy_from_user(tag_slot, &utags[i],
				7879	sizeof(*tag_slot)))
				7880	goto fail;
				7881	}
				7882	}
				7883
				7884	atomic_set(&data->refs, 1);
				7885	init_completion(&data->done);
				7886	*pdata = data;
				7887	return 0;
				7888	fail:
				7889	io_rsrc_data_free(data);
				7890	return ret;
				7891	}
				7892
				7893	static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
				7894	{
				7895	table->files = kvcalloc(nr_files, sizeof(table->files[0]),
				7896	GFP_KERNEL_ACCOUNT);
				7897	return !!table->files;
				7898	}
				7899
				7900	static void io_free_file_tables(struct io_file_table *table)
				7901	{
				7902	kvfree(table->files);
				7903	table->files = NULL;
				7904	}
				7905
				7906	static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
				7907	{
				7908	#if defined(CONFIG_UNIX)
				7909	if (ctx->ring_sock) {
				7910	struct sock *sock = ctx->ring_sock->sk;
				7911	struct sk_buff *skb;
				7912
				7913	while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
				7914	kfree_skb(skb);
				7915	}
				7916	#else
				7917	int i;
				7918
				7919	for (i = 0; i < ctx->nr_user_files; i++) {
				7920	struct file *file;
				7921
				7922	file = io_file_from_index(ctx, i);
				7923	if (file)
				7924	fput(file);
				7925	}
				7926	#endif
				7927	io_free_file_tables(&ctx->file_table);
				7928	io_rsrc_data_free(ctx->file_data);
				7929	ctx->file_data = NULL;
				7930	ctx->nr_user_files = 0;
				7931	}
				7932
				7933	static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
				7934	{
				7935	unsigned nr = ctx->nr_user_files;
				7936	int ret;
				7937
				7938	if (!ctx->file_data)
				7939	return -ENXIO;
				7940
				7941	/*
				7942	* Quiesce may unlock ->uring_lock, and while it's not held
				7943	* prevent new requests using the table.
				7944	*/
				7945	ctx->nr_user_files = 0;
				7946	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
				7947	ctx->nr_user_files = nr;
				7948	if (!ret)
				7949	__io_sqe_files_unregister(ctx);
				7950	return ret;
				7951	}
				7952
				7953	static void io_sq_thread_unpark(struct io_sq_data *sqd)
				7954	__releases(&sqd->lock)
				7955	{
				7956	WARN_ON_ONCE(sqd->thread == current);
				7957
				7958	/*
				7959	* Do the dance but not conditional clear_bit() because it'd race with
				7960	* other threads incrementing park_pending and setting the bit.
				7961	*/
				7962	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
				7963	if (atomic_dec_return(&sqd->park_pending))
				7964	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
				7965	mutex_unlock(&sqd->lock);
				7966	}
				7967
				7968	static void io_sq_thread_park(struct io_sq_data *sqd)
				7969	__acquires(&sqd->lock)
				7970	{
				7971	WARN_ON_ONCE(sqd->thread == current);
				7972
				7973	atomic_inc(&sqd->park_pending);
				7974	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
				7975	mutex_lock(&sqd->lock);
				7976	if (sqd->thread)
				7977	wake_up_process(sqd->thread);
				7978	}
				7979
				7980	static void io_sq_thread_stop(struct io_sq_data *sqd)
				7981	{
				7982	WARN_ON_ONCE(sqd->thread == current);
				7983	WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
				7984
				7985	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
				7986	mutex_lock(&sqd->lock);
				7987	if (sqd->thread)
				7988	wake_up_process(sqd->thread);
				7989	mutex_unlock(&sqd->lock);
				7990	wait_for_completion(&sqd->exited);
				7991	}
				7992
				7993	static void io_put_sq_data(struct io_sq_data *sqd)
				7994	{
				7995	if (refcount_dec_and_test(&sqd->refs)) {
				7996	WARN_ON_ONCE(atomic_read(&sqd->park_pending));
				7997
				7998	io_sq_thread_stop(sqd);
				7999	kfree(sqd);
				8000	}
				8001	}
				8002
				8003	static void io_sq_thread_finish(struct io_ring_ctx *ctx)
				8004	{
				8005	struct io_sq_data *sqd = ctx->sq_data;
				8006
				8007	if (sqd) {
				8008	io_sq_thread_park(sqd);
				8009	list_del_init(&ctx->sqd_list);
				8010	io_sqd_update_thread_idle(sqd);
				8011	io_sq_thread_unpark(sqd);
				8012
				8013	io_put_sq_data(sqd);
				8014	ctx->sq_data = NULL;
				8015	}
				8016	}
				8017
				8018	static struct io_sq_data io_attach_sq_data(struct io_uring_params p)
				8019	{
				8020	struct io_ring_ctx *ctx_attach;
				8021	struct io_sq_data *sqd;
				8022	struct fd f;
				8023
				8024	f = fdget(p->wq_fd);
				8025	if (!f.file)
				8026	return ERR_PTR(-ENXIO);
				8027	if (f.file->f_op != &io_uring_fops) {
				8028	fdput(f);
				8029	return ERR_PTR(-EINVAL);
				8030	}
				8031
				8032	ctx_attach = f.file->private_data;
				8033	sqd = ctx_attach->sq_data;
				8034	if (!sqd) {
				8035	fdput(f);
				8036	return ERR_PTR(-EINVAL);
				8037	}
				8038	if (sqd->task_tgid != current->tgid) {
				8039	fdput(f);
				8040	return ERR_PTR(-EPERM);
				8041	}
				8042
				8043	refcount_inc(&sqd->refs);
				8044	fdput(f);
				8045	return sqd;
				8046	}
				8047
				8048	static struct io_sq_data io_get_sq_data(struct io_uring_params p,
				8049	bool *attached)
				8050	{
				8051	struct io_sq_data *sqd;
				8052
				8053	*attached = false;
				8054	if (p->flags & IORING_SETUP_ATTACH_WQ) {
				8055	sqd = io_attach_sq_data(p);
				8056	if (!IS_ERR(sqd)) {
				8057	*attached = true;
				8058	return sqd;
				8059	}
				8060	/* fall through for EPERM case, setup new sqd/task */
				8061	if (PTR_ERR(sqd) != -EPERM)
				8062	return sqd;
				8063	}
				8064
				8065	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
				8066	if (!sqd)
				8067	return ERR_PTR(-ENOMEM);
				8068
				8069	atomic_set(&sqd->park_pending, 0);
				8070	refcount_set(&sqd->refs, 1);
				8071	INIT_LIST_HEAD(&sqd->ctx_list);
				8072	mutex_init(&sqd->lock);
				8073	init_waitqueue_head(&sqd->wait);
				8074	init_completion(&sqd->exited);
				8075	return sqd;
				8076	}
				8077
				8078	#if defined(CONFIG_UNIX)
				8079	/*
				8080	* Ensure the UNIX gc is aware of our file set, so we are certain that
				8081	* the io_uring can be safely unregistered on process exit, even if we have
				8082	* loops in the file referencing.
				8083	*/
				8084	static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
				8085	{
				8086	struct sock *sk = ctx->ring_sock->sk;
				8087	struct scm_fp_list *fpl;
				8088	struct sk_buff *skb;
				8089	int i, nr_files;
				8090
				8091	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
				8092	if (!fpl)
				8093	return -ENOMEM;
				8094
				8095	skb = alloc_skb(0, GFP_KERNEL);
				8096	if (!skb) {
				8097	kfree(fpl);
				8098	return -ENOMEM;
				8099	}
				8100
				8101	skb->sk = sk;
				8102	skb->scm_io_uring = 1;
				8103
				8104	nr_files = 0;
				8105	fpl->user = get_uid(current_user());
				8106	for (i = 0; i < nr; i++) {
				8107	struct file *file = io_file_from_index(ctx, i + offset);
				8108
				8109	if (!file)
				8110	continue;
				8111	fpl->fp[nr_files] = get_file(file);
				8112	unix_inflight(fpl->user, fpl->fp[nr_files]);
				8113	nr_files++;
				8114	}
				8115
				8116	if (nr_files) {
				8117	fpl->max = SCM_MAX_FD;
				8118	fpl->count = nr_files;
				8119	UNIXCB(skb).fp = fpl;
				8120	skb->destructor = unix_destruct_scm;
				8121	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
				8122	skb_queue_head(&sk->sk_receive_queue, skb);
				8123
				8124	for (i = 0; i < nr; i++) {
				8125	struct file *file = io_file_from_index(ctx, i + offset);
				8126
				8127	if (file)
				8128	fput(file);
				8129	}
				8130	} else {
				8131	kfree_skb(skb);
				8132	free_uid(fpl->user);
				8133	kfree(fpl);
				8134	}
				8135
				8136	return 0;
				8137	}
				8138
				8139	/*
				8140	* If UNIX sockets are enabled, fd passing can cause a reference cycle which
				8141	* causes regular reference counting to break down. We rely on the UNIX
				8142	* garbage collection to take care of this problem for us.
				8143	*/
				8144	static int io_sqe_files_scm(struct io_ring_ctx *ctx)
				8145	{
				8146	unsigned left, total;
				8147	int ret = 0;
				8148
				8149	total = 0;
				8150	left = ctx->nr_user_files;
				8151	while (left) {
				8152	unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
				8153
				8154	ret = __io_sqe_files_scm(ctx, this_files, total);
				8155	if (ret)
				8156	break;
				8157	left -= this_files;
				8158	total += this_files;
				8159	}
				8160
				8161	if (!ret)
				8162	return 0;
				8163
				8164	while (total < ctx->nr_user_files) {
				8165	struct file *file = io_file_from_index(ctx, total);
				8166
				8167	if (file)
				8168	fput(file);
				8169	total++;
				8170	}
				8171
				8172	return ret;
				8173	}
				8174	#else
				8175	static int io_sqe_files_scm(struct io_ring_ctx *ctx)
				8176	{
				8177	return 0;
				8178	}
				8179	#endif
				8180
				8181	static void io_rsrc_file_put(struct io_ring_ctx ctx, struct io_rsrc_put prsrc)
				8182	{
				8183	struct file *file = prsrc->file;
				8184	#if defined(CONFIG_UNIX)
				8185	struct sock *sock = ctx->ring_sock->sk;
				8186	struct sk_buff_head list, *head = &sock->sk_receive_queue;
				8187	struct sk_buff *skb;
				8188	int i;
				8189
				8190	__skb_queue_head_init(&list);
				8191
				8192	/*
				8193	* Find the skb that holds this file in its SCM_RIGHTS. When found,
				8194	* remove this entry and rearrange the file array.
				8195	*/
				8196	skb = skb_dequeue(head);
				8197	while (skb) {
				8198	struct scm_fp_list *fp;
				8199
				8200	fp = UNIXCB(skb).fp;
				8201	for (i = 0; i < fp->count; i++) {
				8202	int left;
				8203
				8204	if (fp->fp[i] != file)
				8205	continue;
				8206
				8207	unix_notinflight(fp->user, fp->fp[i]);
				8208	left = fp->count - 1 - i;
				8209	if (left) {
				8210	memmove(&fp->fp[i], &fp->fp[i + 1],
				8211	left * sizeof(struct file *));
				8212	}
				8213	fp->count--;
				8214	if (!fp->count) {
				8215	kfree_skb(skb);
				8216	skb = NULL;
				8217	} else {
				8218	__skb_queue_tail(&list, skb);
				8219	}
				8220	fput(file);
				8221	file = NULL;
				8222	break;
				8223	}
				8224
				8225	if (!file)
				8226	break;
				8227
				8228	__skb_queue_tail(&list, skb);
				8229
				8230	skb = skb_dequeue(head);
				8231	}
				8232
				8233	if (skb_peek(&list)) {
				8234	spin_lock_irq(&head->lock);
				8235	while ((skb = __skb_dequeue(&list)) != NULL)
				8236	__skb_queue_tail(head, skb);
				8237	spin_unlock_irq(&head->lock);
				8238	}
				8239	#else
				8240	fput(file);
				8241	#endif
				8242	}
				8243
				8244	static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
				8245	{
				8246	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
				8247	struct io_ring_ctx *ctx = rsrc_data->ctx;
				8248	struct io_rsrc_put prsrc, tmp;
				8249
				8250	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
				8251	list_del(&prsrc->list);
				8252
				8253	if (prsrc->tag) {
				8254	bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
				8255
				8256	io_ring_submit_lock(ctx, lock_ring);
				8257	spin_lock(&ctx->completion_lock);
				8258	io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
				8259	io_commit_cqring(ctx);
				8260	spin_unlock(&ctx->completion_lock);
				8261	io_cqring_ev_posted(ctx);
				8262	io_ring_submit_unlock(ctx, lock_ring);
				8263	}
				8264
				8265	rsrc_data->do_put(ctx, prsrc);
				8266	kfree(prsrc);
				8267	}
				8268
				8269	io_rsrc_node_destroy(ref_node);
				8270	if (atomic_dec_and_test(&rsrc_data->refs))
				8271	complete(&rsrc_data->done);
				8272	}
				8273
				8274	static void io_rsrc_put_work(struct work_struct *work)
				8275	{
				8276	struct io_ring_ctx *ctx;
				8277	struct llist_node *node;
				8278
				8279	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
				8280	node = llist_del_all(&ctx->rsrc_put_llist);
				8281
				8282	while (node) {
				8283	struct io_rsrc_node *ref_node;
				8284	struct llist_node *next = node->next;
				8285
				8286	ref_node = llist_entry(node, struct io_rsrc_node, llist);
				8287	__io_rsrc_put_work(ref_node);
				8288	node = next;
				8289	}
				8290	}
				8291
				8292	static int io_sqe_files_register(struct io_ring_ctx ctx, void __user arg,
				8293	unsigned nr_args, u64 __user *tags)
				8294	{
				8295	__s32 __user fds = (__s32 __user ) arg;
				8296	struct file *file;
				8297	int fd, ret;
				8298	unsigned i;
				8299
				8300	if (ctx->file_data)
				8301	return -EBUSY;
				8302	if (!nr_args)
				8303	return -EINVAL;
				8304	if (nr_args > IORING_MAX_FIXED_FILES)
				8305	return -EMFILE;
				8306	if (nr_args > rlimit(RLIMIT_NOFILE))
				8307	return -EMFILE;
				8308	ret = io_rsrc_node_switch_start(ctx);
				8309	if (ret)
				8310	return ret;
				8311	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				8312	&ctx->file_data);
				8313	if (ret)
				8314	return ret;
				8315
				8316	ret = -ENOMEM;
				8317	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
				8318	goto out_free;
				8319
				8320	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
				8321	if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
				8322	ret = -EFAULT;
				8323	goto out_fput;
				8324	}
				8325	/* allow sparse sets */
				8326	if (fd == -1) {
				8327	ret = -EINVAL;
				8328	if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				8329	goto out_fput;
				8330	continue;
				8331	}
				8332
				8333	file = fget(fd);
				8334	ret = -EBADF;
				8335	if (unlikely(!file))
				8336	goto out_fput;
				8337
				8338	/*
				8339	* Don't allow io_uring instances to be registered. If UNIX
				8340	* isn't enabled, then this causes a reference cycle and this
				8341	* instance can never get freed. If UNIX is enabled we'll
				8342	* handle it just fine, but there's still no point in allowing
				8343	* a ring fd as it doesn't support regular read/write anyway.
				8344	*/
				8345	if (file->f_op == &io_uring_fops) {
				8346	fput(file);
				8347	goto out_fput;
				8348	}
				8349	io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
				8350	}
				8351
				8352	ret = io_sqe_files_scm(ctx);
				8353	if (ret) {
				8354	__io_sqe_files_unregister(ctx);
				8355	return ret;
				8356	}
				8357
				8358	io_rsrc_node_switch(ctx, NULL);
				8359	return ret;
				8360	out_fput:
				8361	for (i = 0; i < ctx->nr_user_files; i++) {
				8362	file = io_file_from_index(ctx, i);
				8363	if (file)
				8364	fput(file);
				8365	}
				8366	io_free_file_tables(&ctx->file_table);
				8367	ctx->nr_user_files = 0;
				8368	out_free:
				8369	io_rsrc_data_free(ctx->file_data);
				8370	ctx->file_data = NULL;
				8371	return ret;
				8372	}
				8373
				8374	static int io_sqe_file_register(struct io_ring_ctx ctx, struct file file,
				8375	int index)
				8376	{
				8377	#if defined(CONFIG_UNIX)
				8378	struct sock *sock = ctx->ring_sock->sk;
				8379	struct sk_buff_head *head = &sock->sk_receive_queue;
				8380	struct sk_buff *skb;
				8381
				8382	/*
				8383	* See if we can merge this file into an existing skb SCM_RIGHTS
				8384	* file set. If there's no room, fall back to allocating a new skb
				8385	* and filling it in.
				8386	*/
				8387	spin_lock_irq(&head->lock);
				8388	skb = skb_peek(head);
				8389	if (skb) {
				8390	struct scm_fp_list *fpl = UNIXCB(skb).fp;
				8391
				8392	if (fpl->count < SCM_MAX_FD) {
				8393	__skb_unlink(skb, head);
				8394	spin_unlock_irq(&head->lock);
				8395	fpl->fp[fpl->count] = get_file(file);
				8396	unix_inflight(fpl->user, fpl->fp[fpl->count]);
				8397	fpl->count++;
				8398	spin_lock_irq(&head->lock);
				8399	__skb_queue_head(head, skb);
				8400	} else {
				8401	skb = NULL;
				8402	}
				8403	}
				8404	spin_unlock_irq(&head->lock);
				8405
				8406	if (skb) {
				8407	fput(file);
				8408	return 0;
				8409	}
				8410
				8411	return __io_sqe_files_scm(ctx, 1, index);
				8412	#else
				8413	return 0;
				8414	#endif
				8415	}
				8416
				8417	static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
				8418	struct io_rsrc_node node, void rsrc)
				8419	{
				8420	u64 *tag_slot = io_get_tag_slot(data, idx);
				8421	struct io_rsrc_put *prsrc;
				8422
				8423	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
				8424	if (!prsrc)
				8425	return -ENOMEM;
				8426
				8427	prsrc->tag = *tag_slot;
				8428	*tag_slot = 0;
				8429	prsrc->rsrc = rsrc;
				8430	list_add(&prsrc->list, &node->rsrc_list);
				8431	return 0;
				8432	}
				8433
				8434	static int io_install_fixed_file(struct io_kiocb req, struct file file,
				8435	unsigned int issue_flags, u32 slot_index)
				8436	{
				8437	struct io_ring_ctx *ctx = req->ctx;
				8438	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
				8439	bool needs_switch = false;
				8440	struct io_fixed_file *file_slot;
				8441	int ret = -EBADF;
				8442
				8443	io_ring_submit_lock(ctx, !force_nonblock);
				8444	if (file->f_op == &io_uring_fops)
				8445	goto err;
				8446	ret = -ENXIO;
				8447	if (!ctx->file_data)
				8448	goto err;
				8449	ret = -EINVAL;
				8450	if (slot_index >= ctx->nr_user_files)
				8451	goto err;
				8452
				8453	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
				8454	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
				8455
				8456	if (file_slot->file_ptr) {
				8457	struct file *old_file;
				8458
				8459	ret = io_rsrc_node_switch_start(ctx);
				8460	if (ret)
				8461	goto err;
				8462
				8463	old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
				8464	ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
				8465	ctx->rsrc_node, old_file);
				8466	if (ret)
				8467	goto err;
				8468	file_slot->file_ptr = 0;
				8469	needs_switch = true;
				8470	}
				8471
				8472	*io_get_tag_slot(ctx->file_data, slot_index) = 0;
				8473	io_fixed_file_set(file_slot, file);
				8474	ret = io_sqe_file_register(ctx, file, slot_index);
				8475	if (ret) {
				8476	file_slot->file_ptr = 0;
				8477	goto err;
				8478	}
				8479
				8480	ret = 0;
				8481	err:
				8482	if (needs_switch)
				8483	io_rsrc_node_switch(ctx, ctx->file_data);
				8484	io_ring_submit_unlock(ctx, !force_nonblock);
				8485	if (ret)
				8486	fput(file);
				8487	return ret;
				8488	}
				8489
				8490	static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
				8491	{
				8492	unsigned int offset = req->close.file_slot - 1;
				8493	struct io_ring_ctx *ctx = req->ctx;
				8494	struct io_fixed_file *file_slot;
				8495	struct file *file;
				8496	int ret;
				8497
				8498	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
				8499	ret = -ENXIO;
				8500	if (unlikely(!ctx->file_data))
				8501	goto out;
				8502	ret = -EINVAL;
				8503	if (offset >= ctx->nr_user_files)
				8504	goto out;
				8505	ret = io_rsrc_node_switch_start(ctx);
				8506	if (ret)
				8507	goto out;
				8508
				8509	offset = array_index_nospec(offset, ctx->nr_user_files);
				8510	file_slot = io_fixed_file_slot(&ctx->file_table, offset);
				8511	ret = -EBADF;
				8512	if (!file_slot->file_ptr)
				8513	goto out;
				8514
				8515	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
				8516	ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
				8517	if (ret)
				8518	goto out;
				8519
				8520	file_slot->file_ptr = 0;
				8521	io_rsrc_node_switch(ctx, ctx->file_data);
				8522	ret = 0;
				8523	out:
				8524	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
				8525	return ret;
				8526	}
				8527
				8528	static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				8529	struct io_uring_rsrc_update2 *up,
				8530	unsigned nr_args)
				8531	{
				8532	u64 __user *tags = u64_to_user_ptr(up->tags);
				8533	__s32 __user *fds = u64_to_user_ptr(up->data);
				8534	struct io_rsrc_data *data = ctx->file_data;
				8535	struct io_fixed_file *file_slot;
				8536	struct file *file;
				8537	int fd, i, err = 0;
				8538	unsigned int done;
				8539	bool needs_switch = false;
				8540
				8541	if (!ctx->file_data)
				8542	return -ENXIO;
				8543	if (up->offset + nr_args > ctx->nr_user_files)
				8544	return -EINVAL;
				8545
				8546	for (done = 0; done < nr_args; done++) {
				8547	u64 tag = 0;
				8548
				8549	if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) \|\|
				8550	copy_from_user(&fd, &fds[done], sizeof(fd))) {
				8551	err = -EFAULT;
				8552	break;
				8553	}
				8554	if ((fd == IORING_REGISTER_FILES_SKIP \|\| fd == -1) && tag) {
				8555	err = -EINVAL;
				8556	break;
				8557	}
				8558	if (fd == IORING_REGISTER_FILES_SKIP)
				8559	continue;
				8560
				8561	i = array_index_nospec(up->offset + done, ctx->nr_user_files);
				8562	file_slot = io_fixed_file_slot(&ctx->file_table, i);
				8563
				8564	if (file_slot->file_ptr) {
				8565	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
				8566	err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
				8567	if (err)
				8568	break;
				8569	file_slot->file_ptr = 0;
				8570	needs_switch = true;
				8571	}
				8572	if (fd != -1) {
				8573	file = fget(fd);
				8574	if (!file) {
				8575	err = -EBADF;
				8576	break;
				8577	}
				8578	/*
				8579	* Don't allow io_uring instances to be registered. If
				8580	* UNIX isn't enabled, then this causes a reference
				8581	* cycle and this instance can never get freed. If UNIX
				8582	* is enabled we'll handle it just fine, but there's
				8583	* still no point in allowing a ring fd as it doesn't
				8584	* support regular read/write anyway.
				8585	*/
				8586	if (file->f_op == &io_uring_fops) {
				8587	fput(file);
				8588	err = -EBADF;
				8589	break;
				8590	}
				8591	*io_get_tag_slot(data, i) = tag;
				8592	io_fixed_file_set(file_slot, file);
				8593	err = io_sqe_file_register(ctx, file, i);
				8594	if (err) {
				8595	file_slot->file_ptr = 0;
				8596	fput(file);
				8597	break;
				8598	}
				8599	}
				8600	}
				8601
				8602	if (needs_switch)
				8603	io_rsrc_node_switch(ctx, data);
				8604	return done ? done : err;
				8605	}
				8606
				8607	static struct io_wq io_init_wq_offload(struct io_ring_ctx ctx,
				8608	struct task_struct *task)
				8609	{
				8610	struct io_wq_hash *hash;
				8611	struct io_wq_data data;
				8612	unsigned int concurrency;
				8613
				8614	mutex_lock(&ctx->uring_lock);
				8615	hash = ctx->hash_map;
				8616	if (!hash) {
				8617	hash = kzalloc(sizeof(*hash), GFP_KERNEL);
				8618	if (!hash) {
				8619	mutex_unlock(&ctx->uring_lock);
				8620	return ERR_PTR(-ENOMEM);
				8621	}
				8622	refcount_set(&hash->refs, 1);
				8623	init_waitqueue_head(&hash->wait);
				8624	ctx->hash_map = hash;
				8625	}
				8626	mutex_unlock(&ctx->uring_lock);
				8627
				8628	data.hash = hash;
				8629	data.task = task;
				8630	data.free_work = io_wq_free_work;
				8631	data.do_work = io_wq_submit_work;
				8632
				8633	/* Do QD, or 4 * CPUS, whatever is smallest */
				8634	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
				8635
				8636	return io_wq_create(concurrency, &data);
				8637	}
				8638
				8639	static int io_uring_alloc_task_context(struct task_struct *task,
				8640	struct io_ring_ctx *ctx)
				8641	{
				8642	struct io_uring_task *tctx;
				8643	int ret;
				8644
				8645	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
				8646	if (unlikely(!tctx))
				8647	return -ENOMEM;
				8648
				8649	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
				8650	if (unlikely(ret)) {
				8651	kfree(tctx);
				8652	return ret;
				8653	}
				8654
				8655	tctx->io_wq = io_init_wq_offload(ctx, task);
				8656	if (IS_ERR(tctx->io_wq)) {
				8657	ret = PTR_ERR(tctx->io_wq);
				8658	percpu_counter_destroy(&tctx->inflight);
				8659	kfree(tctx);
				8660	return ret;
				8661	}
				8662
				8663	xa_init(&tctx->xa);
				8664	init_waitqueue_head(&tctx->wait);
				8665	atomic_set(&tctx->in_idle, 0);
				8666	atomic_set(&tctx->inflight_tracked, 0);
				8667	task->io_uring = tctx;
				8668	spin_lock_init(&tctx->task_lock);
				8669	INIT_WQ_LIST(&tctx->task_list);
				8670	init_task_work(&tctx->task_work, tctx_task_work);
				8671	return 0;
				8672	}
				8673
				8674	void __io_uring_free(struct task_struct *tsk)
				8675	{
				8676	struct io_uring_task *tctx = tsk->io_uring;
				8677
				8678	WARN_ON_ONCE(!xa_empty(&tctx->xa));
				8679	WARN_ON_ONCE(tctx->io_wq);
				8680	WARN_ON_ONCE(tctx->cached_refs);
				8681
				8682	percpu_counter_destroy(&tctx->inflight);
				8683	kfree(tctx);
				8684	tsk->io_uring = NULL;
				8685	}
				8686
				8687	static int io_sq_offload_create(struct io_ring_ctx *ctx,
				8688	struct io_uring_params *p)
				8689	{
				8690	int ret;
				8691
				8692	/* Retain compatibility with failing for an invalid attach attempt */
				8693	if ((ctx->flags & (IORING_SETUP_ATTACH_WQ \| IORING_SETUP_SQPOLL)) ==
				8694	IORING_SETUP_ATTACH_WQ) {
				8695	struct fd f;
				8696
				8697	f = fdget(p->wq_fd);
				8698	if (!f.file)
				8699	return -ENXIO;
				8700	if (f.file->f_op != &io_uring_fops) {
				8701	fdput(f);
				8702	return -EINVAL;
				8703	}
				8704	fdput(f);
				8705	}
				8706	if (ctx->flags & IORING_SETUP_SQPOLL) {
				8707	struct task_struct *tsk;
				8708	struct io_sq_data *sqd;
				8709	bool attached;
				8710
				8711	sqd = io_get_sq_data(p, &attached);
				8712	if (IS_ERR(sqd)) {
				8713	ret = PTR_ERR(sqd);
				8714	goto err;
				8715	}
				8716
				8717	ctx->sq_creds = get_current_cred();
				8718	ctx->sq_data = sqd;
				8719	ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
				8720	if (!ctx->sq_thread_idle)
				8721	ctx->sq_thread_idle = HZ;
				8722
				8723	io_sq_thread_park(sqd);
				8724	list_add(&ctx->sqd_list, &sqd->ctx_list);
				8725	io_sqd_update_thread_idle(sqd);
				8726	/* don't attach to a dying SQPOLL thread, would be racy */
				8727	ret = (attached && !sqd->thread) ? -ENXIO : 0;
				8728	io_sq_thread_unpark(sqd);
				8729
				8730	if (ret < 0)
				8731	goto err;
				8732	if (attached)
				8733	return 0;
				8734
				8735	if (p->flags & IORING_SETUP_SQ_AFF) {
				8736	int cpu = p->sq_thread_cpu;
				8737
				8738	ret = -EINVAL;
				8739	if (cpu >= nr_cpu_ids \|\| !cpu_online(cpu))
				8740	goto err_sqpoll;
				8741	sqd->sq_cpu = cpu;
				8742	} else {
				8743	sqd->sq_cpu = -1;
				8744	}
				8745
				8746	sqd->task_pid = current->pid;
				8747	sqd->task_tgid = current->tgid;
				8748	tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
				8749	if (IS_ERR(tsk)) {
				8750	ret = PTR_ERR(tsk);
				8751	goto err_sqpoll;
				8752	}
				8753
				8754	sqd->thread = tsk;
				8755	ret = io_uring_alloc_task_context(tsk, ctx);
				8756	wake_up_new_task(tsk);
				8757	if (ret)
				8758	goto err;
				8759	} else if (p->flags & IORING_SETUP_SQ_AFF) {
				8760	/* Can't have SQ_AFF without SQPOLL */
				8761	ret = -EINVAL;
				8762	goto err;
				8763	}
				8764
				8765	return 0;
				8766	err_sqpoll:
				8767	complete(&ctx->sq_data->exited);
				8768	err:
				8769	io_sq_thread_finish(ctx);
				8770	return ret;
				8771	}
				8772
				8773	static inline void __io_unaccount_mem(struct user_struct *user,
				8774	unsigned long nr_pages)
				8775	{
				8776	atomic_long_sub(nr_pages, &user->locked_vm);
				8777	}
				8778
				8779	static inline int __io_account_mem(struct user_struct *user,
				8780	unsigned long nr_pages)
				8781	{
				8782	unsigned long page_limit, cur_pages, new_pages;
				8783
				8784	/* Don't allow more pages than we can safely lock */
				8785	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
				8786
				8787	do {
				8788	cur_pages = atomic_long_read(&user->locked_vm);
				8789	new_pages = cur_pages + nr_pages;
				8790	if (new_pages > page_limit)
				8791	return -ENOMEM;
				8792	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
				8793	new_pages) != cur_pages);
				8794
				8795	return 0;
				8796	}
				8797
				8798	static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
				8799	{
				8800	if (ctx->user)
				8801	__io_unaccount_mem(ctx->user, nr_pages);
				8802
				8803	if (ctx->mm_account)
				8804	atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
				8805	}
				8806
				8807	static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
				8808	{
				8809	int ret;
				8810
				8811	if (ctx->user) {
				8812	ret = __io_account_mem(ctx->user, nr_pages);
				8813	if (ret)
				8814	return ret;
				8815	}
				8816
				8817	if (ctx->mm_account)
				8818	atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
				8819
				8820	return 0;
				8821	}
				8822
				/*
				 * Drop a reference on the compound page backing @ptr and free the page
				 * once that reference count reaches zero. A NULL @ptr is ignored.
				 */
				static void io_mem_free(void *ptr)
				{
					struct page *head;

					if (!ptr)
						return;

					head = virt_to_head_page(ptr);
					if (!put_page_testzero(head))
						return;
					free_compound_page(head);
				}
				8834
				8835	static void *io_mem_alloc(size_t size)
				8836	{
				8837	gfp_t gfp = GFP_KERNEL_ACCOUNT \| __GFP_ZERO \| __GFP_NOWARN \| __GFP_COMP;
				8838
				8839	return (void *) __get_free_pages(gfp, get_order(size));
				8840	}
				8841
				8842	static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
				8843	size_t *sq_offset)
				8844	{
				8845	struct io_rings *rings;
				8846	size_t off, sq_array_size;
				8847
				8848	off = struct_size(rings, cqes, cq_entries);
				8849	if (off == SIZE_MAX)
				8850	return SIZE_MAX;
				8851
				8852	#ifdef CONFIG_SMP
				8853	off = ALIGN(off, SMP_CACHE_BYTES);
				8854	if (off == 0)
				8855	return SIZE_MAX;
				8856	#endif
				8857
				8858	if (sq_offset)
				8859	*sq_offset = off;
				8860
				8861	sq_array_size = array_size(sizeof(u32), sq_entries);
				8862	if (sq_array_size == SIZE_MAX)
				8863	return SIZE_MAX;
				8864
				8865	if (check_add_overflow(off, sq_array_size, &off))
				8866	return SIZE_MAX;
				8867
				8868	return off;
				8869	}
				8870
				8871	static void io_buffer_unmap(struct io_ring_ctx ctx, struct io_mapped_ubuf *slot)
				8872	{
				8873	struct io_mapped_ubuf imu = slot;
				8874	unsigned int i;
				8875
				8876	if (imu != ctx->dummy_ubuf) {
				8877	for (i = 0; i < imu->nr_bvecs; i++)
				8878	unpin_user_page(imu->bvec[i].bv_page);
				8879	if (imu->acct_pages)
				8880	io_unaccount_mem(ctx, imu->acct_pages);
				8881	kvfree(imu);
				8882	}
				8883	*slot = NULL;
				8884	}
				8885
				8886	static void io_rsrc_buf_put(struct io_ring_ctx ctx, struct io_rsrc_put prsrc)
				8887	{
				8888	io_buffer_unmap(ctx, &prsrc->buf);
				8889	prsrc->buf = NULL;
				8890	}
				8891
				8892	static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
				8893	{
				8894	unsigned int i;
				8895
				8896	for (i = 0; i < ctx->nr_user_bufs; i++)
				8897	io_buffer_unmap(ctx, &ctx->user_bufs[i]);
				8898	kfree(ctx->user_bufs);
				8899	io_rsrc_data_free(ctx->buf_data);
				8900	ctx->user_bufs = NULL;
				8901	ctx->buf_data = NULL;
				8902	ctx->nr_user_bufs = 0;
				8903	}
				8904
				8905	static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
				8906	{
				8907	unsigned nr = ctx->nr_user_bufs;
				8908	int ret;
				8909
				8910	if (!ctx->buf_data)
				8911	return -ENXIO;
				8912
				8913	/*
				8914	* Quiesce may unlock ->uring_lock, and while it's not held
				8915	* prevent new requests using the table.
				8916	*/
				8917	ctx->nr_user_bufs = 0;
				8918	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
				8919	ctx->nr_user_bufs = nr;
				8920	if (!ret)
				8921	__io_sqe_buffers_unregister(ctx);
				8922	return ret;
				8923	}
				8924
				8925	static int io_copy_iov(struct io_ring_ctx ctx, struct iovec dst,
				8926	void __user *arg, unsigned index)
				8927	{
				8928	struct iovec __user *src;
				8929
				8930	#ifdef CONFIG_COMPAT
				8931	if (ctx->compat) {
				8932	struct compat_iovec __user *ciovs;
				8933	struct compat_iovec ciov;
				8934
				8935	ciovs = (struct compat_iovec __user *) arg;
				8936	if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
				8937	return -EFAULT;
				8938
				8939	dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
				8940	dst->iov_len = ciov.iov_len;
				8941	return 0;
				8942	}
				8943	#endif
				8944	src = (struct iovec __user *) arg;
				8945	if (copy_from_user(dst, &src[index], sizeof(*dst)))
				8946	return -EFAULT;
				8947	return 0;
				8948	}
				8949
				8950	/*
				8951	* Not super efficient, but this is just a registration time. And we do cache
				8952	* the last compound head, so generally we'll only do a full search if we don't
				8953	* match that one.
				8954	*
				8955	* We check if the given compound head page has already been accounted, to
				8956	* avoid double accounting it. This allows us to account the full size of the
				8957	* page, not just the constituent pages of a huge page.
				8958	*/
				8959	static bool headpage_already_acct(struct io_ring_ctx ctx, struct page *pages,
				8960	int nr_pages, struct page *hpage)
				8961	{
				8962	int i, j;
				8963
				8964	/* check current page array */
				8965	for (i = 0; i < nr_pages; i++) {
				8966	if (!PageCompound(pages[i]))
				8967	continue;
				8968	if (compound_head(pages[i]) == hpage)
				8969	return true;
				8970	}
				8971
				8972	/* check previously registered pages */
				8973	for (i = 0; i < ctx->nr_user_bufs; i++) {
				8974	struct io_mapped_ubuf *imu = ctx->user_bufs[i];
				8975
				8976	for (j = 0; j < imu->nr_bvecs; j++) {
				8977	if (!PageCompound(imu->bvec[j].bv_page))
				8978	continue;
				8979	if (compound_head(imu->bvec[j].bv_page) == hpage)
				8980	return true;
				8981	}
				8982	}
				8983
				8984	return false;
				8985	}
				8986
				8987	static int io_buffer_account_pin(struct io_ring_ctx ctx, struct page *pages,
				8988	int nr_pages, struct io_mapped_ubuf *imu,
				8989	struct page **last_hpage)
				8990	{
				8991	int i, ret;
				8992
				8993	imu->acct_pages = 0;
				8994	for (i = 0; i < nr_pages; i++) {
				8995	if (!PageCompound(pages[i])) {
				8996	imu->acct_pages++;
				8997	} else {
				8998	struct page *hpage;
				8999
				9000	hpage = compound_head(pages[i]);
				9001	if (hpage == *last_hpage)
				9002	continue;
				9003	*last_hpage = hpage;
				9004	if (headpage_already_acct(ctx, pages, i, hpage))
				9005	continue;
				9006	imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
				9007	}
				9008	}
				9009
				9010	if (!imu->acct_pages)
				9011	return 0;
				9012
				9013	ret = io_account_mem(ctx, imu->acct_pages);
				9014	if (ret)
				9015	imu->acct_pages = 0;
				9016	return ret;
				9017	}
				9018
				9019	static int io_sqe_buffer_register(struct io_ring_ctx ctx, struct iovec iov,
				9020	struct io_mapped_ubuf **pimu,
				9021	struct page **last_hpage)
				9022	{
				9023	struct io_mapped_ubuf *imu = NULL;
				9024	struct vm_area_struct **vmas = NULL;
				9025	struct page **pages = NULL;
				9026	unsigned long off, start, end, ubuf;
				9027	size_t size;
				9028	int ret, pret, nr_pages, i;
				9029
				9030	if (!iov->iov_base) {
				9031	*pimu = ctx->dummy_ubuf;
				9032	return 0;
				9033	}
				9034
				9035	ubuf = (unsigned long) iov->iov_base;
				9036	end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				9037	start = ubuf >> PAGE_SHIFT;
				9038	nr_pages = end - start;
				9039
				9040	*pimu = NULL;
				9041	ret = -ENOMEM;
				9042
				9043	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
				9044	if (!pages)
				9045	goto done;
				9046
				9047	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
				9048	GFP_KERNEL);
				9049	if (!vmas)
				9050	goto done;
				9051
				9052	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
				9053	if (!imu)
				9054	goto done;
				9055
				9056	ret = 0;
				9057	mmap_read_lock(current->mm);
				9058	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE \| FOLL_LONGTERM,
				9059	pages, vmas);
				9060	if (pret == nr_pages) {
				9061	/* don't support file backed memory */
				9062	for (i = 0; i < nr_pages; i++) {
				9063	struct vm_area_struct *vma = vmas[i];
				9064
				9065	if (vma_is_shmem(vma))
				9066	continue;
				9067	if (vma->vm_file &&
				9068	!is_file_hugepages(vma->vm_file)) {
				9069	ret = -EOPNOTSUPP;
				9070	break;
				9071	}
				9072	}
				9073	} else {
				9074	ret = pret < 0 ? pret : -EFAULT;
				9075	}
				9076	mmap_read_unlock(current->mm);
				9077	if (ret) {
				9078	/*
				9079	* if we did partial map, or found file backed vmas,
				9080	* release any pages we did get
				9081	*/
				9082	if (pret > 0)
				9083	unpin_user_pages(pages, pret);
				9084	goto done;
				9085	}
				9086
				9087	ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
				9088	if (ret) {
				9089	unpin_user_pages(pages, pret);
				9090	goto done;
				9091	}
				9092
				9093	off = ubuf & ~PAGE_MASK;
				9094	size = iov->iov_len;
				9095	for (i = 0; i < nr_pages; i++) {
				9096	size_t vec_len;
				9097
				9098	vec_len = min_t(size_t, size, PAGE_SIZE - off);
				9099	imu->bvec[i].bv_page = pages[i];
				9100	imu->bvec[i].bv_len = vec_len;
				9101	imu->bvec[i].bv_offset = off;
				9102	off = 0;
				9103	size -= vec_len;
				9104	}
				9105	/* store original address for later verification */
				9106	imu->ubuf = ubuf;
				9107	imu->ubuf_end = ubuf + iov->iov_len;
				9108	imu->nr_bvecs = nr_pages;
				9109	*pimu = imu;
				9110	ret = 0;
				9111	done:
				9112	if (ret)
				9113	kvfree(imu);
				9114	kvfree(pages);
				9115	kvfree(vmas);
				9116	return ret;
				9117	}
				9118
				9119	static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
				9120	{
				9121	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
				9122	return ctx->user_bufs ? 0 : -ENOMEM;
				9123	}
				9124
				9125	static int io_buffer_validate(struct iovec *iov)
				9126	{
				9127	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
				9128
				9129	/*
				9130	* Don't impose further limits on the size and buffer
				9131	* constraints here, we'll -EINVAL later when IO is
				9132	* submitted if they are wrong.
				9133	*/
				9134	if (!iov->iov_base)
				9135	return iov->iov_len ? -EFAULT : 0;
				9136	if (!iov->iov_len)
				9137	return -EFAULT;
				9138
				9139	/* arbitrary limit, but we need something */
				9140	if (iov->iov_len > SZ_1G)
				9141	return -EFAULT;
				9142
				9143	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
				9144	return -EOVERFLOW;
				9145
				9146	return 0;
				9147	}
				9148
				9149	static int io_sqe_buffers_register(struct io_ring_ctx ctx, void __user arg,
				9150	unsigned int nr_args, u64 __user *tags)
				9151	{
				9152	struct page *last_hpage = NULL;
				9153	struct io_rsrc_data *data;
				9154	int i, ret;
				9155	struct iovec iov;
				9156
				9157	if (ctx->user_bufs)
				9158	return -EBUSY;
				9159	if (!nr_args \|\| nr_args > IORING_MAX_REG_BUFFERS)
				9160	return -EINVAL;
				9161	ret = io_rsrc_node_switch_start(ctx);
				9162	if (ret)
				9163	return ret;
				9164	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
				9165	if (ret)
				9166	return ret;
				9167	ret = io_buffers_map_alloc(ctx, nr_args);
				9168	if (ret) {
				9169	io_rsrc_data_free(data);
				9170	return ret;
				9171	}
				9172
				9173	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
				9174	ret = io_copy_iov(ctx, &iov, arg, i);
				9175	if (ret)
				9176	break;
				9177	ret = io_buffer_validate(&iov);
				9178	if (ret)
				9179	break;
				9180	if (!iov.iov_base && *io_get_tag_slot(data, i)) {
				9181	ret = -EINVAL;
				9182	break;
				9183	}
				9184
				9185	ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
				9186	&last_hpage);
				9187	if (ret)
				9188	break;
				9189	}
				9190
				9191	WARN_ON_ONCE(ctx->buf_data);
				9192
				9193	ctx->buf_data = data;
				9194	if (ret)
				9195	__io_sqe_buffers_unregister(ctx);
				9196	else
				9197	io_rsrc_node_switch(ctx, NULL);
				9198	return ret;
				9199	}
				9200
				9201	static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				9202	struct io_uring_rsrc_update2 *up,
				9203	unsigned int nr_args)
				9204	{
				9205	u64 __user *tags = u64_to_user_ptr(up->tags);
				9206	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
				9207	struct page *last_hpage = NULL;
				9208	bool needs_switch = false;
				9209	__u32 done;
				9210	int i, err;
				9211
				9212	if (!ctx->buf_data)
				9213	return -ENXIO;
				9214	if (up->offset + nr_args > ctx->nr_user_bufs)
				9215	return -EINVAL;
				9216
				9217	for (done = 0; done < nr_args; done++) {
				9218	struct io_mapped_ubuf *imu;
				9219	int offset = up->offset + done;
				9220	u64 tag = 0;
				9221
				9222	err = io_copy_iov(ctx, &iov, iovs, done);
				9223	if (err)
				9224	break;
				9225	if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
				9226	err = -EFAULT;
				9227	break;
				9228	}
				9229	err = io_buffer_validate(&iov);
				9230	if (err)
				9231	break;
				9232	if (!iov.iov_base && tag) {
				9233	err = -EINVAL;
				9234	break;
				9235	}
				9236	err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
				9237	if (err)
				9238	break;
				9239
				9240	i = array_index_nospec(offset, ctx->nr_user_bufs);
				9241	if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
				9242	err = io_queue_rsrc_removal(ctx->buf_data, i,
				9243	ctx->rsrc_node, ctx->user_bufs[i]);
				9244	if (unlikely(err)) {
				9245	io_buffer_unmap(ctx, &imu);
				9246	break;
				9247	}
				9248	ctx->user_bufs[i] = NULL;
				9249	needs_switch = true;
				9250	}
				9251
				9252	ctx->user_bufs[i] = imu;
				9253	*io_get_tag_slot(ctx->buf_data, offset) = tag;
				9254	}
				9255
				9256	if (needs_switch)
				9257	io_rsrc_node_switch(ctx, ctx->buf_data);
				9258	return done ? done : err;
				9259	}
				9260
				9261	static int io_eventfd_register(struct io_ring_ctx ctx, void __user arg)
				9262	{
				9263	__s32 __user *fds = arg;
				9264	int fd;
				9265
				9266	if (ctx->cq_ev_fd)
				9267	return -EBUSY;
				9268
				9269	if (copy_from_user(&fd, fds, sizeof(*fds)))
				9270	return -EFAULT;
				9271
				9272	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
				9273	if (IS_ERR(ctx->cq_ev_fd)) {
				9274	int ret = PTR_ERR(ctx->cq_ev_fd);
				9275
				9276	ctx->cq_ev_fd = NULL;
				9277	return ret;
				9278	}
				9279
				9280	return 0;
				9281	}
				9282
				9283	static int io_eventfd_unregister(struct io_ring_ctx *ctx)
				9284	{
				9285	if (ctx->cq_ev_fd) {
				9286	eventfd_ctx_put(ctx->cq_ev_fd);
				9287	ctx->cq_ev_fd = NULL;
				9288	return 0;
				9289	}
				9290
				9291	return -ENXIO;
				9292	}
				9293
				9294	static void io_destroy_buffers(struct io_ring_ctx *ctx)
				9295	{
				9296	struct io_buffer *buf;
				9297	unsigned long index;
				9298
				9299	xa_for_each(&ctx->io_buffers, index, buf)
				9300	__io_remove_buffers(ctx, buf, index, -1U);
				9301	}
				9302
				9303	static void io_req_cache_free(struct list_head *list)
				9304	{
				9305	struct io_kiocb req, nxt;
				9306
				9307	list_for_each_entry_safe(req, nxt, list, inflight_entry) {
				9308	list_del(&req->inflight_entry);
				9309	kmem_cache_free(req_cachep, req);
				9310	}
				9311	}
				9312
				9313	static void io_req_caches_free(struct io_ring_ctx *ctx)
				9314	{
				9315	struct io_submit_state *state = &ctx->submit_state;
				9316
				9317	mutex_lock(&ctx->uring_lock);
				9318
				9319	if (state->free_reqs) {
				9320	kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
				9321	state->free_reqs = 0;
				9322	}
				9323
				9324	io_flush_cached_locked_reqs(ctx, state);
				9325	io_req_cache_free(&state->free_list);
				9326	mutex_unlock(&ctx->uring_lock);
				9327	}
				9328
				9329	static void io_wait_rsrc_data(struct io_rsrc_data *data)
				9330	{
				9331	if (data && !atomic_dec_and_test(&data->refs))
				9332	wait_for_completion(&data->done);
				9333	}
				9334
				9335	static void io_ring_ctx_free(struct io_ring_ctx *ctx)
				9336	{
				9337	io_sq_thread_finish(ctx);
				9338
				9339	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
				9340	io_wait_rsrc_data(ctx->buf_data);
				9341	io_wait_rsrc_data(ctx->file_data);
				9342
				9343	mutex_lock(&ctx->uring_lock);
				9344	if (ctx->buf_data)
				9345	__io_sqe_buffers_unregister(ctx);
				9346	if (ctx->file_data)
				9347	__io_sqe_files_unregister(ctx);
				9348	if (ctx->rings)
				9349	__io_cqring_overflow_flush(ctx, true);
				9350	mutex_unlock(&ctx->uring_lock);
				9351	io_eventfd_unregister(ctx);
				9352	io_destroy_buffers(ctx);
				9353	if (ctx->sq_creds)
				9354	put_cred(ctx->sq_creds);
				9355
				9356	/* there are no registered resources left, nobody uses it */
				9357	if (ctx->rsrc_node)
				9358	io_rsrc_node_destroy(ctx->rsrc_node);
				9359	if (ctx->rsrc_backup_node)
				9360	io_rsrc_node_destroy(ctx->rsrc_backup_node);
				9361	flush_delayed_work(&ctx->rsrc_put_work);
				9362
				9363	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
				9364	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
				9365
				9366	#if defined(CONFIG_UNIX)
				9367	if (ctx->ring_sock) {
				9368	ctx->ring_sock->file = NULL; /* so that iput() is called */
				9369	sock_release(ctx->ring_sock);
				9370	}
				9371	#endif
				9372	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
				9373
				9374	if (ctx->mm_account) {
				9375	mmdrop(ctx->mm_account);
				9376	ctx->mm_account = NULL;
				9377	}
				9378
				9379	io_mem_free(ctx->rings);
				9380	io_mem_free(ctx->sq_sqes);
				9381
				9382	percpu_ref_exit(&ctx->refs);
				9383	free_uid(ctx->user);
				9384	io_req_caches_free(ctx);
				9385	if (ctx->hash_map)
				9386	io_wq_put_hash(ctx->hash_map);
				9387	kfree(ctx->cancel_hash);
				9388	kfree(ctx->dummy_ubuf);
				9389	kfree(ctx);
				9390	}
				9391
				9392	static __poll_t io_uring_poll(struct file file, poll_table wait)
				9393	{
				9394	struct io_ring_ctx *ctx = file->private_data;
				9395	__poll_t mask = 0;
				9396
				9397	poll_wait(file, &ctx->poll_wait, wait);
				9398	/*
				9399	* synchronizes with barrier from wq_has_sleeper call in
				9400	* io_commit_cqring
				9401	*/
				9402	smp_rmb();
				9403	if (!io_sqring_full(ctx))
				9404	mask \|= EPOLLOUT \| EPOLLWRNORM;
				9405
				9406	/*
				9407	* Don't flush cqring overflow list here, just do a simple check.
				9408	* Otherwise there could possible be ABBA deadlock:
				9409	* CPU0 CPU1
				9410	* ---- ----
				9411	* lock(&ctx->uring_lock);
				9412	* lock(&ep->mtx);
				9413	* lock(&ctx->uring_lock);
				9414	* lock(&ep->mtx);
				9415	*
				9416	* Users may get EPOLLIN meanwhile seeing nothing in cqring, this
				9417	* pushs them to do the flush.
				9418	*/
				9419	if (io_cqring_events(ctx) \|\| test_bit(0, &ctx->check_cq_overflow))
				9420	mask \|= EPOLLIN \| EPOLLRDNORM;
				9421
				9422	return mask;
				9423	}
				9424
				9425	static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
				9426	{
				9427	const struct cred *creds;
				9428
				9429	creds = xa_erase(&ctx->personalities, id);
				9430	if (creds) {
				9431	put_cred(creds);
				9432	return 0;
				9433	}
				9434
				9435	return -EINVAL;
				9436	}
				9437
				9438	struct io_tctx_exit {
				9439	struct callback_head task_work;
				9440	struct completion completion;
				9441	struct io_ring_ctx *ctx;
				9442	};
				9443
				9444	static void io_tctx_exit_cb(struct callback_head *cb)
				9445	{
				9446	struct io_uring_task *tctx = current->io_uring;
				9447	struct io_tctx_exit *work;
				9448
				9449	work = container_of(cb, struct io_tctx_exit, task_work);
				9450	/*
				9451	* When @in_idle, we're in cancellation and it's racy to remove the
				9452	* node. It'll be removed by the end of cancellation, just ignore it.
				9453	* tctx can be NULL if the queueing of this task_work raced with
				9454	* work cancelation off the exec path.
				9455	*/
				9456	if (tctx && !atomic_read(&tctx->in_idle))
				9457	io_uring_del_tctx_node((unsigned long)work->ctx);
				9458	complete(&work->completion);
				9459	}
				9460
				9461	static bool io_cancel_ctx_cb(struct io_wq_work work, void data)
				9462	{
				9463	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
				9464
				9465	return req->ctx == data;
				9466	}
				9467
				9468	static void io_ring_exit_work(struct work_struct *work)
				9469	{
				9470	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
				9471	unsigned long timeout = jiffies + HZ * 60 * 5;
				9472	unsigned long interval = HZ / 20;
				9473	struct io_tctx_exit exit;
				9474	struct io_tctx_node *node;
				9475	int ret;
				9476
				9477	/*
				9478	* If we're doing polled IO and end up having requests being
				9479	* submitted async (out-of-line), then completions can come in while
				9480	* we're waiting for refs to drop. We need to reap these manually,
				9481	* as nobody else will be looking for them.
				9482	*/
				9483	do {
				9484	io_uring_try_cancel_requests(ctx, NULL, true);
				9485	if (ctx->sq_data) {
				9486	struct io_sq_data *sqd = ctx->sq_data;
				9487	struct task_struct *tsk;
				9488
				9489	io_sq_thread_park(sqd);
				9490	tsk = sqd->thread;
				9491	if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				9492	io_wq_cancel_cb(tsk->io_uring->io_wq,
				9493	io_cancel_ctx_cb, ctx, true);
				9494	io_sq_thread_unpark(sqd);
				9495	}
				9496
				9497	if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
				9498	/* there is little hope left, don't run it too often */
				9499	interval = HZ * 60;
				9500	}
				9501	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
				9502
				9503	init_completion(&exit.completion);
				9504	init_task_work(&exit.task_work, io_tctx_exit_cb);
				9505	exit.ctx = ctx;
				9506	/*
				9507	* Some may use context even when all refs and requests have been put,
				9508	* and they are free to do so while still holding uring_lock or
				9509	* completion_lock, see io_req_task_submit(). Apart from other work,
				9510	* this lock/unlock section also waits them to finish.
				9511	*/
				9512	mutex_lock(&ctx->uring_lock);
				9513	while (!list_empty(&ctx->tctx_list)) {
				9514	WARN_ON_ONCE(time_after(jiffies, timeout));
				9515
				9516	node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
				9517	ctx_node);
				9518	/* don't spin on a single task if cancellation failed */
				9519	list_rotate_left(&ctx->tctx_list);
				9520	ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
				9521	if (WARN_ON_ONCE(ret))
				9522	continue;
				9523	wake_up_process(node->task);
				9524
				9525	mutex_unlock(&ctx->uring_lock);
				9526	wait_for_completion(&exit.completion);
				9527	mutex_lock(&ctx->uring_lock);
				9528	}
				9529	mutex_unlock(&ctx->uring_lock);
				9530	spin_lock(&ctx->completion_lock);
				9531	spin_unlock(&ctx->completion_lock);
				9532
				9533	io_ring_ctx_free(ctx);
				9534	}
				9535
				9536	/* Returns true if we found and killed one or more timeouts */
				9537	static bool io_kill_timeouts(struct io_ring_ctx ctx, struct task_struct tsk,
				9538	bool cancel_all)
				9539	{
				9540	struct io_kiocb req, tmp;
				9541	int canceled = 0;
				9542
				9543	spin_lock(&ctx->completion_lock);
				9544	spin_lock_irq(&ctx->timeout_lock);
				9545	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
				9546	if (io_match_task(req, tsk, cancel_all)) {
				9547	io_kill_timeout(req, -ECANCELED);
				9548	canceled++;
				9549	}
				9550	}
				9551	spin_unlock_irq(&ctx->timeout_lock);
				9552	if (canceled != 0)
				9553	io_commit_cqring(ctx);
				9554	spin_unlock(&ctx->completion_lock);
				9555	if (canceled != 0)
				9556	io_cqring_ev_posted(ctx);
				9557	return canceled != 0;
				9558	}
				9559
				9560	static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
				9561	{
				9562	unsigned long index;
				9563	struct creds *creds;
				9564
				9565	mutex_lock(&ctx->uring_lock);
				9566	percpu_ref_kill(&ctx->refs);
				9567	if (ctx->rings)
				9568	__io_cqring_overflow_flush(ctx, true);
				9569	xa_for_each(&ctx->personalities, index, creds)
				9570	io_unregister_personality(ctx, index);
				9571	mutex_unlock(&ctx->uring_lock);
				9572
				9573	io_kill_timeouts(ctx, NULL, true);
				9574	io_poll_remove_all(ctx, NULL, true);
				9575
				9576	/* if we failed setting up the ctx, we might not have any rings */
				9577	io_iopoll_try_reap_events(ctx);
				9578
Jens Axboe	2e50949	2023-01-21 12:36:08 -0700	[diff] [blame]	9579	/* drop cached put refs after potentially doing completions */
				9580	if (current->io_uring)
				9581	io_uring_drop_tctx_refs(current);
				9582
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	9583	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
				9584	/*
				9585	* Use system_unbound_wq to avoid spawning tons of event kworkers
				9586	* if we're exiting a ton of rings at the same time. It just adds
				9587	* noise and overhead, there's no discernable change in runtime
				9588	* over using system_wq.
				9589	*/
				9590	queue_work(system_unbound_wq, &ctx->exit_work);
				9591	}
				9592
				9593	static int io_uring_release(struct inode inode, struct file file)
				9594	{
				9595	struct io_ring_ctx *ctx = file->private_data;
				9596
				9597	file->private_data = NULL;
				9598	io_ring_ctx_wait_and_kill(ctx);
				9599	return 0;
				9600	}
				9601
				9602	struct io_task_cancel {
				9603	struct task_struct *task;
				9604	bool all;
				9605	};
				9606
				9607	static bool io_cancel_task_cb(struct io_wq_work work, void data)
				9608	{
				9609	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
				9610	struct io_task_cancel *cancel = data;
				9611
				9612	return io_match_task_safe(req, cancel->task, cancel->all);
				9613	}
				9614
				9615	static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
				9616	struct task_struct *task, bool cancel_all)
				9617	{
				9618	struct io_defer_entry *de;
				9619	LIST_HEAD(list);
				9620
				9621	spin_lock(&ctx->completion_lock);
				9622	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
				9623	if (io_match_task_safe(de->req, task, cancel_all)) {
				9624	list_cut_position(&list, &ctx->defer_list, &de->list);
				9625	break;
				9626	}
				9627	}
				9628	spin_unlock(&ctx->completion_lock);
				9629	if (list_empty(&list))
				9630	return false;
				9631
				9632	while (!list_empty(&list)) {
				9633	de = list_first_entry(&list, struct io_defer_entry, list);
				9634	list_del_init(&de->list);
				9635	io_req_complete_failed(de->req, -ECANCELED);
				9636	kfree(de);
				9637	}
				9638	return true;
				9639	}
				9640
				9641	static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
				9642	{
				9643	struct io_tctx_node *node;
				9644	enum io_wq_cancel cret;
				9645	bool ret = false;
				9646
				9647	mutex_lock(&ctx->uring_lock);
				9648	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
				9649	struct io_uring_task *tctx = node->task->io_uring;
				9650
				9651	/*
				9652	* io_wq will stay alive while we hold uring_lock, because it's
				9653	* killed after ctx nodes, which requires to take the lock.
				9654	*/
				9655	if (!tctx \|\| !tctx->io_wq)
				9656	continue;
				9657	cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
				9658	ret \|= (cret != IO_WQ_CANCEL_NOTFOUND);
				9659	}
				9660	mutex_unlock(&ctx->uring_lock);
				9661
				9662	return ret;
				9663	}
				9664
				9665	static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
				9666	struct task_struct *task,
				9667	bool cancel_all)
				9668	{
				9669	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
				9670	struct io_uring_task *tctx = task ? task->io_uring : NULL;
				9671
				9672	while (1) {
				9673	enum io_wq_cancel cret;
				9674	bool ret = false;
				9675
				9676	if (!task) {
				9677	ret \|= io_uring_try_cancel_iowq(ctx);
				9678	} else if (tctx && tctx->io_wq) {
				9679	/*
				9680	* Cancels requests of all rings, not only @ctx, but
				9681	* it's fine as the task is in exit/exec.
				9682	*/
				9683	cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
				9684	&cancel, true);
				9685	ret \|= (cret != IO_WQ_CANCEL_NOTFOUND);
				9686	}
				9687
				9688	/* SQPOLL thread does its own polling */
				9689	if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) \|\|
				9690	(ctx->sq_data && ctx->sq_data->thread == current)) {
				9691	while (!list_empty_careful(&ctx->iopoll_list)) {
				9692	io_iopoll_try_reap_events(ctx);
				9693	ret = true;
				9694	}
				9695	}
				9696
				9697	ret \|= io_cancel_defer_files(ctx, task, cancel_all);
				9698	ret \|= io_poll_remove_all(ctx, task, cancel_all);
				9699	ret \|= io_kill_timeouts(ctx, task, cancel_all);
				9700	if (task)
				9701	ret \|= io_run_task_work();
				9702	if (!ret)
				9703	break;
				9704	cond_resched();
				9705	}
				9706	}
				9707
				9708	static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
				9709	{
				9710	struct io_uring_task *tctx = current->io_uring;
				9711	struct io_tctx_node *node;
				9712	int ret;
				9713
				9714	if (unlikely(!tctx)) {
				9715	ret = io_uring_alloc_task_context(current, ctx);
				9716	if (unlikely(ret))
				9717	return ret;
				9718
				9719	tctx = current->io_uring;
				9720	if (ctx->iowq_limits_set) {
				9721	unsigned int limits[2] = { ctx->iowq_limits[0],
				9722	ctx->iowq_limits[1], };
				9723
				9724	ret = io_wq_max_workers(tctx->io_wq, limits);
				9725	if (ret)
				9726	return ret;
				9727	}
				9728	}
				9729	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
				9730	node = kmalloc(sizeof(*node), GFP_KERNEL);
				9731	if (!node)
				9732	return -ENOMEM;
				9733	node->ctx = ctx;
				9734	node->task = current;
				9735
				9736	ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
				9737	node, GFP_KERNEL));
				9738	if (ret) {
				9739	kfree(node);
				9740	return ret;
				9741	}
				9742
				9743	mutex_lock(&ctx->uring_lock);
				9744	list_add(&node->ctx_node, &ctx->tctx_list);
				9745	mutex_unlock(&ctx->uring_lock);
				9746	}
				9747	tctx->last = ctx;
				9748	return 0;
				9749	}
				9750
				9751	/*
				9752	* Note that this task has used io_uring. We use it for cancelation purposes.
				9753	*/
				9754	static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
				9755	{
				9756	struct io_uring_task *tctx = current->io_uring;
				9757
				9758	if (likely(tctx && tctx->last == ctx))
				9759	return 0;
				9760	return __io_uring_add_tctx_node(ctx);
				9761	}
				9762
				9763	/*
				9764	* Remove this io_uring_file -> task mapping.
				9765	*/
				9766	static void io_uring_del_tctx_node(unsigned long index)
				9767	{
				9768	struct io_uring_task *tctx = current->io_uring;
				9769	struct io_tctx_node *node;
				9770
				9771	if (!tctx)
				9772	return;
				9773	node = xa_erase(&tctx->xa, index);
				9774	if (!node)
				9775	return;
				9776
				9777	WARN_ON_ONCE(current != node->task);
				9778	WARN_ON_ONCE(list_empty(&node->ctx_node));
				9779
				9780	mutex_lock(&node->ctx->uring_lock);
				9781	list_del(&node->ctx_node);
				9782	mutex_unlock(&node->ctx->uring_lock);
				9783
				9784	if (tctx->last == node->ctx)
				9785	tctx->last = NULL;
				9786	kfree(node);
				9787	}
				9788
				9789	static void io_uring_clean_tctx(struct io_uring_task *tctx)
				9790	{
				9791	struct io_wq *wq = tctx->io_wq;
				9792	struct io_tctx_node *node;
				9793	unsigned long index;
				9794
				9795	xa_for_each(&tctx->xa, index, node) {
				9796	io_uring_del_tctx_node(index);
				9797	cond_resched();
				9798	}
				9799	if (wq) {
				9800	/*
				9801	* Must be after io_uring_del_task_file() (removes nodes under
				9802	* uring_lock) to avoid race with io_uring_try_cancel_iowq().
				9803	*/
				9804	io_wq_put_and_exit(wq);
				9805	tctx->io_wq = NULL;
				9806	}
				9807	}
				9808
				9809	static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
				9810	{
				9811	if (tracked)
				9812	return atomic_read(&tctx->inflight_tracked);
				9813	return percpu_counter_sum(&tctx->inflight);
				9814	}
				9815
				9816	/*
				9817	* Find any io_uring ctx that this task has registered or done IO on, and cancel
				9818	* requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
				9819	*/
				9820	static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
				9821	{
				9822	struct io_uring_task *tctx = current->io_uring;
				9823	struct io_ring_ctx *ctx;
				9824	s64 inflight;
				9825	DEFINE_WAIT(wait);
				9826
				9827	WARN_ON_ONCE(sqd && sqd->thread != current);
				9828
				9829	if (!current->io_uring)
				9830	return;
				9831	if (tctx->io_wq)
				9832	io_wq_exit_start(tctx->io_wq);
				9833
				9834	atomic_inc(&tctx->in_idle);
				9835	do {
				9836	io_uring_drop_tctx_refs(current);
				9837	/* read completions before cancelations */
				9838	inflight = tctx_inflight(tctx, !cancel_all);
				9839	if (!inflight)
				9840	break;
				9841
				9842	if (!sqd) {
				9843	struct io_tctx_node *node;
				9844	unsigned long index;
				9845
				9846	xa_for_each(&tctx->xa, index, node) {
				9847	/* sqpoll task will cancel all its requests */
				9848	if (node->ctx->sq_data)
				9849	continue;
				9850	io_uring_try_cancel_requests(node->ctx, current,
				9851	cancel_all);
				9852	}
				9853	} else {
				9854	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				9855	io_uring_try_cancel_requests(ctx, current,
				9856	cancel_all);
				9857	}
				9858
				9859	prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
				9860	io_run_task_work();
				9861	io_uring_drop_tctx_refs(current);
				9862
				9863	/*
				9864	* If we've seen completions, retry without waiting. This
				9865	* avoids a race where a completion comes in before we did
				9866	* prepare_to_wait().
				9867	*/
				9868	if (inflight == tctx_inflight(tctx, !cancel_all))
				9869	schedule();
				9870	finish_wait(&tctx->wait, &wait);
				9871	} while (1);
				9872
				9873	io_uring_clean_tctx(tctx);
				9874	if (cancel_all) {
				9875	/*
				9876	* We shouldn't run task_works after cancel, so just leave
				9877	* ->in_idle set for normal exit.
				9878	*/
				9879	atomic_dec(&tctx->in_idle);
				9880	/* for exec all current's requests should be gone, kill tctx */
				9881	__io_uring_free(current);
				9882	}
				9883	}
				9884
				9885	void __io_uring_cancel(bool cancel_all)
				9886	{
				9887	io_uring_cancel_generic(cancel_all, NULL);
				9888	}
				9889
				9890	static void io_uring_validate_mmap_request(struct file file,
				9891	loff_t pgoff, size_t sz)
				9892	{
				9893	struct io_ring_ctx *ctx = file->private_data;
				9894	loff_t offset = pgoff << PAGE_SHIFT;
				9895	struct page *page;
				9896	void *ptr;
				9897
				9898	switch (offset) {
				9899	case IORING_OFF_SQ_RING:
				9900	case IORING_OFF_CQ_RING:
				9901	ptr = ctx->rings;
				9902	break;
				9903	case IORING_OFF_SQES:
				9904	ptr = ctx->sq_sqes;
				9905	break;
				9906	default:
				9907	return ERR_PTR(-EINVAL);
				9908	}
				9909
				9910	page = virt_to_head_page(ptr);
				9911	if (sz > page_size(page))
				9912	return ERR_PTR(-EINVAL);
				9913
				9914	return ptr;
				9915	}
				9916
				9917	#ifdef CONFIG_MMU
				9918
				9919	static int io_uring_mmap(struct file file, struct vm_area_struct vma)
				9920	{
				9921	size_t sz = vma->vm_end - vma->vm_start;
				9922	unsigned long pfn;
				9923	void *ptr;
				9924
				9925	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
				9926	if (IS_ERR(ptr))
				9927	return PTR_ERR(ptr);
				9928
				9929	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
				9930	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
				9931	}
				9932
				9933	#else /* !CONFIG_MMU */
				9934
				9935	static int io_uring_mmap(struct file file, struct vm_area_struct vma)
				9936	{
				9937	return vma->vm_flags & (VM_SHARED \| VM_MAYSHARE) ? 0 : -EINVAL;
				9938	}
				9939
				9940	static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
				9941	{
				9942	return NOMMU_MAP_DIRECT \| NOMMU_MAP_READ \| NOMMU_MAP_WRITE;
				9943	}
				9944
				/*
				 * ->get_unmapped_area() handler (!CONFIG_MMU): the "mapping" is simply
				 * the kernel address of the validated buffer. On a validation failure
				 * the error code is returned in place of an address.
				 */
				static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
					unsigned long addr, unsigned long len,
					unsigned long pgoff, unsigned long flags)
				{
					void *buf = io_uring_validate_mmap_request(file, pgoff, len);

					if (!IS_ERR(buf))
						return (unsigned long) buf;

					return PTR_ERR(buf);
				}
				9957
				9958	#endif /* !CONFIG_MMU */
				9959
				9960	static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
				9961	{
				9962	DEFINE_WAIT(wait);
				9963
				9964	do {
				9965	if (!io_sqring_full(ctx))
				9966	break;
				9967	prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
				9968
				9969	if (!io_sqring_full(ctx))
				9970	break;
				9971	schedule();
				9972	} while (!signal_pending(current));
				9973
				9974	finish_wait(&ctx->sqo_sq_wait, &wait);
				9975	return 0;
				9976	}
				9977
				9978	static int io_get_ext_arg(unsigned flags, const void __user argp, size_t argsz,
				9979	struct __kernel_timespec __user **ts,
				9980	const sigset_t __user **sig)
				9981	{
				9982	struct io_uring_getevents_arg arg;
				9983
				9984	/*
				9985	* If EXT_ARG isn't set, then we have no timespec and the argp pointer
				9986	* is just a pointer to the sigset_t.
				9987	*/
				9988	if (!(flags & IORING_ENTER_EXT_ARG)) {
				9989	sig = (const sigset_t __user ) argp;
				9990	*ts = NULL;
				9991	return 0;
				9992	}
				9993
				9994	/*
				9995	* EXT_ARG is set - ensure we agree on the size of it and copy in our
				9996	* timespec and sigset_t pointers if good.
				9997	*/
				9998	if (*argsz != sizeof(arg))
				9999	return -EINVAL;
				10000	if (copy_from_user(&arg, argp, sizeof(arg)))
				10001	return -EFAULT;
				10002	if (arg.pad)
				10003	return -EINVAL;
				10004	*sig = u64_to_user_ptr(arg.sigmask);
				10005	*argsz = arg.sigmask_sz;
				10006	*ts = u64_to_user_ptr(arg.ts);
				10007	return 0;
				10008	}
				10009
				10010	SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
				10011	u32, min_complete, u32, flags, const void __user *, argp,
				10012	size_t, argsz)
				10013	{
				10014	struct io_ring_ctx *ctx;
				10015	int submitted = 0;
				10016	struct fd f;
				10017	long ret;
				10018
				10019	io_run_task_work();
				10020
				10021	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS \| IORING_ENTER_SQ_WAKEUP \|
				10022	IORING_ENTER_SQ_WAIT \| IORING_ENTER_EXT_ARG)))
				10023	return -EINVAL;
				10024
				10025	f = fdget(fd);
				10026	if (unlikely(!f.file))
				10027	return -EBADF;
				10028
				10029	ret = -EOPNOTSUPP;
				10030	if (unlikely(f.file->f_op != &io_uring_fops))
				10031	goto out_fput;
				10032
				10033	ret = -ENXIO;
				10034	ctx = f.file->private_data;
				10035	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
				10036	goto out_fput;
				10037
				10038	ret = -EBADFD;
				10039	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
				10040	goto out;
				10041
				10042	/*
				10043	* For SQ polling, the thread will do all submissions and completions.
				10044	* Just return the requested submit count, and wake the thread if
				10045	* we were asked to.
				10046	*/
				10047	ret = 0;
				10048	if (ctx->flags & IORING_SETUP_SQPOLL) {
				10049	io_cqring_overflow_flush(ctx);
				10050
				10051	if (unlikely(ctx->sq_data->thread == NULL)) {
				10052	ret = -EOWNERDEAD;
				10053	goto out;
				10054	}
				10055	if (flags & IORING_ENTER_SQ_WAKEUP)
				10056	wake_up(&ctx->sq_data->wait);
				10057	if (flags & IORING_ENTER_SQ_WAIT) {
				10058	ret = io_sqpoll_wait_sq(ctx);
				10059	if (ret)
				10060	goto out;
				10061	}
				10062	submitted = to_submit;
				10063	} else if (to_submit) {
				10064	ret = io_uring_add_tctx_node(ctx);
				10065	if (unlikely(ret))
				10066	goto out;
				10067	mutex_lock(&ctx->uring_lock);
				10068	submitted = io_submit_sqes(ctx, to_submit);
				10069	mutex_unlock(&ctx->uring_lock);
				10070
				10071	if (submitted != to_submit)
				10072	goto out;
				10073	}
				10074	if (flags & IORING_ENTER_GETEVENTS) {
				10075	const sigset_t __user *sig;
				10076	struct __kernel_timespec __user *ts;
				10077
				10078	ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
				10079	if (unlikely(ret))
				10080	goto out;
				10081
				10082	min_complete = min(min_complete, ctx->cq_entries);
				10083
				10084	/*
				10085	* When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
				10086	* space applications don't need to do io completion events
				10087	* polling again, they can rely on io_sq_thread to do polling
				10088	* work, which can reduce cpu usage and uring_lock contention.
				10089	*/
				10090	if (ctx->flags & IORING_SETUP_IOPOLL &&
				10091	!(ctx->flags & IORING_SETUP_SQPOLL)) {
				10092	ret = io_iopoll_check(ctx, min_complete);
				10093	} else {
				10094	ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
				10095	}
				10096	}
				10097
				10098	out:
				10099	percpu_ref_put(&ctx->refs);
				10100	out_fput:
				10101	fdput(f);
				10102	return submitted ? submitted : ret;
				10103	}
				10104
				10105	#ifdef CONFIG_PROC_FS
				10106	static int io_uring_show_cred(struct seq_file *m, unsigned int id,
				10107	const struct cred *cred)
				10108	{
				10109	struct user_namespace *uns = seq_user_ns(m);
				10110	struct group_info *gi;
				10111	kernel_cap_t cap;
				10112	unsigned __capi;
				10113	int g;
				10114
				10115	seq_printf(m, "%5d\n", id);
				10116	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
				10117	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
				10118	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
				10119	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
				10120	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
				10121	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
				10122	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
				10123	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
				10124	seq_puts(m, "\n\tGroups:\t");
				10125	gi = cred->group_info;
				10126	for (g = 0; g < gi->ngroups; g++) {
				10127	seq_put_decimal_ull(m, g ? " " : "",
				10128	from_kgid_munged(uns, gi->gid[g]));
				10129	}
				10130	seq_puts(m, "\n\tCapEff:\t");
				10131	cap = cred->cap_effective;
				10132	CAP_FOR_EACH_U32(__capi)
				10133	seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
				10134	seq_putc(m, '\n');
				10135	return 0;
				10136	}
				10137
				10138	static void __io_uring_show_fdinfo(struct io_ring_ctx ctx, struct seq_file m)
				10139	{
				10140	struct io_sq_data *sq = NULL;
				10141	bool has_lock;
				10142	int i;
				10143
				10144	/*
				10145	* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
				10146	* since fdinfo case grabs it in the opposite direction of normal use
				10147	* cases. If we fail to get the lock, we just don't iterate any
				10148	* structures that could be going away outside the io_uring mutex.
				10149	*/
				10150	has_lock = mutex_trylock(&ctx->uring_lock);
				10151
				10152	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
				10153	sq = ctx->sq_data;
				10154	if (!sq->thread)
				10155	sq = NULL;
				10156	}
				10157
				10158	seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
				10159	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
				10160	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
				10161	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
				10162	struct file *f = io_file_from_index(ctx, i);
				10163
				10164	if (f)
				10165	seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
				10166	else
				10167	seq_printf(m, "%5u: <none>\n", i);
				10168	}
				10169	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
				10170	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
				10171	struct io_mapped_ubuf *buf = ctx->user_bufs[i];
				10172	unsigned int len = buf->ubuf_end - buf->ubuf;
				10173
				10174	seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
				10175	}
				10176	if (has_lock && !xa_empty(&ctx->personalities)) {
				10177	unsigned long index;
				10178	const struct cred *cred;
				10179
				10180	seq_printf(m, "Personalities:\n");
				10181	xa_for_each(&ctx->personalities, index, cred)
				10182	io_uring_show_cred(m, index, cred);
				10183	}
				10184	seq_printf(m, "PollList:\n");
				10185	spin_lock(&ctx->completion_lock);
				10186	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
				10187	struct hlist_head *list = &ctx->cancel_hash[i];
				10188	struct io_kiocb *req;
				10189
				10190	hlist_for_each_entry(req, list, hash_node)
				10191	seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
				10192	req->task->task_works != NULL);
				10193	}
				10194	spin_unlock(&ctx->completion_lock);
				10195	if (has_lock)
				10196	mutex_unlock(&ctx->uring_lock);
				10197	}
				10198
				10199	static void io_uring_show_fdinfo(struct seq_file m, struct file f)
				10200	{
				10201	struct io_ring_ctx *ctx = f->private_data;
				10202
				10203	if (percpu_ref_tryget(&ctx->refs)) {
				10204	__io_uring_show_fdinfo(ctx, m);
				10205	percpu_ref_put(&ctx->refs);
				10206	}
				10207	}
				10208	#endif
				10209
				10210	static const struct file_operations io_uring_fops = {
				10211	.release = io_uring_release,
				10212	.mmap = io_uring_mmap,
				10213	#ifndef CONFIG_MMU
				10214	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
				10215	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
				10216	#endif
				10217	.poll = io_uring_poll,
				10218	#ifdef CONFIG_PROC_FS
				10219	.show_fdinfo = io_uring_show_fdinfo,
				10220	#endif
				10221	};
				10222
				10223	static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
				10224	struct io_uring_params *p)
				10225	{
				10226	struct io_rings *rings;
				10227	size_t size, sq_array_offset;
				10228
				10229	/* make sure these are sane, as we already accounted them */
				10230	ctx->sq_entries = p->sq_entries;
				10231	ctx->cq_entries = p->cq_entries;
				10232
				10233	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
				10234	if (size == SIZE_MAX)
				10235	return -EOVERFLOW;
				10236
				10237	rings = io_mem_alloc(size);
				10238	if (!rings)
				10239	return -ENOMEM;
				10240
				10241	ctx->rings = rings;
				10242	ctx->sq_array = (u32 )((char )rings + sq_array_offset);
				10243	rings->sq_ring_mask = p->sq_entries - 1;
				10244	rings->cq_ring_mask = p->cq_entries - 1;
				10245	rings->sq_ring_entries = p->sq_entries;
				10246	rings->cq_ring_entries = p->cq_entries;
				10247
				10248	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
				10249	if (size == SIZE_MAX) {
				10250	io_mem_free(ctx->rings);
				10251	ctx->rings = NULL;
				10252	return -EOVERFLOW;
				10253	}
				10254
				10255	ctx->sq_sqes = io_mem_alloc(size);
				10256	if (!ctx->sq_sqes) {
				10257	io_mem_free(ctx->rings);
				10258	ctx->rings = NULL;
				10259	return -ENOMEM;
				10260	}
				10261
				10262	return 0;
				10263	}
				10264
				10265	static int io_uring_install_fd(struct io_ring_ctx ctx, struct file file)
				10266	{
				10267	int ret, fd;
				10268
				10269	fd = get_unused_fd_flags(O_RDWR \| O_CLOEXEC);
				10270	if (fd < 0)
				10271	return fd;
				10272
				10273	ret = io_uring_add_tctx_node(ctx);
				10274	if (ret) {
				10275	put_unused_fd(fd);
				10276	return ret;
				10277	}
				10278	fd_install(fd, file);
				10279	return fd;
				10280	}
				10281
				10282	/*
				10283	* Allocate an anonymous fd, this is what constitutes the application
				10284	* visible backing of an io_uring instance. The application mmaps this
				10285	* fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
				10286	* we have to tie this fd to a socket for file garbage collection purposes.
				10287	*/
				10288	static struct file io_uring_get_file(struct io_ring_ctx ctx)
				10289	{
				10290	struct file *file;
				10291	#if defined(CONFIG_UNIX)
				10292	int ret;
				10293
				10294	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
				10295	&ctx->ring_sock);
				10296	if (ret)
				10297	return ERR_PTR(ret);
				10298	#endif
				10299
				10300	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
				10301	O_RDWR \| O_CLOEXEC);
				10302	#if defined(CONFIG_UNIX)
				10303	if (IS_ERR(file)) {
				10304	sock_release(ctx->ring_sock);
				10305	ctx->ring_sock = NULL;
				10306	} else {
				10307	ctx->ring_sock->file = file;
				10308	}
				10309	#endif
				10310	return file;
				10311	}
				10312
				10313	static int io_uring_create(unsigned entries, struct io_uring_params *p,
				10314	struct io_uring_params __user *params)
				10315	{
				10316	struct io_ring_ctx *ctx;
				10317	struct file *file;
				10318	int ret;
				10319
				10320	if (!entries)
				10321	return -EINVAL;
				10322	if (entries > IORING_MAX_ENTRIES) {
				10323	if (!(p->flags & IORING_SETUP_CLAMP))
				10324	return -EINVAL;
				10325	entries = IORING_MAX_ENTRIES;
				10326	}
				10327
				10328	/*
				10329	* Use twice as many entries for the CQ ring. It's possible for the
				10330	* application to drive a higher depth than the size of the SQ ring,
				10331	* since the sqes are only used at submission time. This allows for
				10332	* some flexibility in overcommitting a bit. If the application has
				10333	* set IORING_SETUP_CQSIZE, it will have passed in the desired number
				10334	* of CQ ring entries manually.
				10335	*/
				10336	p->sq_entries = roundup_pow_of_two(entries);
				10337	if (p->flags & IORING_SETUP_CQSIZE) {
				10338	/*
				10339	* If IORING_SETUP_CQSIZE is set, we do the same roundup
				10340	* to a power-of-two, if it isn't already. We do NOT impose
				10341	* any cq vs sq ring sizing.
				10342	*/
				10343	if (!p->cq_entries)
				10344	return -EINVAL;
				10345	if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
				10346	if (!(p->flags & IORING_SETUP_CLAMP))
				10347	return -EINVAL;
				10348	p->cq_entries = IORING_MAX_CQ_ENTRIES;
				10349	}
				10350	p->cq_entries = roundup_pow_of_two(p->cq_entries);
				10351	if (p->cq_entries < p->sq_entries)
				10352	return -EINVAL;
				10353	} else {
				10354	p->cq_entries = 2 * p->sq_entries;
				10355	}
				10356
				10357	ctx = io_ring_ctx_alloc(p);
				10358	if (!ctx)
				10359	return -ENOMEM;
				10360	ctx->compat = in_compat_syscall();
				10361	if (!capable(CAP_IPC_LOCK))
				10362	ctx->user = get_uid(current_user());
				10363
				10364	/*
				10365	* This is just grabbed for accounting purposes. When a process exits,
				10366	* the mm is exited and dropped before the files, hence we need to hang
				10367	* on to this mm purely for the purposes of being able to unaccount
				10368	* memory (locked/pinned vm). It's not used for anything else.
				10369	*/
				10370	mmgrab(current->mm);
				10371	ctx->mm_account = current->mm;
				10372
				10373	ret = io_allocate_scq_urings(ctx, p);
				10374	if (ret)
				10375	goto err;
				10376
				10377	ret = io_sq_offload_create(ctx, p);
				10378	if (ret)
				10379	goto err;
				10380	/* always set a rsrc node */
				10381	ret = io_rsrc_node_switch_start(ctx);
				10382	if (ret)
				10383	goto err;
				10384	io_rsrc_node_switch(ctx, NULL);
				10385
				10386	memset(&p->sq_off, 0, sizeof(p->sq_off));
				10387	p->sq_off.head = offsetof(struct io_rings, sq.head);
				10388	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
				10389	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
				10390	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
				10391	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
				10392	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
				10393	p->sq_off.array = (char )ctx->sq_array - (char )ctx->rings;
				10394
				10395	memset(&p->cq_off, 0, sizeof(p->cq_off));
				10396	p->cq_off.head = offsetof(struct io_rings, cq.head);
				10397	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
				10398	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
				10399	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
				10400	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
				10401	p->cq_off.cqes = offsetof(struct io_rings, cqes);
				10402	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
				10403
				10404	p->features = IORING_FEAT_SINGLE_MMAP \| IORING_FEAT_NODROP \|
				10405	IORING_FEAT_SUBMIT_STABLE \| IORING_FEAT_RW_CUR_POS \|
				10406	IORING_FEAT_CUR_PERSONALITY \| IORING_FEAT_FAST_POLL \|
				10407	IORING_FEAT_POLL_32BITS \| IORING_FEAT_SQPOLL_NONFIXED \|
				10408	IORING_FEAT_EXT_ARG \| IORING_FEAT_NATIVE_WORKERS \|
				10409	IORING_FEAT_RSRC_TAGS;
				10410
				10411	if (copy_to_user(params, p, sizeof(*p))) {
				10412	ret = -EFAULT;
				10413	goto err;
				10414	}
				10415
				10416	file = io_uring_get_file(ctx);
				10417	if (IS_ERR(file)) {
				10418	ret = PTR_ERR(file);
				10419	goto err;
				10420	}
				10421
				10422	/*
				10423	* Install ring fd as the very last thing, so we don't risk someone
				10424	* having closed it before we finish setup
				10425	*/
				10426	ret = io_uring_install_fd(ctx, file);
				10427	if (ret < 0) {
				10428	/* fput will clean it up */
				10429	fput(file);
				10430	return ret;
				10431	}
				10432
				10433	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
				10434	return ret;
				10435	err:
				10436	io_ring_ctx_wait_and_kill(ctx);
				10437	return ret;
				10438	}
				10439
				10440	/*
				10441	* Sets up an aio uring context, and returns the fd. Applications asks for a
				10442	* ring size, we return the actual sq/cq ring sizes (among other things) in the
				10443	* params structure passed in.
				10444	*/
				10445	static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
				10446	{
				10447	struct io_uring_params p;
				10448	int i;
				10449
				10450	if (copy_from_user(&p, params, sizeof(p)))
				10451	return -EFAULT;
				10452	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
				10453	if (p.resv[i])
				10454	return -EINVAL;
				10455	}
				10456
				10457	if (p.flags & ~(IORING_SETUP_IOPOLL \| IORING_SETUP_SQPOLL \|
				10458	IORING_SETUP_SQ_AFF \| IORING_SETUP_CQSIZE \|
				10459	IORING_SETUP_CLAMP \| IORING_SETUP_ATTACH_WQ \|
				10460	IORING_SETUP_R_DISABLED))
				10461	return -EINVAL;
				10462
				10463	return io_uring_create(entries, &p, params);
				10464	}
				10465
				10466	SYSCALL_DEFINE2(io_uring_setup, u32, entries,
				10467	struct io_uring_params __user *, params)
				10468	{
				10469	return io_uring_setup(entries, params);
				10470	}
				10471
				10472	static int io_probe(struct io_ring_ctx ctx, void __user arg, unsigned nr_args)
				10473	{
				10474	struct io_uring_probe *p;
				10475	size_t size;
				10476	int i, ret;
				10477
				10478	size = struct_size(p, ops, nr_args);
				10479	if (size == SIZE_MAX)
				10480	return -EOVERFLOW;
				10481	p = kzalloc(size, GFP_KERNEL);
				10482	if (!p)
				10483	return -ENOMEM;
				10484
				10485	ret = -EFAULT;
				10486	if (copy_from_user(p, arg, size))
				10487	goto out;
				10488	ret = -EINVAL;
				10489	if (memchr_inv(p, 0, size))
				10490	goto out;
				10491
				10492	p->last_op = IORING_OP_LAST - 1;
				10493	if (nr_args > IORING_OP_LAST)
				10494	nr_args = IORING_OP_LAST;
				10495
				10496	for (i = 0; i < nr_args; i++) {
				10497	p->ops[i].op = i;
				10498	if (!io_op_defs[i].not_supported)
				10499	p->ops[i].flags = IO_URING_OP_SUPPORTED;
				10500	}
				10501	p->ops_len = i;
				10502
				10503	ret = 0;
				10504	if (copy_to_user(arg, p, size))
				10505	ret = -EFAULT;
				10506	out:
				10507	kfree(p);
				10508	return ret;
				10509	}
				10510
				10511	static int io_register_personality(struct io_ring_ctx *ctx)
				10512	{
				10513	const struct cred *creds;
				10514	u32 id;
				10515	int ret;
				10516
				10517	creds = get_current_cred();
				10518
				10519	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
				10520	XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
				10521	if (ret < 0) {
				10522	put_cred(creds);
				10523	return ret;
				10524	}
				10525	return id;
				10526	}
				10527
				10528	static int io_register_restrictions(struct io_ring_ctx ctx, void __user arg,
				10529	unsigned int nr_args)
				10530	{
				10531	struct io_uring_restriction *res;
				10532	size_t size;
				10533	int i, ret;
				10534
				10535	/* Restrictions allowed only if rings started disabled */
				10536	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
				10537	return -EBADFD;
				10538
				10539	/* We allow only a single restrictions registration */
				10540	if (ctx->restrictions.registered)
				10541	return -EBUSY;
				10542
				10543	if (!arg \|\| nr_args > IORING_MAX_RESTRICTIONS)
				10544	return -EINVAL;
				10545
				10546	size = array_size(nr_args, sizeof(*res));
				10547	if (size == SIZE_MAX)
				10548	return -EOVERFLOW;
				10549
				10550	res = memdup_user(arg, size);
				10551	if (IS_ERR(res))
				10552	return PTR_ERR(res);
				10553
				10554	ret = 0;
				10555
				10556	for (i = 0; i < nr_args; i++) {
				10557	switch (res[i].opcode) {
				10558	case IORING_RESTRICTION_REGISTER_OP:
				10559	if (res[i].register_op >= IORING_REGISTER_LAST) {
				10560	ret = -EINVAL;
				10561	goto out;
				10562	}
				10563
				10564	__set_bit(res[i].register_op,
				10565	ctx->restrictions.register_op);
				10566	break;
				10567	case IORING_RESTRICTION_SQE_OP:
				10568	if (res[i].sqe_op >= IORING_OP_LAST) {
				10569	ret = -EINVAL;
				10570	goto out;
				10571	}
				10572
				10573	__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
				10574	break;
				10575	case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
				10576	ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
				10577	break;
				10578	case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
				10579	ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
				10580	break;
				10581	default:
				10582	ret = -EINVAL;
				10583	goto out;
				10584	}
				10585	}
				10586
				10587	out:
				10588	/* Reset all restrictions if an error happened */
				10589	if (ret != 0)
				10590	memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
				10591	else
				10592	ctx->restrictions.registered = true;
				10593
				10594	kfree(res);
				10595	return ret;
				10596	}
				10597
				10598	static int io_register_enable_rings(struct io_ring_ctx *ctx)
				10599	{
				10600	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
				10601	return -EBADFD;
				10602
				10603	if (ctx->restrictions.registered)
				10604	ctx->restricted = 1;
				10605
				10606	ctx->flags &= ~IORING_SETUP_R_DISABLED;
				10607	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
				10608	wake_up(&ctx->sq_data->wait);
				10609	return 0;
				10610	}
				10611
				10612	static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				10613	struct io_uring_rsrc_update2 *up,
				10614	unsigned nr_args)
				10615	{
				10616	__u32 tmp;
				10617	int err;
				10618
				10619	if (check_add_overflow(up->offset, nr_args, &tmp))
				10620	return -EOVERFLOW;
				10621	err = io_rsrc_node_switch_start(ctx);
				10622	if (err)
				10623	return err;
				10624
				10625	switch (type) {
				10626	case IORING_RSRC_FILE:
				10627	return __io_sqe_files_update(ctx, up, nr_args);
				10628	case IORING_RSRC_BUFFER:
				10629	return __io_sqe_buffers_update(ctx, up, nr_args);
				10630	}
				10631	return -EINVAL;
				10632	}
				10633
				10634	static int io_register_files_update(struct io_ring_ctx ctx, void __user arg,
				10635	unsigned nr_args)
				10636	{
				10637	struct io_uring_rsrc_update2 up;
				10638
				10639	if (!nr_args)
				10640	return -EINVAL;
				10641	memset(&up, 0, sizeof(up));
				10642	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
				10643	return -EFAULT;
				10644	if (up.resv \|\| up.resv2)
				10645	return -EINVAL;
				10646	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
				10647	}
				10648
				10649	static int io_register_rsrc_update(struct io_ring_ctx ctx, void __user arg,
				10650	unsigned size, unsigned type)
				10651	{
				10652	struct io_uring_rsrc_update2 up;
				10653
				10654	if (size != sizeof(up))
				10655	return -EINVAL;
				10656	if (copy_from_user(&up, arg, sizeof(up)))
				10657	return -EFAULT;
				10658	if (!up.nr \|\| up.resv \|\| up.resv2)
				10659	return -EINVAL;
				10660	return __io_register_rsrc_update(ctx, type, &up, up.nr);
				10661	}
				10662
				10663	static int io_register_rsrc(struct io_ring_ctx ctx, void __user arg,
				10664	unsigned int size, unsigned int type)
				10665	{
				10666	struct io_uring_rsrc_register rr;
				10667
				10668	/* keep it extendible */
				10669	if (size != sizeof(rr))
				10670	return -EINVAL;
				10671
				10672	memset(&rr, 0, sizeof(rr));
				10673	if (copy_from_user(&rr, arg, size))
				10674	return -EFAULT;
				10675	if (!rr.nr \|\| rr.resv \|\| rr.resv2)
				10676	return -EINVAL;
				10677
				10678	switch (type) {
				10679	case IORING_RSRC_FILE:
				10680	return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
				10681	rr.nr, u64_to_user_ptr(rr.tags));
				10682	case IORING_RSRC_BUFFER:
				10683	return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
				10684	rr.nr, u64_to_user_ptr(rr.tags));
				10685	}
				10686	return -EINVAL;
				10687	}
				10688
				10689	static int io_register_iowq_aff(struct io_ring_ctx ctx, void __user arg,
				10690	unsigned len)
				10691	{
				10692	struct io_uring_task *tctx = current->io_uring;
				10693	cpumask_var_t new_mask;
				10694	int ret;
				10695
				10696	if (!tctx \|\| !tctx->io_wq)
				10697	return -EINVAL;
				10698
				10699	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
				10700	return -ENOMEM;
				10701
				10702	cpumask_clear(new_mask);
				10703	if (len > cpumask_size())
				10704	len = cpumask_size();
				10705
				10706	#ifdef CONFIG_COMPAT
				10707	if (in_compat_syscall()) {
				10708	ret = compat_get_bitmap(cpumask_bits(new_mask),
				10709	(const compat_ulong_t __user *)arg,
				10710	len * 8 /* CHAR_BIT */);
				10711	} else {
				10712	ret = copy_from_user(new_mask, arg, len);
				10713	}
				10714	#else
				10715	ret = copy_from_user(new_mask, arg, len);
				10716	#endif
				10717
				10718	if (ret) {
				10719	free_cpumask_var(new_mask);
				10720	return -EFAULT;
				10721	}
				10722
				10723	ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
				10724	free_cpumask_var(new_mask);
				10725	return ret;
				10726	}
				10727
				10728	static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
				10729	{
				10730	struct io_uring_task *tctx = current->io_uring;
				10731
				10732	if (!tctx \|\| !tctx->io_wq)
				10733	return -EINVAL;
				10734
				10735	return io_wq_cpu_affinity(tctx->io_wq, NULL);
				10736	}
				10737
				10738	static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
				10739	void __user *arg)
				10740	__must_hold(&ctx->uring_lock)
				10741	{
				10742	struct io_tctx_node *node;
				10743	struct io_uring_task *tctx = NULL;
				10744	struct io_sq_data *sqd = NULL;
				10745	__u32 new_count[2];
				10746	int i, ret;
				10747
				10748	if (copy_from_user(new_count, arg, sizeof(new_count)))
				10749	return -EFAULT;
				10750	for (i = 0; i < ARRAY_SIZE(new_count); i++)
				10751	if (new_count[i] > INT_MAX)
				10752	return -EINVAL;
				10753
				10754	if (ctx->flags & IORING_SETUP_SQPOLL) {
				10755	sqd = ctx->sq_data;
				10756	if (sqd) {
				10757	/*
				10758	* Observe the correct sqd->lock -> ctx->uring_lock
				10759	* ordering. Fine to drop uring_lock here, we hold
				10760	* a ref to the ctx.
				10761	*/
				10762	refcount_inc(&sqd->refs);
				10763	mutex_unlock(&ctx->uring_lock);
				10764	mutex_lock(&sqd->lock);
				10765	mutex_lock(&ctx->uring_lock);
				10766	if (sqd->thread)
				10767	tctx = sqd->thread->io_uring;
				10768	}
				10769	} else {
				10770	tctx = current->io_uring;
				10771	}
				10772
				10773	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
				10774
				10775	for (i = 0; i < ARRAY_SIZE(new_count); i++)
				10776	if (new_count[i])
				10777	ctx->iowq_limits[i] = new_count[i];
				10778	ctx->iowq_limits_set = true;
				10779
				10780	ret = -EINVAL;
				10781	if (tctx && tctx->io_wq) {
				10782	ret = io_wq_max_workers(tctx->io_wq, new_count);
				10783	if (ret)
				10784	goto err;
				10785	} else {
				10786	memset(new_count, 0, sizeof(new_count));
				10787	}
				10788
				10789	if (sqd) {
				10790	mutex_unlock(&sqd->lock);
				10791	io_put_sq_data(sqd);
				10792	}
				10793
				10794	if (copy_to_user(arg, new_count, sizeof(new_count)))
				10795	return -EFAULT;
				10796
				10797	/* that's it for SQPOLL, only the SQPOLL task creates requests */
				10798	if (sqd)
				10799	return 0;
				10800
				10801	/* now propagate the restriction to all registered users */
				10802	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
				10803	struct io_uring_task *tctx = node->task->io_uring;
				10804
				10805	if (WARN_ON_ONCE(!tctx->io_wq))
				10806	continue;
				10807
				10808	for (i = 0; i < ARRAY_SIZE(new_count); i++)
				10809	new_count[i] = ctx->iowq_limits[i];
				10810	/* ignore errors, it always returns zero anyway */
				10811	(void)io_wq_max_workers(tctx->io_wq, new_count);
				10812	}
				10813	return 0;
				10814	err:
				10815	if (sqd) {
				10816	mutex_unlock(&sqd->lock);
				10817	io_put_sq_data(sqd);
				10818	}
				10819	return ret;
				10820	}
				10821
				10822	static bool io_register_op_must_quiesce(int op)
				10823	{
				10824	switch (op) {
				10825	case IORING_REGISTER_BUFFERS:
				10826	case IORING_UNREGISTER_BUFFERS:
				10827	case IORING_REGISTER_FILES:
				10828	case IORING_UNREGISTER_FILES:
				10829	case IORING_REGISTER_FILES_UPDATE:
				10830	case IORING_REGISTER_PROBE:
				10831	case IORING_REGISTER_PERSONALITY:
				10832	case IORING_UNREGISTER_PERSONALITY:
				10833	case IORING_REGISTER_FILES2:
				10834	case IORING_REGISTER_FILES_UPDATE2:
				10835	case IORING_REGISTER_BUFFERS2:
				10836	case IORING_REGISTER_BUFFERS_UPDATE:
				10837	case IORING_REGISTER_IOWQ_AFF:
				10838	case IORING_UNREGISTER_IOWQ_AFF:
				10839	case IORING_REGISTER_IOWQ_MAX_WORKERS:
				10840	return false;
				10841	default:
				10842	return true;
				10843	}
				10844	}
				10845
				10846	static int io_ctx_quiesce(struct io_ring_ctx *ctx)
				10847	{
				10848	long ret;
				10849
				10850	percpu_ref_kill(&ctx->refs);
				10851
				10852	/*
				10853	* Drop uring mutex before waiting for references to exit. If another
				10854	* thread is currently inside io_uring_enter() it might need to grab the
				10855	* uring_lock to make progress. If we hold it here across the drain
				10856	* wait, then we can deadlock. It's safe to drop the mutex here, since
				10857	* no new references will come in after we've killed the percpu ref.
				10858	*/
				10859	mutex_unlock(&ctx->uring_lock);
				10860	do {
				10861	ret = wait_for_completion_interruptible(&ctx->ref_comp);
				10862	if (!ret)
				10863	break;
				10864	ret = io_run_task_work_sig();
				10865	} while (ret >= 0);
				10866	mutex_lock(&ctx->uring_lock);
				10867
				10868	if (ret)
				10869	io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
				10870	return ret;
				10871	}
				10872
				10873	static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
				10874	void __user *arg, unsigned nr_args)
				10875	__releases(ctx->uring_lock)
				10876	__acquires(ctx->uring_lock)
				10877	{
				10878	int ret;
				10879
				10880	/*
				10881	* We're inside the ring mutex, if the ref is already dying, then
				10882	* someone else killed the ctx or is already going through
				10883	* io_uring_register().
				10884	*/
				10885	if (percpu_ref_is_dying(&ctx->refs))
				10886	return -ENXIO;
				10887
				10888	if (ctx->restricted) {
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	10889	opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
				10890	if (!test_bit(opcode, ctx->restrictions.register_op))
				10891	return -EACCES;
				10892	}
				10893
				10894	if (io_register_op_must_quiesce(opcode)) {
				10895	ret = io_ctx_quiesce(ctx);
				10896	if (ret)
				10897	return ret;
				10898	}
				10899
				10900	switch (opcode) {
				10901	case IORING_REGISTER_BUFFERS:
				10902	ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
				10903	break;
				10904	case IORING_UNREGISTER_BUFFERS:
				10905	ret = -EINVAL;
				10906	if (arg \|\| nr_args)
				10907	break;
				10908	ret = io_sqe_buffers_unregister(ctx);
				10909	break;
				10910	case IORING_REGISTER_FILES:
				10911	ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
				10912	break;
				10913	case IORING_UNREGISTER_FILES:
				10914	ret = -EINVAL;
				10915	if (arg \|\| nr_args)
				10916	break;
				10917	ret = io_sqe_files_unregister(ctx);
				10918	break;
				10919	case IORING_REGISTER_FILES_UPDATE:
				10920	ret = io_register_files_update(ctx, arg, nr_args);
				10921	break;
				10922	case IORING_REGISTER_EVENTFD:
				10923	case IORING_REGISTER_EVENTFD_ASYNC:
				10924	ret = -EINVAL;
				10925	if (nr_args != 1)
				10926	break;
				10927	ret = io_eventfd_register(ctx, arg);
				10928	if (ret)
				10929	break;
				10930	if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
				10931	ctx->eventfd_async = 1;
				10932	else
				10933	ctx->eventfd_async = 0;
				10934	break;
				10935	case IORING_UNREGISTER_EVENTFD:
				10936	ret = -EINVAL;
				10937	if (arg \|\| nr_args)
				10938	break;
				10939	ret = io_eventfd_unregister(ctx);
				10940	break;
				10941	case IORING_REGISTER_PROBE:
				10942	ret = -EINVAL;
				10943	if (!arg \|\| nr_args > 256)
				10944	break;
				10945	ret = io_probe(ctx, arg, nr_args);
				10946	break;
				10947	case IORING_REGISTER_PERSONALITY:
				10948	ret = -EINVAL;
				10949	if (arg \|\| nr_args)
				10950	break;
				10951	ret = io_register_personality(ctx);
				10952	break;
				10953	case IORING_UNREGISTER_PERSONALITY:
				10954	ret = -EINVAL;
				10955	if (arg)
				10956	break;
				10957	ret = io_unregister_personality(ctx, nr_args);
				10958	break;
				10959	case IORING_REGISTER_ENABLE_RINGS:
				10960	ret = -EINVAL;
				10961	if (arg \|\| nr_args)
				10962	break;
				10963	ret = io_register_enable_rings(ctx);
				10964	break;
				10965	case IORING_REGISTER_RESTRICTIONS:
				10966	ret = io_register_restrictions(ctx, arg, nr_args);
				10967	break;
				10968	case IORING_REGISTER_FILES2:
				10969	ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
				10970	break;
				10971	case IORING_REGISTER_FILES_UPDATE2:
				10972	ret = io_register_rsrc_update(ctx, arg, nr_args,
				10973	IORING_RSRC_FILE);
				10974	break;
				10975	case IORING_REGISTER_BUFFERS2:
				10976	ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
				10977	break;
				10978	case IORING_REGISTER_BUFFERS_UPDATE:
				10979	ret = io_register_rsrc_update(ctx, arg, nr_args,
				10980	IORING_RSRC_BUFFER);
				10981	break;
				10982	case IORING_REGISTER_IOWQ_AFF:
				10983	ret = -EINVAL;
				10984	if (!arg \|\| !nr_args)
				10985	break;
				10986	ret = io_register_iowq_aff(ctx, arg, nr_args);
				10987	break;
				10988	case IORING_UNREGISTER_IOWQ_AFF:
				10989	ret = -EINVAL;
				10990	if (arg \|\| nr_args)
				10991	break;
				10992	ret = io_unregister_iowq_aff(ctx);
				10993	break;
				10994	case IORING_REGISTER_IOWQ_MAX_WORKERS:
				10995	ret = -EINVAL;
				10996	if (!arg \|\| nr_args != 2)
				10997	break;
				10998	ret = io_register_iowq_max_workers(ctx, arg);
				10999	break;
				11000	default:
				11001	ret = -EINVAL;
				11002	break;
				11003	}
				11004
				11005	if (io_register_op_must_quiesce(opcode)) {
				11006	/* bring the ctx back to life */
				11007	percpu_ref_reinit(&ctx->refs);
				11008	reinit_completion(&ctx->ref_comp);
				11009	}
				11010	return ret;
				11011	}
				11012
				11013	SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
				11014	void __user *, arg, unsigned int, nr_args)
				11015	{
				11016	struct io_ring_ctx *ctx;
				11017	long ret = -EBADF;
				11018	struct fd f;
				11019
Jens Axboe	673831e	2022-12-23 06:37:08 -0700	[diff] [blame]	11020	if (opcode >= IORING_REGISTER_LAST)
				11021	return -EINVAL;
				11022
Jens Axboe	76050cd	2022-12-22 14:30:11 -0700	[diff] [blame]	11023	f = fdget(fd);
				11024	if (!f.file)
				11025	return -EBADF;
				11026
				11027	ret = -EOPNOTSUPP;
				11028	if (f.file->f_op != &io_uring_fops)
				11029	goto out_fput;
				11030
				11031	ctx = f.file->private_data;
				11032
				11033	io_run_task_work();
				11034
				11035	mutex_lock(&ctx->uring_lock);
				11036	ret = __io_uring_register(ctx, opcode, arg, nr_args);
				11037	mutex_unlock(&ctx->uring_lock);
				11038	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
				11039	ctx->cq_ev_fd != NULL, ret);
				11040	out_fput:
				11041	fdput(f);
				11042	return ret;
				11043	}
				11044
				11045	static int __init io_uring_init(void)
				11046	{
				11047	#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
				11048	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
				11049	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
				11050	} while (0)
				11051
				11052	#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
				11053	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
				11054	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
				11055	BUILD_BUG_SQE_ELEM(0, __u8, opcode);
				11056	BUILD_BUG_SQE_ELEM(1, __u8, flags);
				11057	BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
				11058	BUILD_BUG_SQE_ELEM(4, __s32, fd);
				11059	BUILD_BUG_SQE_ELEM(8, __u64, off);
				11060	BUILD_BUG_SQE_ELEM(8, __u64, addr2);
				11061	BUILD_BUG_SQE_ELEM(16, __u64, addr);
				11062	BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
				11063	BUILD_BUG_SQE_ELEM(24, __u32, len);
				11064	BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
				11065	BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
				11066	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
				11067	BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
				11068	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
				11069	BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
				11070	BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
				11071	BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
				11072	BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
				11073	BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
				11074	BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
				11075	BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
				11076	BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
				11077	BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
				11078	BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
				11079	BUILD_BUG_SQE_ELEM(32, __u64, user_data);
				11080	BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
				11081	BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
				11082	BUILD_BUG_SQE_ELEM(42, __u16, personality);
				11083	BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
				11084	BUILD_BUG_SQE_ELEM(44, __u32, file_index);
				11085
				11086	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
				11087	sizeof(struct io_uring_rsrc_update));
				11088	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
				11089	sizeof(struct io_uring_rsrc_update2));
				11090
				11091	/* ->buf_index is u16 */
				11092	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
				11093
				11094	/* should fit into one byte */
				11095	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
				11096
				11097	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
				11098	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
				11099
				11100	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN \| SLAB_PANIC \|
				11101	SLAB_ACCOUNT);
				11102	return 0;
				11103	};
				11104	__initcall(io_uring_init);