diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 6d3ee71bd832..841579dcdae9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -415,6 +415,13 @@ struct io_ring_ctx { /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + /* + * Protection for resize vs mmap races - both the mmap and resize + * side will need to grab this lock, to prevent either side from + * being run concurrently with the other. + */ + struct mutex resize_lock; + /* * If IORING_SETUP_NO_MMAP is used, then the below holds * the gup'ed pages for the two rings, and the sqes. diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 86cb385fe0b5..60b9c98595fa 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -615,6 +615,11 @@ enum io_uring_register_op { /* send MSG_RING without having a ring */ IORING_REGISTER_SEND_MSG_RING = 31, + /* 32 reserved for zc rx */ + + /* resize CQ ring */ + IORING_REGISTER_RESIZE_RINGS = 33, + /* this goes last */ IORING_REGISTER_LAST, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b5974bdad48b..140cd47fbdb3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -353,6 +353,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); io_napi_init(ctx); + mutex_init(&ctx->resize_lock); return ctx; diff --git a/io_uring/memmap.c b/io_uring/memmap.c index d614824e17bd..85c66fa54956 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -251,6 +251,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) unsigned int npages; void *ptr; + guard(mutex)(&ctx->resize_lock); + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) return PTR_ERR(ptr); @@ -274,6 +276,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct io_ring_ctx *ctx = filp->private_data; void *ptr; /* @@ -284,6 +287,8 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) return -EINVAL; + guard(mutex)(&ctx->resize_lock); + ptr = io_uring_validate_mmap_request(filp, pgoff, len); if (IS_ERR(ptr)) return -ENOMEM; @@ -329,8 +334,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct io_ring_ctx *ctx = file->private_data; void *ptr; + guard(mutex)(&ctx->resize_lock); + ptr = io_uring_validate_mmap_request(file, pgoff, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); diff --git a/io_uring/register.c b/io_uring/register.c index 52b2f9b74af8..fc6c94d694b2 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -29,6 +29,7 @@ #include "napi.h" #include "eventfd.h" #include "msg_ring.h" +#include "memmap.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -361,6 +362,214 @@ static int io_register_clock(struct io_ring_ctx *ctx, return 0; } +/* + * State to maintain until we can swap. Both new and old state, used for + * either mapping or freeing. + */ +struct io_ring_ctx_rings { + unsigned short n_ring_pages; + unsigned short n_sqe_pages; + struct page **ring_pages; + struct page **sqe_pages; + struct io_uring_sqe *sq_sqes; + struct io_rings *rings; +}; + +static void io_register_free_rings(struct io_uring_params *p, + struct io_ring_ctx_rings *r) +{ + if (!(p->flags & IORING_SETUP_NO_MMAP)) { + io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, + true); + io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages, + true); + } else { + io_pages_free(&r->ring_pages, r->n_ring_pages); + io_pages_free(&r->sqe_pages, r->n_sqe_pages); + vunmap(r->rings); + vunmap(r->sq_sqes); + } +} + +#define swap_old(ctx, o, n, field) \ + do { \ + (o).field = (ctx)->field; \ + (ctx)->field = (n).field; \ + } while (0) + +#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) +#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) + +static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; + size_t size, sq_array_offset; + struct io_uring_params p; + unsigned i, tail; + void *ptr; + int ret; + + /* for single issuer, must be owner resizing */ + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && + current != ctx->submitter_task) + return -EEXIST; + if (copy_from_user(&p, arg, sizeof(p))) + return -EFAULT; + if (p.flags & ~RESIZE_FLAGS) + return -EINVAL; + + /* properties that are always inherited */ + p.flags |= (ctx->flags & COPY_FLAGS); + + ret = io_uring_fill_params(p.sq_entries, &p); + if (unlikely(ret)) + return ret; + + /* nothing to do, but copy params back */ + if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) { + if (copy_to_user(arg, &p, sizeof(p))) + return -EFAULT; + return 0; + } + + size = rings_size(p.flags, p.sq_entries, p.cq_entries, + &sq_array_offset); + if (size == SIZE_MAX) + return -EOVERFLOW; + + if (!(p.flags & IORING_SETUP_NO_MMAP)) + n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size); + else + n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages, + p.cq_off.user_addr, size); + if (IS_ERR(n.rings)) + return PTR_ERR(n.rings); + + n.rings->sq_ring_mask = p.sq_entries - 1; + n.rings->cq_ring_mask = p.cq_entries - 1; + n.rings->sq_ring_entries = p.sq_entries; + n.rings->cq_ring_entries = p.cq_entries; + + if (copy_to_user(arg, &p, sizeof(p))) { + io_register_free_rings(&p, &n); + return -EFAULT; + } + + if (p.flags & IORING_SETUP_SQE128) + size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); + else + size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); + if (size == SIZE_MAX) { + io_register_free_rings(&p, &n); + return -EOVERFLOW; + } + + if (!(p.flags & IORING_SETUP_NO_MMAP)) + ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size); + else + ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages, + p.sq_off.user_addr, + size); + if (IS_ERR(ptr)) { + io_register_free_rings(&p, &n); + return PTR_ERR(ptr); + } + + /* + * If using SQPOLL, park the thread + */ + if (ctx->sq_data) { + mutex_unlock(&ctx->uring_lock); + io_sq_thread_park(ctx->sq_data); + mutex_lock(&ctx->uring_lock); + } + + /* + * We'll do the swap. Grab the ctx->resize_lock, which will exclude + * any new mmap's on the ring fd. Clear out existing mappings to prevent + * mmap from seeing them, as we'll unmap them. Any attempt to mmap + * existing rings beyond this point will fail. Not that it could proceed + * at this point anyway, as the io_uring mmap side needs go grab the + * ctx->resize_lock as well. Likewise, hold the completion lock over the + * duration of the actual swap. + */ + mutex_lock(&ctx->resize_lock); + spin_lock(&ctx->completion_lock); + o.rings = ctx->rings; + ctx->rings = NULL; + o.sq_sqes = ctx->sq_sqes; + ctx->sq_sqes = NULL; + + /* + * Now copy SQ and CQ entries, if any. If either of the destination + * rings can't hold what is already there, then fail the operation. + */ + n.sq_sqes = ptr; + tail = o.rings->sq.tail; + if (tail - o.rings->sq.head > p.sq_entries) + goto overflow; + for (i = o.rings->sq.head; i < tail; i++) { + unsigned src_head = i & (ctx->sq_entries - 1); + unsigned dst_head = i & n.rings->sq_ring_mask; + + n.sq_sqes[dst_head] = o.sq_sqes[src_head]; + } + n.rings->sq.head = o.rings->sq.head; + n.rings->sq.tail = o.rings->sq.tail; + + tail = o.rings->cq.tail; + if (tail - o.rings->cq.head > p.cq_entries) { +overflow: + /* restore old rings, and return -EOVERFLOW via cleanup path */ + ctx->rings = o.rings; + ctx->sq_sqes = o.sq_sqes; + to_free = &n; + ret = -EOVERFLOW; + goto out; + } + for (i = o.rings->cq.head; i < tail; i++) { + unsigned src_head = i & (ctx->cq_entries - 1); + unsigned dst_head = i & n.rings->cq_ring_mask; + + n.rings->cqes[dst_head] = o.rings->cqes[src_head]; + } + n.rings->cq.head = o.rings->cq.head; + n.rings->cq.tail = o.rings->cq.tail; + /* invalidate cached cqe refill */ + ctx->cqe_cached = ctx->cqe_sentinel = NULL; + + n.rings->sq_dropped = o.rings->sq_dropped; + n.rings->sq_flags = o.rings->sq_flags; + n.rings->cq_flags = o.rings->cq_flags; + n.rings->cq_overflow = o.rings->cq_overflow; + + /* all done, store old pointers and assign new ones */ + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); + + ctx->sq_entries = p.sq_entries; + ctx->cq_entries = p.cq_entries; + + ctx->rings = n.rings; + ctx->sq_sqes = n.sq_sqes; + swap_old(ctx, o, n, n_ring_pages); + swap_old(ctx, o, n, n_sqe_pages); + swap_old(ctx, o, n, ring_pages); + swap_old(ctx, o, n, sqe_pages); + to_free = &o; + ret = 0; +out: + spin_unlock(&ctx->completion_lock); + mutex_unlock(&ctx->resize_lock); + io_register_free_rings(&p, to_free); + + if (ctx->sq_data) + io_sq_thread_unpark(ctx->sq_data); + + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -549,6 +758,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_clone_buffers(ctx, arg); break; + case IORING_REGISTER_RESIZE_RINGS: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_resize_rings(ctx, arg); + break; default: ret = -EINVAL; break;