mirror of
https://kernel.googlesource.com/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-22 00:14:18 +03:00
for-6.14/io_uring-20250119
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmeNDEUQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpl5hD/4t7kWWNQDeQG9CiA3QStMJ5Yow2AgYtK8f sJBr5/6PGEsbTreX//Kh8DtPZPRGcjG9elCo58QxWaPZ2mg3fTOR3/QYLMlaGXU2 hSht58lj32utpuzMjMo9bG3aesi03bLf+buaq7V1FaMlcTV8rXqK1s/HGtphDBRo 8tNLEk3JDJDs3vlWbNp/5Hqh9+Ro6DU8df1zWWH4Vbu8RXaGIPyJyjKvvcbfuuCf k7Ay45XNAmTZg+rSNGv1H3Yn1LNzPMVFLWBfzRahPCzlKy2+mJMWz1PWu9naaUK+ WTM+kgiBLF24k59G/9xuxC5bYtsTjTbr4GsEE5ZvFBnhKPzLzzaJj7iQHRj83vtv tqxNmAbA3wJoNk48Zr8+cYbfDX9Q9Pl32wIaS/LxRgF9MT4lem6pyKY7Skd12oK3 rnQ8moGtnOBxp3QUU6BZ7IX3ipb+Bgw7FhZbtVYJdlqKeKyi1QO0MuITwGXpMwk/ EWDDTsspIf+QaTu+fmO8byJavugKljW8t7hM1JpvlfOLl+rsh6/+AYz42fCvcaA0 Tu4bpUk8SuwALvZfU2R6bLkorGG6MFuGI8g3eixOcGir3YAcHBMfdg6ItpZi5qVt ToM87BMaezOZZvSwX1JBaQ0AR5HBQYmHaiLWgPsORf3PjJ0kz+u21SK9D+yJkUtU rT6+HvoVXA== =ufpE -----END PGP SIGNATURE----- Merge tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux Pull io_uring updates from Jens Axboe: "Not a lot in terms of features this time around, mostly just cleanups and code consolidation: - Support for PI meta data read/write via io_uring, with NVMe and SCSI covered - Cleanup the per-op structure caching, making it consistent across various command types - Consolidate the various user mapped features into a concept called regions, making the various users of that consistent - Various cleanups and fixes" * tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux: (56 commits) io_uring/fdinfo: fix io_uring_show_fdinfo() misuse of ->d_iname io_uring: reuse io_should_terminate_tw() for cmds io_uring: Factor out a function to parse restrictions io_uring/rsrc: require cloned buffers to share accounting contexts io_uring: simplify the SQPOLL thread check when cancelling requests io_uring: expose read/write attribute capability io_uring/rw: don't gate retry on completion context io_uring/rw: handle -EAGAIN retry at IO completion time io_uring/rw: use io_rw_recycle() from cleanup path io_uring/rsrc: simplify the 
bvec iter count calculation io_uring: ensure io_queue_deferred() is out-of-line io_uring/rw: always clear ->bytes_done on io_async_rw setup io_uring/rw: use NULL for rw->free_iovec assigment io_uring/rw: don't mask in f_iocb_flags io_uring/msg_ring: Drop custom destructor io_uring: Move old async data allocation helper to header io_uring/rw: Allocate async data through helper io_uring/net: Allocate msghdr async data through helper io_uring/uring_cmd: Allocate async data through generic helper io_uring/poll: Allocate apoll with generic alloc_cache helper ...
This commit is contained in:
commit
a312e1706c
@ -118,17 +118,18 @@ static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
|
||||
|
||||
static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
|
||||
{
|
||||
unsigned short nr_vecs = bip->bip_max_vcnt - 1;
|
||||
struct bio_vec *copy = &bip->bip_vec[1];
|
||||
size_t bytes = bip->bip_iter.bi_size;
|
||||
struct iov_iter iter;
|
||||
unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1;
|
||||
struct bio_vec *orig_bvecs = &bip->bip_vec[1];
|
||||
struct bio_vec *bounce_bvec = &bip->bip_vec[0];
|
||||
size_t bytes = bounce_bvec->bv_len;
|
||||
struct iov_iter orig_iter;
|
||||
int ret;
|
||||
|
||||
iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
|
||||
ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
|
||||
iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes);
|
||||
ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter);
|
||||
WARN_ON_ONCE(ret != bytes);
|
||||
|
||||
bio_integrity_unpin_bvec(copy, nr_vecs, true);
|
||||
bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -301,16 +302,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
|
||||
return nr_bvecs;
|
||||
}
|
||||
|
||||
int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
|
||||
int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
|
||||
unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
|
||||
struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
|
||||
struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
|
||||
size_t offset, bytes = iter->count;
|
||||
unsigned int direction, nr_bvecs;
|
||||
struct iov_iter iter;
|
||||
int ret, nr_vecs;
|
||||
size_t offset;
|
||||
bool copy;
|
||||
|
||||
if (bio_integrity(bio))
|
||||
@ -323,8 +323,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
|
||||
else
|
||||
direction = ITER_SOURCE;
|
||||
|
||||
iov_iter_ubuf(&iter, direction, ubuf, bytes);
|
||||
nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
|
||||
nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1);
|
||||
if (nr_vecs > BIO_MAX_VECS)
|
||||
return -E2BIG;
|
||||
if (nr_vecs > UIO_FASTIOV) {
|
||||
@ -334,8 +333,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
|
||||
pages = NULL;
|
||||
}
|
||||
|
||||
copy = !iov_iter_is_aligned(&iter, align, align);
|
||||
ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
|
||||
copy = !iov_iter_is_aligned(iter, align, align);
|
||||
ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset);
|
||||
if (unlikely(ret < 0))
|
||||
goto free_bvec;
|
||||
|
||||
@ -365,6 +364,55 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta)
|
||||
{
|
||||
struct bio_integrity_payload *bip = bio_integrity(bio);
|
||||
|
||||
if (meta->flags & IO_INTEGRITY_CHK_GUARD)
|
||||
bip->bip_flags |= BIP_CHECK_GUARD;
|
||||
if (meta->flags & IO_INTEGRITY_CHK_APPTAG)
|
||||
bip->bip_flags |= BIP_CHECK_APPTAG;
|
||||
if (meta->flags & IO_INTEGRITY_CHK_REFTAG)
|
||||
bip->bip_flags |= BIP_CHECK_REFTAG;
|
||||
|
||||
bip->app_tag = meta->app_tag;
|
||||
}
|
||||
|
||||
int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
|
||||
{
|
||||
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
|
||||
unsigned int integrity_bytes;
|
||||
int ret;
|
||||
struct iov_iter it;
|
||||
|
||||
if (!bi)
|
||||
return -EINVAL;
|
||||
/*
|
||||
* original meta iterator can be bigger.
|
||||
* process integrity info corresponding to current data buffer only.
|
||||
*/
|
||||
it = meta->iter;
|
||||
integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio));
|
||||
if (it.count < integrity_bytes)
|
||||
return -EINVAL;
|
||||
|
||||
/* should fit into two bytes */
|
||||
BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16));
|
||||
|
||||
if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS))
|
||||
return -EINVAL;
|
||||
|
||||
it.count = integrity_bytes;
|
||||
ret = bio_integrity_map_user(bio, &it);
|
||||
if (!ret) {
|
||||
bio_uio_meta_to_bip(bio, meta);
|
||||
bip_set_seed(bio_integrity(bio), meta->seed);
|
||||
iov_iter_advance(&meta->iter, integrity_bytes);
|
||||
meta->seed += bio_integrity_intervals(bi, bio_sectors(bio));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_integrity_prep - Prepare bio for integrity I/O
|
||||
* @bio: bio to prepare
|
||||
@ -435,6 +483,11 @@ bool bio_integrity_prep(struct bio *bio)
|
||||
if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
|
||||
bip->bip_flags |= BIP_IP_CHECKSUM;
|
||||
|
||||
/* describe what tags to check in payload */
|
||||
if (bi->csum_type)
|
||||
bip->bip_flags |= BIP_CHECK_GUARD;
|
||||
if (bi->flags & BLK_INTEGRITY_REF_TAG)
|
||||
bip->bip_flags |= BIP_CHECK_REFTAG;
|
||||
if (bio_integrity_add_page(bio, virt_to_page(buf), len,
|
||||
offset_in_page(buf)) < len) {
|
||||
printk(KERN_ERR "could not attach integrity payload\n");
|
||||
@ -559,7 +612,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
|
||||
|
||||
bip->bip_vec = bip_src->bip_vec;
|
||||
bip->bip_iter = bip_src->bip_iter;
|
||||
bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY;
|
||||
bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS;
|
||||
bip->app_tag = bip_src->app_tag;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -115,8 +115,16 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg);
|
||||
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
|
||||
ssize_t bytes)
|
||||
{
|
||||
int ret = bio_integrity_map_user(rq->bio, ubuf, bytes);
|
||||
int ret;
|
||||
struct iov_iter iter;
|
||||
unsigned int direction;
|
||||
|
||||
if (op_is_write(req_op(rq)))
|
||||
direction = ITER_DEST;
|
||||
else
|
||||
direction = ITER_SOURCE;
|
||||
iov_iter_ubuf(&iter, direction, ubuf, bytes);
|
||||
ret = bio_integrity_map_user(rq->bio, &iter);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
|
45
block/fops.c
45
block/fops.c
@ -54,6 +54,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
|
||||
struct bio bio;
|
||||
ssize_t ret;
|
||||
|
||||
WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA);
|
||||
if (nr_pages <= DIO_INLINE_BIO_VECS)
|
||||
vecs = inline_vecs;
|
||||
else {
|
||||
@ -124,12 +125,16 @@ static void blkdev_bio_end_io(struct bio *bio)
|
||||
{
|
||||
struct blkdev_dio *dio = bio->bi_private;
|
||||
bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
|
||||
bool is_sync = dio->flags & DIO_IS_SYNC;
|
||||
|
||||
if (bio->bi_status && !dio->bio.bi_status)
|
||||
dio->bio.bi_status = bio->bi_status;
|
||||
|
||||
if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA))
|
||||
bio_integrity_unmap_user(bio);
|
||||
|
||||
if (atomic_dec_and_test(&dio->ref)) {
|
||||
if (!(dio->flags & DIO_IS_SYNC)) {
|
||||
if (!is_sync) {
|
||||
struct kiocb *iocb = dio->iocb;
|
||||
ssize_t ret;
|
||||
|
||||
@ -221,14 +226,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
* a retry of this from blocking context.
|
||||
*/
|
||||
if (unlikely(iov_iter_count(iter))) {
|
||||
bio_release_pages(bio, false);
|
||||
bio_clear_flag(bio, BIO_REFFED);
|
||||
bio_put(bio);
|
||||
blk_finish_plug(&plug);
|
||||
return -EAGAIN;
|
||||
ret = -EAGAIN;
|
||||
goto fail;
|
||||
}
|
||||
bio->bi_opf |= REQ_NOWAIT;
|
||||
}
|
||||
if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) {
|
||||
ret = bio_integrity_map_iter(bio, iocb->private);
|
||||
if (unlikely(ret))
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (is_read) {
|
||||
if (dio->flags & DIO_SHOULD_DIRTY)
|
||||
@ -269,6 +276,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
|
||||
bio_put(&dio->bio);
|
||||
return ret;
|
||||
fail:
|
||||
bio_release_pages(bio, false);
|
||||
bio_clear_flag(bio, BIO_REFFED);
|
||||
bio_put(bio);
|
||||
blk_finish_plug(&plug);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void blkdev_bio_end_io_async(struct bio *bio)
|
||||
@ -286,6 +299,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
|
||||
ret = blk_status_to_errno(bio->bi_status);
|
||||
}
|
||||
|
||||
if (iocb->ki_flags & IOCB_HAS_METADATA)
|
||||
bio_integrity_unmap_user(bio);
|
||||
|
||||
iocb->ki_complete(iocb, ret);
|
||||
|
||||
if (dio->flags & DIO_SHOULD_DIRTY) {
|
||||
@ -330,10 +346,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
|
||||
bio_iov_bvec_set(bio, iter);
|
||||
} else {
|
||||
ret = bio_iov_iter_get_pages(bio, iter);
|
||||
if (unlikely(ret)) {
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
if (unlikely(ret))
|
||||
goto out_bio_put;
|
||||
}
|
||||
dio->size = bio->bi_iter.bi_size;
|
||||
|
||||
@ -346,6 +360,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
|
||||
task_io_account_write(bio->bi_iter.bi_size);
|
||||
}
|
||||
|
||||
if (iocb->ki_flags & IOCB_HAS_METADATA) {
|
||||
ret = bio_integrity_map_iter(bio, iocb->private);
|
||||
WRITE_ONCE(iocb->private, NULL);
|
||||
if (unlikely(ret))
|
||||
goto out_bio_put;
|
||||
}
|
||||
|
||||
if (iocb->ki_flags & IOCB_ATOMIC)
|
||||
bio->bi_opf |= REQ_ATOMIC;
|
||||
|
||||
@ -360,6 +381,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
|
||||
submit_bio(bio);
|
||||
}
|
||||
return -EIOCBQUEUED;
|
||||
|
||||
out_bio_put:
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
||||
|
@ -885,6 +885,12 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd)
|
||||
{
|
||||
cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag);
|
||||
cmnd->rw.lbatm = cpu_to_le16(0xffff);
|
||||
}
|
||||
|
||||
static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
|
||||
struct request *req)
|
||||
{
|
||||
@ -1017,18 +1023,17 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
||||
control |= NVME_RW_PRINFO_PRACT;
|
||||
}
|
||||
|
||||
switch (ns->head->pi_type) {
|
||||
case NVME_NS_DPS_PI_TYPE3:
|
||||
if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
|
||||
control |= NVME_RW_PRINFO_PRCHK_GUARD;
|
||||
break;
|
||||
case NVME_NS_DPS_PI_TYPE1:
|
||||
case NVME_NS_DPS_PI_TYPE2:
|
||||
control |= NVME_RW_PRINFO_PRCHK_GUARD |
|
||||
NVME_RW_PRINFO_PRCHK_REF;
|
||||
if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) {
|
||||
control |= NVME_RW_PRINFO_PRCHK_REF;
|
||||
if (op == nvme_cmd_zone_append)
|
||||
control |= NVME_RW_APPEND_PIREMAP;
|
||||
nvme_set_ref_tag(ns, cmnd, req);
|
||||
break;
|
||||
}
|
||||
if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) {
|
||||
control |= NVME_RW_PRINFO_PRCHK_APP;
|
||||
nvme_set_app_tag(req, cmnd);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -809,14 +809,14 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd,
|
||||
if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM))
|
||||
scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM;
|
||||
|
||||
if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
|
||||
if (bio_integrity_flagged(bio, BIP_CHECK_GUARD))
|
||||
scmd->prot_flags |= SCSI_PROT_GUARD_CHECK;
|
||||
}
|
||||
|
||||
if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */
|
||||
scmd->prot_flags |= SCSI_PROT_REF_INCREMENT;
|
||||
|
||||
if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
|
||||
if (bio_integrity_flagged(bio, BIP_CHECK_REFTAG))
|
||||
scmd->prot_flags |= SCSI_PROT_REF_CHECK;
|
||||
}
|
||||
|
||||
|
@ -7,10 +7,12 @@
|
||||
enum bip_flags {
|
||||
BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */
|
||||
BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */
|
||||
BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */
|
||||
BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */
|
||||
BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */
|
||||
BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */
|
||||
BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */
|
||||
BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */
|
||||
BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */
|
||||
BIP_CHECK_GUARD = 1 << 5, /* guard check */
|
||||
BIP_CHECK_REFTAG = 1 << 6, /* reftag check */
|
||||
BIP_CHECK_APPTAG = 1 << 7, /* apptag check */
|
||||
};
|
||||
|
||||
struct bio_integrity_payload {
|
||||
@ -21,6 +23,7 @@ struct bio_integrity_payload {
|
||||
unsigned short bip_vcnt; /* # of integrity bio_vecs */
|
||||
unsigned short bip_max_vcnt; /* integrity bio_vec slots */
|
||||
unsigned short bip_flags; /* control flags */
|
||||
u16 app_tag; /* application tag value */
|
||||
|
||||
struct bvec_iter bio_iter; /* for rewinding parent bio */
|
||||
|
||||
@ -30,6 +33,9 @@ struct bio_integrity_payload {
|
||||
struct bio_vec bip_inline_vecs[];/* embedded bvec array */
|
||||
};
|
||||
|
||||
#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \
|
||||
BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG)
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_INTEGRITY
|
||||
|
||||
#define bip_for_each_vec(bvl, bip, iter) \
|
||||
@ -72,7 +78,8 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp,
|
||||
unsigned int nr);
|
||||
int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len,
|
||||
unsigned int offset);
|
||||
int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len);
|
||||
int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter);
|
||||
int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta);
|
||||
void bio_integrity_unmap_user(struct bio *bio);
|
||||
bool bio_integrity_prep(struct bio *bio);
|
||||
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done);
|
||||
@ -98,8 +105,12 @@ static inline void bioset_integrity_free(struct bio_set *bs)
|
||||
{
|
||||
}
|
||||
|
||||
static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
|
||||
ssize_t len)
|
||||
static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
 * Inline fallback for bio_integrity_map_iter(): performs no mapping and
 * unconditionally reports -EINVAL.
 * NOTE(review): presumably the !CONFIG_BLK_DEV_INTEGRITY variant; the
 * enclosing preprocessor branch is not visible here - confirm.
 */
static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
|
@ -349,6 +349,7 @@ struct readahead_control;
|
||||
#define IOCB_DIO_CALLER_COMP (1 << 22)
|
||||
/* kiocb is a read or write operation submitted by fs/aio.c. */
|
||||
#define IOCB_AIO_RW (1 << 23)
|
||||
#define IOCB_HAS_METADATA (1 << 24)
|
||||
|
||||
/* for use in trace events */
|
||||
#define TRACE_IOCB_STRINGS \
|
||||
|
@ -78,8 +78,9 @@ struct io_hash_table {
|
||||
|
||||
struct io_mapped_region {
|
||||
struct page **pages;
|
||||
void *vmap_ptr;
|
||||
size_t nr_pages;
|
||||
void *ptr;
|
||||
unsigned nr_pages;
|
||||
unsigned flags;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -293,6 +294,11 @@ struct io_ring_ctx {
|
||||
|
||||
struct io_submit_state submit_state;
|
||||
|
||||
/*
|
||||
* Modifications are protected by ->uring_lock and ->mmap_lock.
|
||||
* The flags, buf_pages and buf_nr_pages fields should be stable
|
||||
* once published.
|
||||
*/
|
||||
struct xarray io_bl_xa;
|
||||
|
||||
struct io_hash_table cancel_table;
|
||||
@ -424,17 +430,10 @@ struct io_ring_ctx {
|
||||
* side will need to grab this lock, to prevent either side from
|
||||
* being run concurrently with the other.
|
||||
*/
|
||||
struct mutex resize_lock;
|
||||
|
||||
/*
|
||||
* If IORING_SETUP_NO_MMAP is used, then the below holds
|
||||
* the gup'ed pages for the two rings, and the sqes.
|
||||
*/
|
||||
unsigned short n_ring_pages;
|
||||
unsigned short n_sqe_pages;
|
||||
struct page **ring_pages;
|
||||
struct page **sqe_pages;
|
||||
struct mutex mmap_lock;
|
||||
|
||||
struct io_mapped_region sq_region;
|
||||
struct io_mapped_region ring_region;
|
||||
/* used for optimised request parameter and wait argument passing */
|
||||
struct io_mapped_region param_region;
|
||||
};
|
||||
@ -481,6 +480,7 @@ enum {
|
||||
REQ_F_BL_NO_RECYCLE_BIT,
|
||||
REQ_F_BUFFERS_COMMIT_BIT,
|
||||
REQ_F_BUF_NODE_BIT,
|
||||
REQ_F_HAS_METADATA_BIT,
|
||||
|
||||
/* not a real bit, just to check we're not overflowing the space */
|
||||
__REQ_F_LAST_BIT,
|
||||
@ -561,6 +561,8 @@ enum {
|
||||
REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
|
||||
/* buf node is valid */
|
||||
REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
|
||||
/* request has read/write metadata assigned */
|
||||
REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
|
||||
};
|
||||
|
||||
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
|
||||
|
@ -82,6 +82,15 @@ struct iov_iter {
|
||||
};
|
||||
};
|
||||
|
||||
typedef __u16 uio_meta_flags_t;
|
||||
|
||||
/*
 * Integrity metadata handed along with an I/O request; consumed by
 * bio_integrity_map_iter() and bio_uio_meta_to_bip().
 */
struct uio_meta {
|
||||
/* IO_INTEGRITY_CHK_* bits; translated into BIP_CHECK_* payload flags */
uio_meta_flags_t flags;
|
||||
/* copied into the integrity payload's app_tag */
u16 app_tag;
|
||||
/* passed to bip_set_seed(); bumped by the bio's integrity intervals after each mapping */
u64 seed;
|
||||
/* iterator over the metadata buffer; advanced by the integrity bytes each mapped bio consumes */
struct iov_iter iter;
|
||||
};
|
||||
|
||||
static inline const struct iovec *iter_iov(const struct iov_iter *iter)
|
||||
{
|
||||
if (iter->iter_type == ITER_UBUF)
|
||||
|
@ -40,6 +40,15 @@
|
||||
#define BLOCK_SIZE_BITS 10
|
||||
#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
|
||||
|
||||
/* flags for integrity meta */
|
||||
#define IO_INTEGRITY_CHK_GUARD (1U << 0) /* enforce guard check */
|
||||
#define IO_INTEGRITY_CHK_REFTAG (1U << 1) /* enforce ref check */
|
||||
#define IO_INTEGRITY_CHK_APPTAG (1U << 2) /* enforce app check */
|
||||
|
||||
#define IO_INTEGRITY_VALID_FLAGS (IO_INTEGRITY_CHK_GUARD | \
|
||||
IO_INTEGRITY_CHK_REFTAG | \
|
||||
IO_INTEGRITY_CHK_APPTAG)
|
||||
|
||||
#define SEEK_SET 0 /* seek relative to beginning of file */
|
||||
#define SEEK_CUR 1 /* seek relative to current file position */
|
||||
#define SEEK_END 2 /* seek relative to end of file */
|
||||
|
@ -98,6 +98,10 @@ struct io_uring_sqe {
|
||||
__u64 addr3;
|
||||
__u64 __pad2[1];
|
||||
};
|
||||
struct {
|
||||
__u64 attr_ptr; /* pointer to attribute information */
|
||||
__u64 attr_type_mask; /* bit mask of attributes */
|
||||
};
|
||||
__u64 optval;
|
||||
/*
|
||||
* If the ring is initialized with IORING_SETUP_SQE128, then
|
||||
@ -107,6 +111,18 @@ struct io_uring_sqe {
|
||||
};
|
||||
};
|
||||
|
||||
/* sqe->attr_type_mask flags */
|
||||
#define IORING_RW_ATTR_FLAG_PI (1U << 0)
|
||||
/* PI attribute information */
|
||||
/*
 * NOTE(review): the code that reads this structure is not visible here; the
 * field notes below are inferred from the names and from struct uio_meta -
 * confirm against the io_uring read/write prep code.
 */
struct io_uring_attr_pi {
|
||||
/* presumably IO_INTEGRITY_CHK_* bits - TODO confirm */
__u16 flags;
|
||||
/* presumably the application tag value - TODO confirm */
__u16 app_tag;
|
||||
/* presumably the length in bytes of the buffer at addr - TODO confirm */
__u32 len;
|
||||
/* presumably the user address of the PI buffer - TODO confirm */
__u64 addr;
|
||||
/* presumably the initial seed for the integrity payload - TODO confirm */
__u64 seed;
|
||||
/* presumably reserved - TODO confirm whether it must be zero */
__u64 rsvd;
|
||||
};
|
||||
|
||||
/*
|
||||
* If sqe->file_index is set to this for opcodes that instantiate a new
|
||||
* direct descriptor (like openat/openat2/accept), then io_uring will allocate
|
||||
@ -561,6 +577,7 @@ struct io_uring_params {
|
||||
#define IORING_FEAT_REG_REG_RING (1U << 13)
|
||||
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
|
||||
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
|
||||
#define IORING_FEAT_RW_ATTR (1U << 16)
|
||||
|
||||
/*
|
||||
* io_uring_register(2) opcodes and arguments
|
||||
|
@ -30,6 +30,19 @@ static inline void *io_alloc_cache_get(struct io_alloc_cache *cache)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Get an object for @cache.  While cache->nr_cached is non-zero the result
 * of io_alloc_cache_get() is returned unchanged.  With nothing cached, a
 * new object of cache->elem_size bytes is kmalloc'ed with @gfp and, when
 * @init_once is non-NULL, initialised by it; @init_once is never run on an
 * object taken from the cache.  Returns NULL if the fresh allocation fails.
 */
static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp,
				   void (*init_once)(void *obj))
{
	void *fresh;

	if (unlikely(!cache->nr_cached)) {
		fresh = kmalloc(cache->elem_size, gfp);
		if (!fresh)
			return NULL;
		if (init_once)
			init_once(fresh);
		return fresh;
	}

	return io_alloc_cache_get(cache);
}
|
||||
|
||||
/* returns false if the cache was initialized properly */
|
||||
static inline bool io_alloc_cache_init(struct io_alloc_cache *cache,
|
||||
unsigned max_nr, size_t size)
|
||||
|
@ -211,10 +211,11 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
|
||||
|
||||
if (ctx->file_table.data.nodes[i])
|
||||
f = io_slot_file(ctx->file_table.data.nodes[i]);
|
||||
if (f)
|
||||
seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
|
||||
else
|
||||
seq_printf(m, "%5u: <none>\n", i);
|
||||
if (f) {
|
||||
seq_printf(m, "%5u: ", i);
|
||||
seq_file_path(m, f, " \t\n\\");
|
||||
seq_puts(m, "\n");
|
||||
}
|
||||
}
|
||||
seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
|
||||
for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
|
||||
|
@ -251,17 +251,6 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
|
||||
io_req_task_work_add(req);
|
||||
}
|
||||
|
||||
static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_futex_data *ifd;
|
||||
|
||||
ifd = io_alloc_cache_get(&ctx->futex_cache);
|
||||
if (ifd)
|
||||
return ifd;
|
||||
|
||||
return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
|
||||
}
|
||||
|
||||
int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
|
||||
@ -331,7 +320,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
|
||||
}
|
||||
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
ifd = io_alloc_ifd(ctx);
|
||||
ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT, NULL);
|
||||
if (!ifd) {
|
||||
ret = -ENOMEM;
|
||||
goto done_unlock;
|
||||
|
@ -115,7 +115,7 @@
|
||||
REQ_F_ASYNC_DATA)
|
||||
|
||||
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
|
||||
IO_REQ_CLEAN_FLAGS)
|
||||
REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS)
|
||||
|
||||
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
|
||||
|
||||
@ -143,7 +143,8 @@ struct io_defer_entry {
|
||||
|
||||
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
|
||||
struct io_uring_task *tctx,
|
||||
bool cancel_all);
|
||||
bool cancel_all,
|
||||
bool is_sqpoll_thread);
|
||||
|
||||
static void io_queue_sqe(struct io_kiocb *req);
|
||||
|
||||
@ -350,7 +351,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
|
||||
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
|
||||
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
|
||||
io_napi_init(ctx);
|
||||
mutex_init(&ctx->resize_lock);
|
||||
mutex_init(&ctx->mmap_lock);
|
||||
|
||||
return ctx;
|
||||
|
||||
@ -361,7 +362,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
|
||||
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
|
||||
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
|
||||
io_alloc_cache_free(&ctx->uring_cache, kfree);
|
||||
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
|
||||
io_alloc_cache_free(&ctx->msg_cache, kfree);
|
||||
io_futex_cache_free(ctx);
|
||||
kvfree(ctx->cancel_table.hbs);
|
||||
xa_destroy(&ctx->io_bl_xa);
|
||||
@ -550,8 +551,9 @@ void io_req_queue_iowq(struct io_kiocb *req)
|
||||
io_req_task_work_add(req);
|
||||
}
|
||||
|
||||
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
|
||||
static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
|
||||
{
|
||||
spin_lock(&ctx->completion_lock);
|
||||
while (!list_empty(&ctx->defer_list)) {
|
||||
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
|
||||
struct io_defer_entry, list);
|
||||
@ -562,6 +564,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
|
||||
io_req_task_queue(de->req);
|
||||
kfree(de);
|
||||
}
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
}
|
||||
|
||||
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
|
||||
@ -570,11 +573,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
|
||||
io_poll_wq_wake(ctx);
|
||||
if (ctx->off_timeout_used)
|
||||
io_flush_timeouts(ctx);
|
||||
if (ctx->drain_active) {
|
||||
spin_lock(&ctx->completion_lock);
|
||||
if (ctx->drain_active)
|
||||
io_queue_deferred(ctx);
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
}
|
||||
if (ctx->has_evfd)
|
||||
io_eventfd_flush_signal(ctx);
|
||||
}
|
||||
@ -1401,6 +1401,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
|
||||
comp_list);
|
||||
|
||||
if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
|
||||
if (req->flags & REQ_F_REISSUE) {
|
||||
node = req->comp_list.next;
|
||||
req->flags &= ~REQ_F_REISSUE;
|
||||
io_queue_iowq(req);
|
||||
continue;
|
||||
}
|
||||
if (req->flags & REQ_F_REFCOUNT) {
|
||||
node = req->comp_list.next;
|
||||
if (!req_ref_put_and_test(req))
|
||||
@ -1440,7 +1446,12 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
|
||||
struct io_kiocb *req = container_of(node, struct io_kiocb,
|
||||
comp_list);
|
||||
|
||||
if (!(req->flags & REQ_F_CQE_SKIP) &&
|
||||
/*
|
||||
* Requests marked with REQ_F_REISSUE should not post a CQE, they
|
||||
* will go through the io-wq retry machinery and post one
|
||||
* later.
|
||||
*/
|
||||
if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
|
||||
unlikely(!io_fill_cqe_req(ctx, req))) {
|
||||
if (ctx->lockless_cq) {
|
||||
spin_lock(&ctx->completion_lock);
|
||||
@ -1640,19 +1651,6 @@ io_req_flags_t io_file_get_flags(struct file *file)
|
||||
return res;
|
||||
}
|
||||
|
||||
bool io_alloc_async_data(struct io_kiocb *req)
|
||||
{
|
||||
const struct io_issue_def *def = &io_issue_defs[req->opcode];
|
||||
|
||||
WARN_ON_ONCE(!def->async_size);
|
||||
req->async_data = kmalloc(def->async_size, GFP_KERNEL);
|
||||
if (req->async_data) {
|
||||
req->flags |= REQ_F_ASYNC_DATA;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static u32 io_get_sequence(struct io_kiocb *req)
|
||||
{
|
||||
u32 seq = req->ctx->cached_sq_head;
|
||||
@ -2631,36 +2629,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
|
||||
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
|
||||
}
|
||||
|
||||
static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
|
||||
size_t size)
|
||||
{
|
||||
return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
|
||||
size);
|
||||
}
|
||||
|
||||
static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
|
||||
size_t size)
|
||||
{
|
||||
return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
|
||||
size);
|
||||
}
|
||||
|
||||
static void io_rings_free(struct io_ring_ctx *ctx)
|
||||
{
|
||||
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
|
||||
io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
|
||||
true);
|
||||
io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
|
||||
true);
|
||||
} else {
|
||||
io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
|
||||
ctx->n_ring_pages = 0;
|
||||
io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
|
||||
ctx->n_sqe_pages = 0;
|
||||
vunmap(ctx->rings);
|
||||
vunmap(ctx->sq_sqes);
|
||||
}
|
||||
|
||||
io_free_region(ctx, &ctx->sq_region);
|
||||
io_free_region(ctx, &ctx->ring_region);
|
||||
ctx->rings = NULL;
|
||||
ctx->sq_sqes = NULL;
|
||||
}
|
||||
@ -2732,7 +2704,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
|
||||
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
|
||||
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
|
||||
io_alloc_cache_free(&ctx->uring_cache, kfree);
|
||||
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
|
||||
io_alloc_cache_free(&ctx->msg_cache, kfree);
|
||||
io_futex_cache_free(ctx);
|
||||
io_destroy_buffers(ctx);
|
||||
io_free_region(ctx, &ctx->param_region);
|
||||
@ -2894,7 +2866,8 @@ static __cold void io_ring_exit_work(struct work_struct *work)
|
||||
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
|
||||
io_move_task_work_from_local(ctx);
|
||||
|
||||
while (io_uring_try_cancel_requests(ctx, NULL, true))
|
||||
/* The SQPOLL thread never reaches this path */
|
||||
while (io_uring_try_cancel_requests(ctx, NULL, true, false))
|
||||
cond_resched();
|
||||
|
||||
if (ctx->sq_data) {
|
||||
@ -3062,7 +3035,8 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
|
||||
|
||||
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
|
||||
struct io_uring_task *tctx,
|
||||
bool cancel_all)
|
||||
bool cancel_all,
|
||||
bool is_sqpoll_thread)
|
||||
{
|
||||
struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, };
|
||||
enum io_wq_cancel cret;
|
||||
@ -3092,7 +3066,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
|
||||
|
||||
/* SQPOLL thread does its own polling */
|
||||
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
|
||||
(ctx->sq_data && ctx->sq_data->thread == current)) {
|
||||
is_sqpoll_thread) {
|
||||
while (!wq_list_empty(&ctx->iopoll_list)) {
|
||||
io_iopoll_try_reap_events(ctx);
|
||||
ret = true;
|
||||
@ -3165,13 +3139,15 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
|
||||
continue;
|
||||
loop |= io_uring_try_cancel_requests(node->ctx,
|
||||
current->io_uring,
|
||||
cancel_all);
|
||||
cancel_all,
|
||||
false);
|
||||
}
|
||||
} else {
|
||||
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
|
||||
loop |= io_uring_try_cancel_requests(ctx,
|
||||
current->io_uring,
|
||||
cancel_all);
|
||||
cancel_all,
|
||||
true);
|
||||
}
|
||||
|
||||
if (loop) {
|
||||
@ -3233,6 +3209,7 @@ static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
|
||||
end > ctx->cq_wait_size))
|
||||
return ERR_PTR(-EFAULT);
|
||||
|
||||
offset = array_index_nospec(offset, ctx->cq_wait_size - size);
|
||||
return ctx->cq_wait_arg + offset;
|
||||
}
|
||||
|
||||
@ -3477,9 +3454,10 @@ bool io_is_uring_fops(struct file *file)
|
||||
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
|
||||
struct io_uring_params *p)
|
||||
{
|
||||
struct io_uring_region_desc rd;
|
||||
struct io_rings *rings;
|
||||
size_t size, sq_array_offset;
|
||||
void *ptr;
|
||||
int ret;
|
||||
|
||||
/* make sure these are sane, as we already accounted them */
|
||||
ctx->sq_entries = p->sq_entries;
|
||||
@ -3490,15 +3468,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
|
||||
if (size == SIZE_MAX)
|
||||
return -EOVERFLOW;
|
||||
|
||||
if (!(ctx->flags & IORING_SETUP_NO_MMAP))
|
||||
rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
|
||||
else
|
||||
rings = io_rings_map(ctx, p->cq_off.user_addr, size);
|
||||
memset(&rd, 0, sizeof(rd));
|
||||
rd.size = PAGE_ALIGN(size);
|
||||
if (ctx->flags & IORING_SETUP_NO_MMAP) {
|
||||
rd.user_addr = p->cq_off.user_addr;
|
||||
rd.flags |= IORING_MEM_REGION_TYPE_USER;
|
||||
}
|
||||
ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
|
||||
if (ret)
|
||||
return ret;
|
||||
ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
|
||||
|
||||
if (IS_ERR(rings))
|
||||
return PTR_ERR(rings);
|
||||
|
||||
ctx->rings = rings;
|
||||
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
|
||||
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
|
||||
rings->sq_ring_mask = p->sq_entries - 1;
|
||||
@ -3515,17 +3495,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
|
||||
return -EOVERFLOW;
|
||||
}
|
||||
|
||||
if (!(ctx->flags & IORING_SETUP_NO_MMAP))
|
||||
ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
|
||||
else
|
||||
ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
|
||||
|
||||
if (IS_ERR(ptr)) {
|
||||
io_rings_free(ctx);
|
||||
return PTR_ERR(ptr);
|
||||
memset(&rd, 0, sizeof(rd));
|
||||
rd.size = PAGE_ALIGN(size);
|
||||
if (ctx->flags & IORING_SETUP_NO_MMAP) {
|
||||
rd.user_addr = p->sq_off.user_addr;
|
||||
rd.flags |= IORING_MEM_REGION_TYPE_USER;
|
||||
}
|
||||
|
||||
ctx->sq_sqes = ptr;
|
||||
ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES);
|
||||
if (ret) {
|
||||
io_rings_free(ctx);
|
||||
return ret;
|
||||
}
|
||||
ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -3733,7 +3714,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
|
||||
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
|
||||
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
|
||||
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
|
||||
IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
|
||||
IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
|
||||
IORING_FEAT_RW_ATTR;
|
||||
|
||||
if (copy_to_user(params, p, sizeof(*p))) {
|
||||
ret = -EFAULT;
|
||||
@ -3894,6 +3876,8 @@ static int __init io_uring_init(void)
|
||||
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
|
||||
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
|
||||
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
|
||||
BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr);
|
||||
BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask);
|
||||
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
|
||||
|
||||
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
|
||||
|
@ -8,9 +8,11 @@
|
||||
#include <linux/poll.h>
|
||||
#include <linux/io_uring_types.h>
|
||||
#include <uapi/linux/eventpoll.h>
|
||||
#include "alloc_cache.h"
|
||||
#include "io-wq.h"
|
||||
#include "slist.h"
|
||||
#include "filetable.h"
|
||||
#include "opdef.h"
|
||||
|
||||
#ifndef CREATE_TRACE_POINTS
|
||||
#include <trace/events/io_uring.h>
|
||||
@ -223,6 +225,27 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
|
||||
req->cqe.flags = cflags;
|
||||
}
|
||||
|
||||
static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
|
||||
struct io_kiocb *req,
|
||||
void (*init_once)(void *obj))
|
||||
{
|
||||
req->async_data = io_cache_alloc(cache, GFP_KERNEL, init_once);
|
||||
if (req->async_data)
|
||||
req->flags |= REQ_F_ASYNC_DATA;
|
||||
return req->async_data;
|
||||
}
|
||||
|
||||
static inline void *io_uring_alloc_async_data_nocache(struct io_kiocb *req)
|
||||
{
|
||||
const struct io_issue_def *def = &io_issue_defs[req->opcode];
|
||||
|
||||
WARN_ON_ONCE(!def->async_size);
|
||||
req->async_data = kmalloc(def->async_size, GFP_KERNEL);
|
||||
if (req->async_data)
|
||||
req->flags |= REQ_F_ASYNC_DATA;
|
||||
return req->async_data;
|
||||
}
|
||||
|
||||
static inline bool req_has_async_data(struct io_kiocb *req)
|
||||
{
|
||||
return req->flags & REQ_F_ASYNC_DATA;
|
||||
|
228
io_uring/kbuf.c
228
io_uring/kbuf.c
@ -45,10 +45,10 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
|
||||
/*
|
||||
* Store buffer group ID and finally mark the list as visible.
|
||||
* The normal lookup doesn't care about the visibility as we're
|
||||
* always under the ->uring_lock, but the RCU lookup from mmap does.
|
||||
* always under the ->uring_lock, but lookups from mmap do.
|
||||
*/
|
||||
bl->bgid = bgid;
|
||||
atomic_set(&bl->refs, 1);
|
||||
guard(mutex)(&ctx->mmap_lock);
|
||||
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
|
||||
}
|
||||
|
||||
@ -353,17 +353,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
|
||||
|
||||
if (bl->flags & IOBL_BUF_RING) {
|
||||
i = bl->buf_ring->tail - bl->head;
|
||||
if (bl->buf_nr_pages) {
|
||||
int j;
|
||||
|
||||
if (!(bl->flags & IOBL_MMAP)) {
|
||||
for (j = 0; j < bl->buf_nr_pages; j++)
|
||||
unpin_user_page(bl->buf_pages[j]);
|
||||
}
|
||||
io_pages_unmap(bl->buf_ring, &bl->buf_pages,
|
||||
&bl->buf_nr_pages, bl->flags & IOBL_MMAP);
|
||||
bl->flags &= ~IOBL_MMAP;
|
||||
}
|
||||
io_free_region(ctx, &bl->region);
|
||||
/* make sure it's seen as empty */
|
||||
INIT_LIST_HEAD(&bl->buf_list);
|
||||
bl->flags &= ~IOBL_BUF_RING;
|
||||
@ -386,12 +376,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
|
||||
return i;
|
||||
}
|
||||
|
||||
void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
|
||||
static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
|
||||
{
|
||||
if (atomic_dec_and_test(&bl->refs)) {
|
||||
__io_remove_buffers(ctx, bl, -1U);
|
||||
kfree_rcu(bl, rcu);
|
||||
}
|
||||
__io_remove_buffers(ctx, bl, -1U);
|
||||
kfree(bl);
|
||||
}
|
||||
|
||||
void io_destroy_buffers(struct io_ring_ctx *ctx)
|
||||
@ -399,10 +387,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
|
||||
struct io_buffer_list *bl;
|
||||
struct list_head *item, *tmp;
|
||||
struct io_buffer *buf;
|
||||
unsigned long index;
|
||||
|
||||
xa_for_each(&ctx->io_bl_xa, index, bl) {
|
||||
xa_erase(&ctx->io_bl_xa, bl->bgid);
|
||||
while (1) {
|
||||
unsigned long index = 0;
|
||||
|
||||
scoped_guard(mutex, &ctx->mmap_lock) {
|
||||
bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT);
|
||||
if (bl)
|
||||
xa_erase(&ctx->io_bl_xa, bl->bgid);
|
||||
}
|
||||
if (!bl)
|
||||
break;
|
||||
io_put_bl(ctx, bl);
|
||||
}
|
||||
|
||||
@ -591,11 +586,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
|
||||
INIT_LIST_HEAD(&bl->buf_list);
|
||||
ret = io_buffer_add_list(ctx, bl, p->bgid);
|
||||
if (ret) {
|
||||
/*
|
||||
* Doesn't need rcu free as it was never visible, but
|
||||
* let's keep it consistent throughout.
|
||||
*/
|
||||
kfree_rcu(bl, rcu);
|
||||
kfree(bl);
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
@ -615,75 +606,14 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
|
||||
struct io_buffer_list *bl)
|
||||
{
|
||||
struct io_uring_buf_ring *br = NULL;
|
||||
struct page **pages;
|
||||
int nr_pages, ret;
|
||||
|
||||
pages = io_pin_pages(reg->ring_addr,
|
||||
flex_array_size(br, bufs, reg->ring_entries),
|
||||
&nr_pages);
|
||||
if (IS_ERR(pages))
|
||||
return PTR_ERR(pages);
|
||||
|
||||
br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
|
||||
if (!br) {
|
||||
ret = -ENOMEM;
|
||||
goto error_unpin;
|
||||
}
|
||||
|
||||
#ifdef SHM_COLOUR
|
||||
/*
|
||||
* On platforms that have specific aliasing requirements, SHM_COLOUR
|
||||
* is set and we must guarantee that the kernel and user side align
|
||||
* nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
|
||||
* the application mmap's the provided ring buffer. Fail the request
|
||||
* if we, by chance, don't end up with aligned addresses. The app
|
||||
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
|
||||
* this transparently.
|
||||
*/
|
||||
if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
|
||||
ret = -EINVAL;
|
||||
goto error_unpin;
|
||||
}
|
||||
#endif
|
||||
bl->buf_pages = pages;
|
||||
bl->buf_nr_pages = nr_pages;
|
||||
bl->buf_ring = br;
|
||||
bl->flags |= IOBL_BUF_RING;
|
||||
bl->flags &= ~IOBL_MMAP;
|
||||
return 0;
|
||||
error_unpin:
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
kvfree(pages);
|
||||
vunmap(br);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
|
||||
struct io_uring_buf_reg *reg,
|
||||
struct io_buffer_list *bl)
|
||||
{
|
||||
size_t ring_size;
|
||||
|
||||
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
|
||||
|
||||
bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
|
||||
if (IS_ERR(bl->buf_ring)) {
|
||||
bl->buf_ring = NULL;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
{
|
||||
struct io_uring_buf_reg reg;
|
||||
struct io_buffer_list *bl, *free_bl = NULL;
|
||||
struct io_uring_region_desc rd;
|
||||
struct io_uring_buf_ring *br;
|
||||
unsigned long mmap_offset;
|
||||
unsigned long ring_size;
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
@ -695,19 +625,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
return -EINVAL;
|
||||
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
|
||||
return -EINVAL;
|
||||
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
|
||||
if (!reg.ring_addr)
|
||||
return -EFAULT;
|
||||
if (reg.ring_addr & ~PAGE_MASK)
|
||||
return -EINVAL;
|
||||
} else {
|
||||
if (reg.ring_addr)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!is_power_of_2(reg.ring_entries))
|
||||
return -EINVAL;
|
||||
|
||||
/* cannot disambiguate full vs empty due to head/tail size */
|
||||
if (reg.ring_entries >= 65536)
|
||||
return -EINVAL;
|
||||
@ -723,22 +642,48 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
if (!(reg.flags & IOU_PBUF_RING_MMAP))
|
||||
ret = io_pin_pbuf_ring(®, bl);
|
||||
else
|
||||
ret = io_alloc_pbuf_ring(ctx, ®, bl);
|
||||
mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
|
||||
ring_size = flex_array_size(br, bufs, reg.ring_entries);
|
||||
|
||||
if (!ret) {
|
||||
bl->nr_entries = reg.ring_entries;
|
||||
bl->mask = reg.ring_entries - 1;
|
||||
if (reg.flags & IOU_PBUF_RING_INC)
|
||||
bl->flags |= IOBL_INC;
|
||||
|
||||
io_buffer_add_list(ctx, bl, reg.bgid);
|
||||
return 0;
|
||||
memset(&rd, 0, sizeof(rd));
|
||||
rd.size = PAGE_ALIGN(ring_size);
|
||||
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
|
||||
rd.user_addr = reg.ring_addr;
|
||||
rd.flags |= IORING_MEM_REGION_TYPE_USER;
|
||||
}
|
||||
ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset);
|
||||
if (ret)
|
||||
goto fail;
|
||||
br = io_region_get_ptr(&bl->region);
|
||||
|
||||
kfree_rcu(free_bl, rcu);
|
||||
#ifdef SHM_COLOUR
|
||||
/*
|
||||
* On platforms that have specific aliasing requirements, SHM_COLOUR
|
||||
* is set and we must guarantee that the kernel and user side align
|
||||
* nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
|
||||
* the application mmap's the provided ring buffer. Fail the request
|
||||
* if we, by chance, don't end up with aligned addresses. The app
|
||||
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
|
||||
* this transparently.
|
||||
*/
|
||||
if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
|
||||
((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
|
||||
ret = -EINVAL;
|
||||
goto fail;
|
||||
}
|
||||
#endif
|
||||
|
||||
bl->nr_entries = reg.ring_entries;
|
||||
bl->mask = reg.ring_entries - 1;
|
||||
bl->flags |= IOBL_BUF_RING;
|
||||
bl->buf_ring = br;
|
||||
if (reg.flags & IOU_PBUF_RING_INC)
|
||||
bl->flags |= IOBL_INC;
|
||||
io_buffer_add_list(ctx, bl, reg.bgid);
|
||||
return 0;
|
||||
fail:
|
||||
io_free_region(ctx, &bl->region);
|
||||
kfree(free_bl);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -762,7 +707,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
|
||||
if (!(bl->flags & IOBL_BUF_RING))
|
||||
return -EINVAL;
|
||||
|
||||
xa_erase(&ctx->io_bl_xa, bl->bgid);
|
||||
scoped_guard(mutex, &ctx->mmap_lock)
|
||||
xa_erase(&ctx->io_bl_xa, bl->bgid);
|
||||
|
||||
io_put_bl(ctx, bl);
|
||||
return 0;
|
||||
}
|
||||
@ -793,50 +740,15 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
|
||||
unsigned long bgid)
|
||||
struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
|
||||
unsigned int bgid)
|
||||
{
|
||||
struct io_buffer_list *bl;
|
||||
bool ret;
|
||||
|
||||
/*
|
||||
* We have to be a bit careful here - we're inside mmap and cannot grab
|
||||
* the uring_lock. This means the buffer_list could be simultaneously
|
||||
* going away, if someone is trying to be sneaky. Look it up under rcu
|
||||
* so we know it's not going away, and attempt to grab a reference to
|
||||
* it. If the ref is already zero, then fail the mapping. If successful,
|
||||
* the caller will call io_put_bl() to drop the reference at the
|
||||
* end. This may then safely free the buffer_list (and drop the pages)
|
||||
* at that point, vm_insert_pages() would've already grabbed the
|
||||
* necessary vma references.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
lockdep_assert_held(&ctx->mmap_lock);
|
||||
|
||||
bl = xa_load(&ctx->io_bl_xa, bgid);
|
||||
/* must be a mmap'able buffer ring and have pages */
|
||||
ret = false;
|
||||
if (bl && bl->flags & IOBL_MMAP)
|
||||
ret = atomic_inc_not_zero(&bl->refs);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (ret)
|
||||
return bl;
|
||||
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
struct io_ring_ctx *ctx = file->private_data;
|
||||
loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
|
||||
struct io_buffer_list *bl;
|
||||
int bgid, ret;
|
||||
|
||||
bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
|
||||
bl = io_pbuf_get_bl(ctx, bgid);
|
||||
if (IS_ERR(bl))
|
||||
return PTR_ERR(bl);
|
||||
|
||||
ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
|
||||
io_put_bl(ctx, bl);
|
||||
return ret;
|
||||
if (!bl || !(bl->flags & IOBL_BUF_RING))
|
||||
return NULL;
|
||||
return &bl->region;
|
||||
}
|
||||
|
@ -3,15 +3,13 @@
|
||||
#define IOU_KBUF_H
|
||||
|
||||
#include <uapi/linux/io_uring.h>
|
||||
#include <linux/io_uring_types.h>
|
||||
|
||||
enum {
|
||||
/* ring mapped provided buffers */
|
||||
IOBL_BUF_RING = 1,
|
||||
/* ring mapped provided buffers, but mmap'ed by application */
|
||||
IOBL_MMAP = 2,
|
||||
/* buffers are consumed incrementally rather than always fully */
|
||||
IOBL_INC = 4,
|
||||
|
||||
IOBL_INC = 2,
|
||||
};
|
||||
|
||||
struct io_buffer_list {
|
||||
@ -21,11 +19,7 @@ struct io_buffer_list {
|
||||
*/
|
||||
union {
|
||||
struct list_head buf_list;
|
||||
struct {
|
||||
struct page **buf_pages;
|
||||
struct io_uring_buf_ring *buf_ring;
|
||||
};
|
||||
struct rcu_head rcu;
|
||||
struct io_uring_buf_ring *buf_ring;
|
||||
};
|
||||
__u16 bgid;
|
||||
|
||||
@ -37,7 +31,7 @@ struct io_buffer_list {
|
||||
|
||||
__u16 flags;
|
||||
|
||||
atomic_t refs;
|
||||
struct io_mapped_region region;
|
||||
};
|
||||
|
||||
struct io_buffer {
|
||||
@ -84,10 +78,8 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
|
||||
|
||||
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
|
||||
|
||||
void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
|
||||
struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
|
||||
unsigned long bgid);
|
||||
int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
|
||||
struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
|
||||
unsigned int bgid);
|
||||
|
||||
static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
|
||||
{
|
||||
|
@ -36,102 +36,6 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
|
||||
return page_address(page);
|
||||
}
|
||||
|
||||
static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
|
||||
gfp_t gfp)
|
||||
{
|
||||
void *ret;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nr_pages; i++) {
|
||||
pages[i] = alloc_page(gfp);
|
||||
if (!pages[i])
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
|
||||
if (ret)
|
||||
return ret;
|
||||
err:
|
||||
while (i--)
|
||||
put_page(pages[i]);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
void *io_pages_map(struct page ***out_pages, unsigned short *npages,
|
||||
size_t size)
|
||||
{
|
||||
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
|
||||
struct page **pages;
|
||||
int nr_pages;
|
||||
void *ret;
|
||||
|
||||
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
|
||||
if (!pages)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
|
||||
if (!IS_ERR(ret))
|
||||
goto done;
|
||||
if (nr_pages == 1)
|
||||
goto fail;
|
||||
|
||||
ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
|
||||
if (!IS_ERR(ret)) {
|
||||
done:
|
||||
*out_pages = pages;
|
||||
*npages = nr_pages;
|
||||
return ret;
|
||||
}
|
||||
fail:
|
||||
kvfree(pages);
|
||||
*out_pages = NULL;
|
||||
*npages = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
|
||||
bool put_pages)
|
||||
{
|
||||
bool do_vunmap = false;
|
||||
|
||||
if (!ptr)
|
||||
return;
|
||||
|
||||
if (put_pages && *npages) {
|
||||
struct page **to_free = *pages;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* Only did vmap for the non-compound multiple page case.
|
||||
* For the compound page, we just need to put the head.
|
||||
*/
|
||||
if (PageCompound(to_free[0]))
|
||||
*npages = 1;
|
||||
else if (*npages > 1)
|
||||
do_vunmap = true;
|
||||
for (i = 0; i < *npages; i++)
|
||||
put_page(to_free[i]);
|
||||
}
|
||||
if (do_vunmap)
|
||||
vunmap(ptr);
|
||||
kvfree(*pages);
|
||||
*pages = NULL;
|
||||
*npages = 0;
|
||||
}
|
||||
|
||||
void io_pages_free(struct page ***pages, int npages)
|
||||
{
|
||||
struct page **page_array = *pages;
|
||||
|
||||
if (!page_array)
|
||||
return;
|
||||
|
||||
unpin_user_pages(page_array, npages);
|
||||
kvfree(page_array);
|
||||
*pages = NULL;
|
||||
}
|
||||
|
||||
struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
|
||||
{
|
||||
unsigned long start, end, nr_pages;
|
||||
@ -174,64 +78,127 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
|
||||
unsigned long uaddr, size_t size)
|
||||
{
|
||||
struct page **page_array;
|
||||
unsigned int nr_pages;
|
||||
void *page_addr;
|
||||
|
||||
*npages = 0;
|
||||
|
||||
if (uaddr & (PAGE_SIZE - 1) || !size)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
nr_pages = 0;
|
||||
page_array = io_pin_pages(uaddr, size, &nr_pages);
|
||||
if (IS_ERR(page_array))
|
||||
return page_array;
|
||||
|
||||
page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
|
||||
if (page_addr) {
|
||||
*pages = page_array;
|
||||
*npages = nr_pages;
|
||||
return page_addr;
|
||||
}
|
||||
|
||||
io_pages_free(&page_array, nr_pages);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
enum {
|
||||
/* memory was vmap'ed for the kernel, freeing the region vunmap's it */
|
||||
IO_REGION_F_VMAP = 1,
|
||||
/* memory is provided by user and pinned by the kernel */
|
||||
IO_REGION_F_USER_PROVIDED = 2,
|
||||
/* only the first page in the array is ref'ed */
|
||||
IO_REGION_F_SINGLE_REF = 4,
|
||||
};
|
||||
|
||||
void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
|
||||
{
|
||||
if (mr->pages) {
|
||||
unpin_user_pages(mr->pages, mr->nr_pages);
|
||||
long nr_refs = mr->nr_pages;
|
||||
|
||||
if (mr->flags & IO_REGION_F_SINGLE_REF)
|
||||
nr_refs = 1;
|
||||
|
||||
if (mr->flags & IO_REGION_F_USER_PROVIDED)
|
||||
unpin_user_pages(mr->pages, nr_refs);
|
||||
else
|
||||
release_pages(mr->pages, nr_refs);
|
||||
|
||||
kvfree(mr->pages);
|
||||
}
|
||||
if (mr->vmap_ptr)
|
||||
vunmap(mr->vmap_ptr);
|
||||
if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr)
|
||||
vunmap(mr->ptr);
|
||||
if (mr->nr_pages && ctx->user)
|
||||
__io_unaccount_mem(ctx->user, mr->nr_pages);
|
||||
|
||||
memset(mr, 0, sizeof(*mr));
|
||||
}
|
||||
|
||||
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
|
||||
struct io_uring_region_desc *reg)
|
||||
static int io_region_init_ptr(struct io_mapped_region *mr)
|
||||
{
|
||||
int pages_accounted = 0;
|
||||
struct io_imu_folio_data ifd;
|
||||
void *ptr;
|
||||
|
||||
if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
|
||||
if (ifd.nr_folios == 1) {
|
||||
mr->ptr = page_address(mr->pages[0]);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL);
|
||||
if (!ptr)
|
||||
return -ENOMEM;
|
||||
|
||||
mr->ptr = ptr;
|
||||
mr->flags |= IO_REGION_F_VMAP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_region_pin_pages(struct io_ring_ctx *ctx,
|
||||
struct io_mapped_region *mr,
|
||||
struct io_uring_region_desc *reg)
|
||||
{
|
||||
unsigned long size = mr->nr_pages << PAGE_SHIFT;
|
||||
struct page **pages;
|
||||
int nr_pages;
|
||||
|
||||
pages = io_pin_pages(reg->user_addr, size, &nr_pages);
|
||||
if (IS_ERR(pages))
|
||||
return PTR_ERR(pages);
|
||||
if (WARN_ON_ONCE(nr_pages != mr->nr_pages))
|
||||
return -EFAULT;
|
||||
|
||||
mr->pages = pages;
|
||||
mr->flags |= IO_REGION_F_USER_PROVIDED;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_region_allocate_pages(struct io_ring_ctx *ctx,
|
||||
struct io_mapped_region *mr,
|
||||
struct io_uring_region_desc *reg,
|
||||
unsigned long mmap_offset)
|
||||
{
|
||||
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
|
||||
unsigned long size = mr->nr_pages << PAGE_SHIFT;
|
||||
unsigned long nr_allocated;
|
||||
struct page **pages;
|
||||
void *p;
|
||||
|
||||
pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
|
||||
if (!pages)
|
||||
return -ENOMEM;
|
||||
|
||||
p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp);
|
||||
if (!IS_ERR(p)) {
|
||||
mr->flags |= IO_REGION_F_SINGLE_REF;
|
||||
goto done;
|
||||
}
|
||||
|
||||
nr_allocated = alloc_pages_bulk_array_node(gfp, NUMA_NO_NODE,
|
||||
mr->nr_pages, pages);
|
||||
if (nr_allocated != mr->nr_pages) {
|
||||
if (nr_allocated)
|
||||
release_pages(pages, nr_allocated);
|
||||
kvfree(pages);
|
||||
return -ENOMEM;
|
||||
}
|
||||
done:
|
||||
reg->mmap_offset = mmap_offset;
|
||||
mr->pages = pages;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
|
||||
struct io_uring_region_desc *reg,
|
||||
unsigned long mmap_offset)
|
||||
{
|
||||
int nr_pages, ret;
|
||||
void *vptr;
|
||||
u64 end;
|
||||
|
||||
if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
|
||||
if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
|
||||
return -EFAULT;
|
||||
if (memchr_inv(®->__resv, 0, sizeof(reg->__resv)))
|
||||
return -EINVAL;
|
||||
if (reg->flags != IORING_MEM_REGION_TYPE_USER)
|
||||
if (reg->flags & ~IORING_MEM_REGION_TYPE_USER)
|
||||
return -EINVAL;
|
||||
if (!reg->user_addr)
|
||||
/* user_addr should be set IFF it's a user memory backed region */
|
||||
if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr)
|
||||
return -EFAULT;
|
||||
if (!reg->size || reg->mmap_offset || reg->id)
|
||||
return -EINVAL;
|
||||
@ -242,94 +209,120 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
|
||||
if (check_add_overflow(reg->user_addr, reg->size, &end))
|
||||
return -EOVERFLOW;
|
||||
|
||||
pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
|
||||
if (IS_ERR(pages))
|
||||
return PTR_ERR(pages);
|
||||
|
||||
nr_pages = reg->size >> PAGE_SHIFT;
|
||||
if (ctx->user) {
|
||||
ret = __io_account_mem(ctx->user, nr_pages);
|
||||
if (ret)
|
||||
goto out_free;
|
||||
pages_accounted = nr_pages;
|
||||
return ret;
|
||||
}
|
||||
|
||||
vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
|
||||
if (!vptr) {
|
||||
ret = -ENOMEM;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
mr->pages = pages;
|
||||
mr->vmap_ptr = vptr;
|
||||
mr->nr_pages = nr_pages;
|
||||
|
||||
if (reg->flags & IORING_MEM_REGION_TYPE_USER)
|
||||
ret = io_region_pin_pages(ctx, mr, reg);
|
||||
else
|
||||
ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset);
|
||||
if (ret)
|
||||
goto out_free;
|
||||
|
||||
ret = io_region_init_ptr(mr);
|
||||
if (ret)
|
||||
goto out_free;
|
||||
return 0;
|
||||
out_free:
|
||||
if (pages_accounted)
|
||||
__io_unaccount_mem(ctx->user, pages_accounted);
|
||||
io_pages_free(&pages, nr_pages);
|
||||
io_free_region(ctx, mr);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
|
||||
struct io_uring_region_desc *reg,
|
||||
unsigned long mmap_offset)
|
||||
{
|
||||
struct io_mapped_region tmp_mr;
|
||||
int ret;
|
||||
|
||||
memcpy(&tmp_mr, mr, sizeof(tmp_mr));
|
||||
ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* Once published mmap can find it without holding only the ->mmap_lock
|
||||
* and not ->uring_lock.
|
||||
*/
|
||||
guard(mutex)(&ctx->mmap_lock);
|
||||
memcpy(mr, &tmp_mr, sizeof(tmp_mr));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
|
||||
loff_t pgoff)
|
||||
{
|
||||
loff_t offset = pgoff << PAGE_SHIFT;
|
||||
unsigned int bgid;
|
||||
|
||||
switch (offset & IORING_OFF_MMAP_MASK) {
|
||||
case IORING_OFF_SQ_RING:
|
||||
case IORING_OFF_CQ_RING:
|
||||
return &ctx->ring_region;
|
||||
case IORING_OFF_SQES:
|
||||
return &ctx->sq_region;
|
||||
case IORING_OFF_PBUF_RING:
|
||||
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
|
||||
return io_pbuf_get_region(ctx, bgid);
|
||||
case IORING_MAP_OFF_PARAM_REGION:
|
||||
return &ctx->param_region;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void *io_region_validate_mmap(struct io_ring_ctx *ctx,
|
||||
struct io_mapped_region *mr)
|
||||
{
|
||||
lockdep_assert_held(&ctx->mmap_lock);
|
||||
|
||||
if (!io_region_is_set(mr))
|
||||
return ERR_PTR(-EINVAL);
|
||||
if (mr->flags & IO_REGION_F_USER_PROVIDED)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
return io_region_get_ptr(mr);
|
||||
}
|
||||
|
||||
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
|
||||
size_t sz)
|
||||
{
|
||||
struct io_ring_ctx *ctx = file->private_data;
|
||||
loff_t offset = pgoff << PAGE_SHIFT;
|
||||
struct io_mapped_region *region;
|
||||
|
||||
switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
|
||||
case IORING_OFF_SQ_RING:
|
||||
case IORING_OFF_CQ_RING:
|
||||
/* Don't allow mmap if the ring was setup without it */
|
||||
if (ctx->flags & IORING_SETUP_NO_MMAP)
|
||||
return ERR_PTR(-EINVAL);
|
||||
if (!ctx->rings)
|
||||
return ERR_PTR(-EFAULT);
|
||||
return ctx->rings;
|
||||
case IORING_OFF_SQES:
|
||||
/* Don't allow mmap if the ring was setup without it */
|
||||
if (ctx->flags & IORING_SETUP_NO_MMAP)
|
||||
return ERR_PTR(-EINVAL);
|
||||
if (!ctx->sq_sqes)
|
||||
return ERR_PTR(-EFAULT);
|
||||
return ctx->sq_sqes;
|
||||
case IORING_OFF_PBUF_RING: {
|
||||
struct io_buffer_list *bl;
|
||||
unsigned int bgid;
|
||||
void *ptr;
|
||||
|
||||
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
|
||||
bl = io_pbuf_get_bl(ctx, bgid);
|
||||
if (IS_ERR(bl))
|
||||
return bl;
|
||||
ptr = bl->buf_ring;
|
||||
io_put_bl(ctx, bl);
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
|
||||
struct page **pages, int npages)
|
||||
{
|
||||
unsigned long nr_pages = npages;
|
||||
|
||||
vm_flags_set(vma, VM_DONTEXPAND);
|
||||
return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
|
||||
region = io_mmap_get_region(ctx, pgoff);
|
||||
if (!region)
|
||||
return ERR_PTR(-EINVAL);
|
||||
return io_region_validate_mmap(ctx, region);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
|
||||
static int io_region_mmap(struct io_ring_ctx *ctx,
|
||||
struct io_mapped_region *mr,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned max_pages)
|
||||
{
|
||||
unsigned long nr_pages = min(mr->nr_pages, max_pages);
|
||||
|
||||
vm_flags_set(vma, VM_DONTEXPAND);
|
||||
return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages);
|
||||
}
|
||||
|
||||
__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
struct io_ring_ctx *ctx = file->private_data;
|
||||
size_t sz = vma->vm_end - vma->vm_start;
|
||||
long offset = vma->vm_pgoff << PAGE_SHIFT;
|
||||
unsigned int npages;
|
||||
unsigned int page_limit = UINT_MAX;
|
||||
struct io_mapped_region *region;
|
||||
void *ptr;
|
||||
|
||||
guard(mutex)(&ctx->resize_lock);
|
||||
guard(mutex)(&ctx->mmap_lock);
|
||||
|
||||
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
|
||||
if (IS_ERR(ptr))
|
||||
@ -338,16 +331,12 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
switch (offset & IORING_OFF_MMAP_MASK) {
|
||||
case IORING_OFF_SQ_RING:
|
||||
case IORING_OFF_CQ_RING:
|
||||
npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
|
||||
return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
|
||||
case IORING_OFF_SQES:
|
||||
return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
|
||||
ctx->n_sqe_pages);
|
||||
case IORING_OFF_PBUF_RING:
|
||||
return io_pbuf_mmap(file, vma);
|
||||
page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
break;
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
region = io_mmap_get_region(ctx, vma->vm_pgoff);
|
||||
return io_region_mmap(ctx, region, vma, page_limit);
|
||||
}
|
||||
|
||||
unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
|
||||
@ -365,7 +354,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
|
||||
if (addr)
|
||||
return -EINVAL;
|
||||
|
||||
guard(mutex)(&ctx->resize_lock);
|
||||
guard(mutex)(&ctx->mmap_lock);
|
||||
|
||||
ptr = io_uring_validate_mmap_request(filp, pgoff, len);
|
||||
if (IS_ERR(ptr))
|
||||
@ -415,7 +404,7 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
|
||||
struct io_ring_ctx *ctx = file->private_data;
|
||||
void *ptr;
|
||||
|
||||
guard(mutex)(&ctx->resize_lock);
|
||||
guard(mutex)(&ctx->mmap_lock);
|
||||
|
||||
ptr = io_uring_validate_mmap_request(file, pgoff, len);
|
||||
if (IS_ERR(ptr))
|
||||
|
@ -1,18 +1,9 @@
|
||||
#ifndef IO_URING_MEMMAP_H
|
||||
#define IO_URING_MEMMAP_H
|
||||
|
||||
#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
|
||||
|
||||
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
|
||||
void io_pages_free(struct page ***pages, int npages);
|
||||
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
|
||||
struct page **pages, int npages);
|
||||
|
||||
void *io_pages_map(struct page ***out_pages, unsigned short *npages,
|
||||
size_t size);
|
||||
void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
|
||||
bool put_pages);
|
||||
|
||||
void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
|
||||
unsigned long uaddr, size_t size);
|
||||
|
||||
#ifndef CONFIG_MMU
|
||||
unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
|
||||
@ -24,11 +15,17 @@ int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
|
||||
|
||||
void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
|
||||
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
|
||||
struct io_uring_region_desc *reg);
|
||||
struct io_uring_region_desc *reg,
|
||||
unsigned long mmap_offset);
|
||||
|
||||
int io_create_region_mmap_safe(struct io_ring_ctx *ctx,
|
||||
struct io_mapped_region *mr,
|
||||
struct io_uring_region_desc *reg,
|
||||
unsigned long mmap_offset);
|
||||
|
||||
static inline void *io_region_get_ptr(struct io_mapped_region *mr)
|
||||
{
|
||||
return mr->vmap_ptr;
|
||||
return mr->ptr;
|
||||
}
|
||||
|
||||
static inline bool io_region_is_set(struct io_mapped_region *mr)
|
||||
|
@ -354,10 +354,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
|
||||
return __io_msg_ring_data(fd_file(f)->private_data,
|
||||
&io_msg, IO_URING_F_UNLOCKED);
|
||||
}
|
||||
|
||||
void io_msg_cache_free(const void *entry)
|
||||
{
|
||||
struct io_kiocb *req = (struct io_kiocb *) entry;
|
||||
|
||||
kmem_cache_free(req_cachep, req);
|
||||
}
|
||||
|
@ -4,4 +4,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe);
|
||||
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
|
||||
void io_msg_ring_cleanup(struct io_kiocb *req);
|
||||
void io_msg_cache_free(const void *entry);
|
||||
|
@ -155,30 +155,31 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
|
||||
}
|
||||
}
|
||||
|
||||
static void io_msg_async_data_init(void *obj)
|
||||
{
|
||||
struct io_async_msghdr *hdr = (struct io_async_msghdr *)obj;
|
||||
|
||||
hdr->free_iov = NULL;
|
||||
hdr->free_iov_nr = 0;
|
||||
}
|
||||
|
||||
static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_async_msghdr *hdr;
|
||||
|
||||
hdr = io_alloc_cache_get(&ctx->netmsg_cache);
|
||||
if (hdr) {
|
||||
if (hdr->free_iov) {
|
||||
kasan_mempool_unpoison_object(hdr->free_iov,
|
||||
hdr->free_iov_nr * sizeof(struct iovec));
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
}
|
||||
req->flags |= REQ_F_ASYNC_DATA;
|
||||
req->async_data = hdr;
|
||||
return hdr;
|
||||
}
|
||||
hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req,
|
||||
io_msg_async_data_init);
|
||||
if (!hdr)
|
||||
return NULL;
|
||||
|
||||
if (!io_alloc_async_data(req)) {
|
||||
hdr = req->async_data;
|
||||
hdr->free_iov_nr = 0;
|
||||
hdr->free_iov = NULL;
|
||||
return hdr;
|
||||
/* If the async data was cached, we might have an iov cached inside. */
|
||||
if (hdr->free_iov) {
|
||||
kasan_mempool_unpoison_object(hdr->free_iov,
|
||||
hdr->free_iov_nr * sizeof(struct iovec));
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
}
|
||||
return NULL;
|
||||
return hdr;
|
||||
}
|
||||
|
||||
/* assign new iovec to kmsg, if we need to */
|
||||
|
@ -648,15 +648,12 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
|
||||
if (req->flags & REQ_F_POLLED) {
|
||||
apoll = req->apoll;
|
||||
kfree(apoll->double_poll);
|
||||
} else if (!(issue_flags & IO_URING_F_UNLOCKED)) {
|
||||
apoll = io_alloc_cache_get(&ctx->apoll_cache);
|
||||
if (!apoll)
|
||||
goto alloc_apoll;
|
||||
apoll->poll.retries = APOLL_MAX_RETRY;
|
||||
} else {
|
||||
alloc_apoll:
|
||||
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
|
||||
if (unlikely(!apoll))
|
||||
if (!(issue_flags & IO_URING_F_UNLOCKED))
|
||||
apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC, NULL);
|
||||
else
|
||||
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
|
||||
if (!apoll)
|
||||
return NULL;
|
||||
apoll->poll.retries = APOLL_MAX_RETRY;
|
||||
}
|
||||
|
@ -104,21 +104,13 @@ static int io_register_personality(struct io_ring_ctx *ctx)
|
||||
return id;
|
||||
}
|
||||
|
||||
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
|
||||
void __user *arg, unsigned int nr_args)
|
||||
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
|
||||
struct io_restriction *restrictions)
|
||||
{
|
||||
struct io_uring_restriction *res;
|
||||
size_t size;
|
||||
int i, ret;
|
||||
|
||||
/* Restrictions allowed only if rings started disabled */
|
||||
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
|
||||
return -EBADFD;
|
||||
|
||||
/* We allow only a single restrictions registration */
|
||||
if (ctx->restrictions.registered)
|
||||
return -EBUSY;
|
||||
|
||||
if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
|
||||
return -EINVAL;
|
||||
|
||||
@ -130,47 +122,57 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
|
||||
if (IS_ERR(res))
|
||||
return PTR_ERR(res);
|
||||
|
||||
ret = 0;
|
||||
ret = -EINVAL;
|
||||
|
||||
for (i = 0; i < nr_args; i++) {
|
||||
switch (res[i].opcode) {
|
||||
case IORING_RESTRICTION_REGISTER_OP:
|
||||
if (res[i].register_op >= IORING_REGISTER_LAST) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
__set_bit(res[i].register_op,
|
||||
ctx->restrictions.register_op);
|
||||
if (res[i].register_op >= IORING_REGISTER_LAST)
|
||||
goto err;
|
||||
__set_bit(res[i].register_op, restrictions->register_op);
|
||||
break;
|
||||
case IORING_RESTRICTION_SQE_OP:
|
||||
if (res[i].sqe_op >= IORING_OP_LAST) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
|
||||
if (res[i].sqe_op >= IORING_OP_LAST)
|
||||
goto err;
|
||||
__set_bit(res[i].sqe_op, restrictions->sqe_op);
|
||||
break;
|
||||
case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
|
||||
ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
|
||||
restrictions->sqe_flags_allowed = res[i].sqe_flags;
|
||||
break;
|
||||
case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
|
||||
ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
|
||||
restrictions->sqe_flags_required = res[i].sqe_flags;
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
ret = 0;
|
||||
|
||||
err:
|
||||
kfree(res);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
|
||||
void __user *arg, unsigned int nr_args)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* Restrictions allowed only if rings started disabled */
|
||||
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
|
||||
return -EBADFD;
|
||||
|
||||
/* We allow only a single restrictions registration */
|
||||
if (ctx->restrictions.registered)
|
||||
return -EBUSY;
|
||||
|
||||
ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
|
||||
/* Reset all restrictions if an error happened */
|
||||
if (ret != 0)
|
||||
memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
|
||||
else
|
||||
ctx->restrictions.registered = true;
|
||||
|
||||
kfree(res);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -367,28 +369,19 @@ static int io_register_clock(struct io_ring_ctx *ctx,
|
||||
* either mapping or freeing.
|
||||
*/
|
||||
struct io_ring_ctx_rings {
|
||||
unsigned short n_ring_pages;
|
||||
unsigned short n_sqe_pages;
|
||||
struct page **ring_pages;
|
||||
struct page **sqe_pages;
|
||||
struct io_uring_sqe *sq_sqes;
|
||||
struct io_rings *rings;
|
||||
struct io_uring_sqe *sq_sqes;
|
||||
|
||||
struct io_mapped_region sq_region;
|
||||
struct io_mapped_region ring_region;
|
||||
};
|
||||
|
||||
static void io_register_free_rings(struct io_uring_params *p,
|
||||
static void io_register_free_rings(struct io_ring_ctx *ctx,
|
||||
struct io_uring_params *p,
|
||||
struct io_ring_ctx_rings *r)
|
||||
{
|
||||
if (!(p->flags & IORING_SETUP_NO_MMAP)) {
|
||||
io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
|
||||
true);
|
||||
io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
|
||||
true);
|
||||
} else {
|
||||
io_pages_free(&r->ring_pages, r->n_ring_pages);
|
||||
io_pages_free(&r->sqe_pages, r->n_sqe_pages);
|
||||
vunmap(r->rings);
|
||||
vunmap(r->sq_sqes);
|
||||
}
|
||||
io_free_region(ctx, &r->sq_region);
|
||||
io_free_region(ctx, &r->ring_region);
|
||||
}
|
||||
|
||||
#define swap_old(ctx, o, n, field) \
|
||||
@ -403,11 +396,11 @@ static void io_register_free_rings(struct io_uring_params *p,
|
||||
|
||||
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
|
||||
{
|
||||
struct io_uring_region_desc rd;
|
||||
struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
|
||||
size_t size, sq_array_offset;
|
||||
unsigned i, tail, old_head;
|
||||
struct io_uring_params p;
|
||||
void *ptr;
|
||||
int ret;
|
||||
|
||||
/* for single issuer, must be owner resizing */
|
||||
@ -441,13 +434,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
|
||||
if (size == SIZE_MAX)
|
||||
return -EOVERFLOW;
|
||||
|
||||
if (!(p.flags & IORING_SETUP_NO_MMAP))
|
||||
n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
|
||||
else
|
||||
n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
|
||||
p.cq_off.user_addr, size);
|
||||
if (IS_ERR(n.rings))
|
||||
return PTR_ERR(n.rings);
|
||||
memset(&rd, 0, sizeof(rd));
|
||||
rd.size = PAGE_ALIGN(size);
|
||||
if (p.flags & IORING_SETUP_NO_MMAP) {
|
||||
rd.user_addr = p.cq_off.user_addr;
|
||||
rd.flags |= IORING_MEM_REGION_TYPE_USER;
|
||||
}
|
||||
ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
|
||||
if (ret) {
|
||||
io_register_free_rings(ctx, &p, &n);
|
||||
return ret;
|
||||
}
|
||||
n.rings = io_region_get_ptr(&n.ring_region);
|
||||
|
||||
/*
|
||||
* At this point n.rings is shared with userspace, just like o.rings
|
||||
@ -463,7 +461,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
|
||||
WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);
|
||||
|
||||
if (copy_to_user(arg, &p, sizeof(p))) {
|
||||
io_register_free_rings(&p, &n);
|
||||
io_register_free_rings(ctx, &p, &n);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
@ -472,20 +470,22 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
|
||||
else
|
||||
size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
|
||||
if (size == SIZE_MAX) {
|
||||
io_register_free_rings(&p, &n);
|
||||
io_register_free_rings(ctx, &p, &n);
|
||||
return -EOVERFLOW;
|
||||
}
|
||||
|
||||
if (!(p.flags & IORING_SETUP_NO_MMAP))
|
||||
ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
|
||||
else
|
||||
ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
|
||||
p.sq_off.user_addr,
|
||||
size);
|
||||
if (IS_ERR(ptr)) {
|
||||
io_register_free_rings(&p, &n);
|
||||
return PTR_ERR(ptr);
|
||||
memset(&rd, 0, sizeof(rd));
|
||||
rd.size = PAGE_ALIGN(size);
|
||||
if (p.flags & IORING_SETUP_NO_MMAP) {
|
||||
rd.user_addr = p.sq_off.user_addr;
|
||||
rd.flags |= IORING_MEM_REGION_TYPE_USER;
|
||||
}
|
||||
ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
|
||||
if (ret) {
|
||||
io_register_free_rings(ctx, &p, &n);
|
||||
return ret;
|
||||
}
|
||||
n.sq_sqes = io_region_get_ptr(&n.sq_region);
|
||||
|
||||
/*
|
||||
* If using SQPOLL, park the thread
|
||||
@ -497,15 +497,15 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
|
||||
}
|
||||
|
||||
/*
|
||||
* We'll do the swap. Grab the ctx->resize_lock, which will exclude
|
||||
* We'll do the swap. Grab the ctx->mmap_lock, which will exclude
|
||||
* any new mmap's on the ring fd. Clear out existing mappings to prevent
|
||||
* mmap from seeing them, as we'll unmap them. Any attempt to mmap
|
||||
* existing rings beyond this point will fail. Not that it could proceed
|
||||
* at this point anyway, as the io_uring mmap side needs to grab the
|
||||
* ctx->resize_lock as well. Likewise, hold the completion lock over the
|
||||
* ctx->mmap_lock as well. Likewise, hold the completion lock over the
|
||||
* duration of the actual swap.
|
||||
*/
|
||||
mutex_lock(&ctx->resize_lock);
|
||||
mutex_lock(&ctx->mmap_lock);
|
||||
spin_lock(&ctx->completion_lock);
|
||||
o.rings = ctx->rings;
|
||||
ctx->rings = NULL;
|
||||
@ -516,7 +516,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
|
||||
* Now copy SQ and CQ entries, if any. If either of the destination
|
||||
* rings can't hold what is already there, then fail the operation.
|
||||
*/
|
||||
n.sq_sqes = ptr;
|
||||
tail = READ_ONCE(o.rings->sq.tail);
|
||||
old_head = READ_ONCE(o.rings->sq.head);
|
||||
if (tail - old_head > p.sq_entries)
|
||||
@ -527,8 +526,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
|
||||
|
||||
n.sq_sqes[dst_head] = o.sq_sqes[src_head];
|
||||
}
|
||||
WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
|
||||
WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));
|
||||
WRITE_ONCE(n.rings->sq.head, old_head);
|
||||
WRITE_ONCE(n.rings->sq.tail, tail);
|
||||
|
||||
tail = READ_ONCE(o.rings->cq.tail);
|
||||
old_head = READ_ONCE(o.rings->cq.head);
|
||||
@ -547,8 +546,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
|
||||
|
||||
n.rings->cqes[dst_head] = o.rings->cqes[src_head];
|
||||
}
|
||||
WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head));
|
||||
WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail));
|
||||
WRITE_ONCE(n.rings->cq.head, old_head);
|
||||
WRITE_ONCE(n.rings->cq.tail, tail);
|
||||
/* invalidate cached cqe refill */
|
||||
ctx->cqe_cached = ctx->cqe_sentinel = NULL;
|
||||
|
||||
@ -566,16 +565,14 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
|
||||
|
||||
ctx->rings = n.rings;
|
||||
ctx->sq_sqes = n.sq_sqes;
|
||||
swap_old(ctx, o, n, n_ring_pages);
|
||||
swap_old(ctx, o, n, n_sqe_pages);
|
||||
swap_old(ctx, o, n, ring_pages);
|
||||
swap_old(ctx, o, n, sqe_pages);
|
||||
swap_old(ctx, o, n, ring_region);
|
||||
swap_old(ctx, o, n, sq_region);
|
||||
to_free = &o;
|
||||
ret = 0;
|
||||
out:
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
mutex_unlock(&ctx->resize_lock);
|
||||
io_register_free_rings(&p, to_free);
|
||||
mutex_unlock(&ctx->mmap_lock);
|
||||
io_register_free_rings(ctx, &p, to_free);
|
||||
|
||||
if (ctx->sq_data)
|
||||
io_sq_thread_unpark(ctx->sq_data);
|
||||
@ -598,7 +595,6 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
|
||||
rd_uptr = u64_to_user_ptr(reg.region_uptr);
|
||||
if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
|
||||
return -EFAULT;
|
||||
|
||||
if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
|
||||
return -EINVAL;
|
||||
if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
|
||||
@ -613,7 +609,8 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
|
||||
!(ctx->flags & IORING_SETUP_R_DISABLED))
|
||||
return -EINVAL;
|
||||
|
||||
ret = io_create_region(ctx, &ctx->param_region, &rd);
|
||||
ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
|
||||
IORING_MAP_OFF_PARAM_REGION);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
|
||||
|
@ -626,11 +626,12 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
|
||||
struct io_imu_folio_data *data, int nr_folios)
|
||||
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
|
||||
struct io_imu_folio_data *data)
|
||||
{
|
||||
struct page **page_array = *pages, **new_array = NULL;
|
||||
int nr_pages_left = *nr_pages, i, j;
|
||||
int nr_folios = data->nr_folios;
|
||||
|
||||
/* Store head pages only */
|
||||
new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
|
||||
@ -667,27 +668,21 @@ static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
|
||||
struct io_imu_folio_data *data)
|
||||
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
|
||||
struct io_imu_folio_data *data)
|
||||
{
|
||||
struct page **page_array = *pages;
|
||||
struct folio *folio = page_folio(page_array[0]);
|
||||
unsigned int count = 1, nr_folios = 1;
|
||||
int i;
|
||||
|
||||
if (*nr_pages <= 1)
|
||||
return false;
|
||||
|
||||
data->nr_pages_mid = folio_nr_pages(folio);
|
||||
if (data->nr_pages_mid == 1)
|
||||
return false;
|
||||
|
||||
data->folio_shift = folio_shift(folio);
|
||||
|
||||
/*
|
||||
* Check if pages are contiguous inside a folio, and all folios have
|
||||
* the same page count except for the head and tail.
|
||||
*/
|
||||
for (i = 1; i < *nr_pages; i++) {
|
||||
for (i = 1; i < nr_pages; i++) {
|
||||
if (page_folio(page_array[i]) == folio &&
|
||||
page_array[i] == page_array[i-1] + 1) {
|
||||
count++;
|
||||
@ -715,7 +710,8 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
|
||||
if (nr_folios == 1)
|
||||
data->nr_pages_head = count;
|
||||
|
||||
return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
|
||||
data->nr_folios = nr_folios;
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
|
||||
@ -729,7 +725,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
|
||||
size_t size;
|
||||
int ret, nr_pages, i;
|
||||
struct io_imu_folio_data data;
|
||||
bool coalesced;
|
||||
bool coalesced = false;
|
||||
|
||||
if (!iov->iov_base)
|
||||
return NULL;
|
||||
@ -749,7 +745,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
|
||||
}
|
||||
|
||||
/* If it's huge page(s), try to coalesce them into fewer bvec entries */
|
||||
coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
|
||||
if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
|
||||
if (data.nr_pages_mid != 1)
|
||||
coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
|
||||
}
|
||||
|
||||
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
|
||||
if (!imu)
|
||||
@ -883,7 +882,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
|
||||
* and advance us to the beginning.
|
||||
*/
|
||||
offset = buf_addr - imu->ubuf;
|
||||
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
|
||||
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len);
|
||||
|
||||
if (offset) {
|
||||
/*
|
||||
@ -905,7 +904,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
|
||||
const struct bio_vec *bvec = imu->bvec;
|
||||
|
||||
if (offset < bvec->bv_len) {
|
||||
iter->count -= offset;
|
||||
iter->iov_offset = offset;
|
||||
} else {
|
||||
unsigned long seg_skip;
|
||||
@ -916,7 +914,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
|
||||
|
||||
iter->bvec += seg_skip;
|
||||
iter->nr_segs -= seg_skip;
|
||||
iter->count -= bvec->bv_len + offset;
|
||||
iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
|
||||
}
|
||||
}
|
||||
@ -931,6 +928,13 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
|
||||
int i, ret, off, nr;
|
||||
unsigned int nbufs;
|
||||
|
||||
/*
|
||||
* Accounting state is shared between the two rings; that only works if
|
||||
* both rings are accounted towards the same counters.
|
||||
*/
|
||||
if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
|
||||
return -EINVAL;
|
||||
|
||||
/* if offsets are given, must have nr specified too */
|
||||
if (!arg->nr && (arg->dst_off || arg->src_off))
|
||||
return -EINVAL;
|
||||
|
@ -40,6 +40,7 @@ struct io_imu_folio_data {
|
||||
/* For non-head/tail folios, has to be fully included */
|
||||
unsigned int nr_pages_mid;
|
||||
unsigned int folio_shift;
|
||||
unsigned int nr_folios;
|
||||
};
|
||||
|
||||
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
|
||||
@ -66,6 +67,9 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
|
||||
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
|
||||
unsigned int size, unsigned int type);
|
||||
|
||||
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
|
||||
struct io_imu_folio_data *data);
|
||||
|
||||
static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
|
||||
int index)
|
||||
{
|
||||
|
212
io_uring/rw.c
212
io_uring/rw.c
@ -202,45 +202,40 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
|
||||
* mean that the underlying data can be gone at any time. But that
|
||||
* should be fixed separately, and then this check could be killed.
|
||||
*/
|
||||
if (!(req->flags & REQ_F_REFCOUNT)) {
|
||||
if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) {
|
||||
req->flags &= ~REQ_F_NEED_CLEANUP;
|
||||
io_rw_recycle(req, issue_flags);
|
||||
}
|
||||
}
|
||||
|
||||
static void io_rw_async_data_init(void *obj)
|
||||
{
|
||||
struct io_async_rw *rw = (struct io_async_rw *)obj;
|
||||
|
||||
rw->free_iovec = NULL;
|
||||
rw->bytes_done = 0;
|
||||
}
|
||||
|
||||
static int io_rw_alloc_async(struct io_kiocb *req)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_async_rw *rw;
|
||||
|
||||
rw = io_alloc_cache_get(&ctx->rw_cache);
|
||||
if (rw) {
|
||||
if (rw->free_iovec) {
|
||||
kasan_mempool_unpoison_object(rw->free_iovec,
|
||||
rw->free_iov_nr * sizeof(struct iovec));
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
}
|
||||
req->flags |= REQ_F_ASYNC_DATA;
|
||||
req->async_data = rw;
|
||||
goto done;
|
||||
rw = io_uring_alloc_async_data(&ctx->rw_cache, req, io_rw_async_data_init);
|
||||
if (!rw)
|
||||
return -ENOMEM;
|
||||
if (rw->free_iovec) {
|
||||
kasan_mempool_unpoison_object(rw->free_iovec,
|
||||
rw->free_iov_nr * sizeof(struct iovec));
|
||||
req->flags |= REQ_F_NEED_CLEANUP;
|
||||
}
|
||||
|
||||
if (!io_alloc_async_data(req)) {
|
||||
rw = req->async_data;
|
||||
rw->free_iovec = NULL;
|
||||
rw->free_iov_nr = 0;
|
||||
done:
|
||||
rw->bytes_done = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -ENOMEM;
|
||||
rw->bytes_done = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
|
||||
{
|
||||
struct io_async_rw *rw;
|
||||
int ret;
|
||||
|
||||
if (io_rw_alloc_async(req))
|
||||
return -ENOMEM;
|
||||
@ -249,12 +244,48 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
|
||||
return 0;
|
||||
|
||||
rw = req->async_data;
|
||||
ret = io_import_iovec(ddir, req, rw, 0);
|
||||
return io_import_iovec(ddir, req, rw, 0);
|
||||
}
|
||||
|
||||
static inline void io_meta_save_state(struct io_async_rw *io)
|
||||
{
|
||||
io->meta_state.seed = io->meta.seed;
|
||||
iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
|
||||
}
|
||||
|
||||
static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb)
|
||||
{
|
||||
if (kiocb->ki_flags & IOCB_HAS_METADATA) {
|
||||
io->meta.seed = io->meta_state.seed;
|
||||
iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Set up the PI metadata attribute for a read/write request.
 *
 * Copies a struct io_uring_attr_pi from the user pointer @attr_ptr, rejects
 * a non-zero reserved field, stores flags/app_tag/seed in the request's
 * async data and imports the user metadata buffer into io->meta.iter.
 * On success the request is flagged REQ_F_HAS_METADATA and the initial
 * metadata state is saved so io_meta_restore() can rewind it on a retry.
 *
 * @rw and @attr_type_mask are accepted for the caller's convenience and are
 * not consulted here.
 *
 * Returns -EFAULT if the attribute cannot be copied from userspace, -EINVAL
 * if the reserved field is set, a negative error from import_ubuf() on
 * import failure, and otherwise the import_ubuf() result.
 */
static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
			 u64 attr_ptr, u64 attr_type_mask)
{
	struct io_uring_attr_pi pi_attr;
	struct io_async_rw *io;
	int ret;

	if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr),
			   sizeof(pi_attr)))
		return -EFAULT;

	if (pi_attr.rsvd)
		return -EINVAL;

	io = req->async_data;
	io->meta.flags = pi_attr.flags;
	io->meta.app_tag = pi_attr.app_tag;
	io->meta.seed = pi_attr.seed;
	ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr),
			  pi_attr.len, &io->meta.iter);
	if (unlikely(ret < 0))
		return ret;

	/*
	 * Flag the request and snapshot the metadata state only once the
	 * import has succeeded; io_rw_init_file() and io_rw_should_retry()
	 * key off REQ_F_HAS_METADATA, and io_meta_restore() relies on the
	 * saved state.
	 */
	req->flags |= REQ_F_HAS_METADATA;
	io_meta_save_state(io);
	return ret;
}
|
||||
|
||||
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
|
||||
@ -262,6 +293,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
|
||||
{
|
||||
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
||||
unsigned ioprio;
|
||||
u64 attr_type_mask;
|
||||
int ret;
|
||||
|
||||
rw->kiocb.ki_pos = READ_ONCE(sqe->off);
|
||||
@ -279,11 +311,28 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
|
||||
rw->kiocb.ki_ioprio = get_current_ioprio();
|
||||
}
|
||||
rw->kiocb.dio_complete = NULL;
|
||||
rw->kiocb.ki_flags = 0;
|
||||
|
||||
rw->addr = READ_ONCE(sqe->addr);
|
||||
rw->len = READ_ONCE(sqe->len);
|
||||
rw->flags = READ_ONCE(sqe->rw_flags);
|
||||
return io_prep_rw_setup(req, ddir, do_import);
|
||||
ret = io_prep_rw_setup(req, ddir, do_import);
|
||||
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
attr_type_mask = READ_ONCE(sqe->attr_type_mask);
|
||||
if (attr_type_mask) {
|
||||
u64 attr_ptr;
|
||||
|
||||
/* only PI attribute is supported currently */
|
||||
if (attr_type_mask != IORING_RW_ATTR_FLAG_PI)
|
||||
return -EINVAL;
|
||||
|
||||
attr_ptr = READ_ONCE(sqe->attr_ptr);
|
||||
ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
@ -385,7 +434,8 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
|
||||
void io_readv_writev_cleanup(struct io_kiocb *req)
|
||||
{
|
||||
io_rw_iovec_free(req->async_data);
|
||||
lockdep_assert_held(&req->ctx->uring_lock);
|
||||
io_rw_recycle(req, 0);
|
||||
}
|
||||
|
||||
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
|
||||
@ -405,17 +455,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLOCK
|
||||
static void io_resubmit_prep(struct io_kiocb *req)
|
||||
{
|
||||
struct io_async_rw *io = req->async_data;
|
||||
|
||||
iov_iter_restore(&io->iter, &io->iter_state);
|
||||
}
|
||||
|
||||
static bool io_rw_should_reissue(struct io_kiocb *req)
|
||||
{
|
||||
#ifdef CONFIG_BLOCK
|
||||
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
||||
umode_t mode = file_inode(req->file)->i_mode;
|
||||
struct io_async_rw *io = req->async_data;
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
if (!S_ISBLK(mode) && !S_ISREG(mode))
|
||||
@ -430,23 +475,14 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
|
||||
*/
|
||||
if (percpu_ref_is_dying(&ctx->refs))
|
||||
return false;
|
||||
/*
|
||||
* Play it safe and assume not safe to re-import and reissue if we're
|
||||
* not in the original thread group (or in task context).
|
||||
*/
|
||||
if (!same_thread_group(req->tctx->task, current) || !in_task())
|
||||
return false;
|
||||
|
||||
io_meta_restore(io, &rw->kiocb);
|
||||
iov_iter_restore(&io->iter, &io->iter_state);
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
static void io_resubmit_prep(struct io_kiocb *req)
|
||||
{
|
||||
}
|
||||
static bool io_rw_should_reissue(struct io_kiocb *req)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void io_req_end_write(struct io_kiocb *req)
|
||||
{
|
||||
@ -473,22 +509,16 @@ static void io_req_io_end(struct io_kiocb *req)
|
||||
}
|
||||
}
|
||||
|
||||
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
|
||||
static void __io_complete_rw_common(struct io_kiocb *req, long res)
|
||||
{
|
||||
if (unlikely(res != req->cqe.res)) {
|
||||
if (res == -EAGAIN && io_rw_should_reissue(req)) {
|
||||
/*
|
||||
* Reissue will start accounting again, finish the
|
||||
* current cycle.
|
||||
*/
|
||||
io_req_io_end(req);
|
||||
req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
|
||||
return true;
|
||||
}
|
||||
if (res == req->cqe.res)
|
||||
return;
|
||||
if (res == -EAGAIN && io_rw_should_reissue(req)) {
|
||||
req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
|
||||
} else {
|
||||
req_set_fail(req);
|
||||
req->cqe.res = res;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
|
||||
@ -531,8 +561,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
|
||||
struct io_kiocb *req = cmd_to_io_kiocb(rw);
|
||||
|
||||
if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
|
||||
if (__io_complete_rw_common(req, res))
|
||||
return;
|
||||
__io_complete_rw_common(req, res);
|
||||
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
|
||||
}
|
||||
req->io_task_work.func = io_req_rw_complete;
|
||||
@ -594,26 +623,19 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
|
||||
if (ret >= 0 && req->flags & REQ_F_CUR_POS)
|
||||
req->file->f_pos = rw->kiocb.ki_pos;
|
||||
if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
|
||||
if (!__io_complete_rw_common(req, ret)) {
|
||||
/*
|
||||
* Safe to call io_end from here as we're inline
|
||||
* from the submission path.
|
||||
*/
|
||||
io_req_io_end(req);
|
||||
io_req_set_res(req, final_ret,
|
||||
io_put_kbuf(req, ret, issue_flags));
|
||||
io_req_rw_cleanup(req, issue_flags);
|
||||
return IOU_OK;
|
||||
}
|
||||
__io_complete_rw_common(req, ret);
|
||||
/*
|
||||
* Safe to call io_end from here as we're inline
|
||||
* from the submission path.
|
||||
*/
|
||||
io_req_io_end(req);
|
||||
io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags));
|
||||
io_req_rw_cleanup(req, issue_flags);
|
||||
return IOU_OK;
|
||||
} else {
|
||||
io_rw_done(&rw->kiocb, ret);
|
||||
}
|
||||
|
||||
if (req->flags & REQ_F_REISSUE) {
|
||||
req->flags &= ~REQ_F_REISSUE;
|
||||
io_resubmit_prep(req);
|
||||
return -EAGAIN;
|
||||
}
|
||||
return IOU_ISSUE_SKIP_COMPLETE;
|
||||
}
|
||||
|
||||
@ -736,8 +758,11 @@ static bool io_rw_should_retry(struct io_kiocb *req)
|
||||
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
|
||||
struct kiocb *kiocb = &rw->kiocb;
|
||||
|
||||
/* never retry for NOWAIT, we just complete with -EAGAIN */
|
||||
if (req->flags & REQ_F_NOWAIT)
|
||||
/*
|
||||
* Never retry for NOWAIT or a request with metadata, we just complete
|
||||
* with -EAGAIN.
|
||||
*/
|
||||
if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA))
|
||||
return false;
|
||||
|
||||
/* Only for buffered IO */
|
||||
@ -828,6 +853,19 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
|
||||
kiocb->ki_complete = io_complete_rw;
|
||||
}
|
||||
|
||||
if (req->flags & REQ_F_HAS_METADATA) {
|
||||
struct io_async_rw *io = req->async_data;
|
||||
|
||||
/*
|
||||
* We have a union of meta fields with wpq used for buffered-io
|
||||
* in io_async_rw, so fail it here.
|
||||
*/
|
||||
if (!(req->file->f_flags & O_DIRECT))
|
||||
return -EOPNOTSUPP;
|
||||
kiocb->ki_flags |= IOCB_HAS_METADATA;
|
||||
kiocb->private = &io->meta;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -876,8 +914,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
|
||||
if (ret == -EOPNOTSUPP && force_nonblock)
|
||||
ret = -EAGAIN;
|
||||
|
||||
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
|
||||
req->flags &= ~REQ_F_REISSUE;
|
||||
if (ret == -EAGAIN) {
|
||||
/* If we can poll, just do that. */
|
||||
if (io_file_can_poll(req))
|
||||
return -EAGAIN;
|
||||
@ -902,6 +939,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
|
||||
* manually if we need to.
|
||||
*/
|
||||
iov_iter_restore(&io->iter, &io->iter_state);
|
||||
io_meta_restore(io, kiocb);
|
||||
|
||||
do {
|
||||
/*
|
||||
@ -1087,11 +1125,6 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
|
||||
else
|
||||
ret2 = -EINVAL;
|
||||
|
||||
if (req->flags & REQ_F_REISSUE) {
|
||||
req->flags &= ~REQ_F_REISSUE;
|
||||
ret2 = -EAGAIN;
|
||||
}
|
||||
|
||||
/*
|
||||
* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
|
||||
* retry them without IOCB_NOWAIT.
|
||||
@ -1127,6 +1160,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
|
||||
} else {
|
||||
ret_eagain:
|
||||
iov_iter_restore(&io->iter, &io->iter_state);
|
||||
io_meta_restore(io, kiocb);
|
||||
if (kiocb->ki_flags & IOCB_WRITE)
|
||||
io_req_end_write(req);
|
||||
return -EAGAIN;
|
||||
|
@ -2,6 +2,11 @@
|
||||
|
||||
#include <linux/pagemap.h>
|
||||
|
||||
struct io_meta_state {
|
||||
u32 seed;
|
||||
struct iov_iter_state iter_meta;
|
||||
};
|
||||
|
||||
struct io_async_rw {
|
||||
size_t bytes_done;
|
||||
struct iov_iter iter;
|
||||
@ -9,7 +14,14 @@ struct io_async_rw {
|
||||
struct iovec fast_iov;
|
||||
struct iovec *free_iovec;
|
||||
int free_iov_nr;
|
||||
struct wait_page_queue wpq;
|
||||
/* wpq is for buffered io, while meta fields are used with direct io */
|
||||
union {
|
||||
struct wait_page_queue wpq;
|
||||
struct {
|
||||
struct uio_meta meta;
|
||||
struct io_meta_state meta_state;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
|
@ -544,10 +544,9 @@ static int __io_timeout_prep(struct io_kiocb *req,
|
||||
|
||||
if (WARN_ON_ONCE(req_has_async_data(req)))
|
||||
return -EFAULT;
|
||||
if (io_alloc_async_data(req))
|
||||
data = io_uring_alloc_async_data_nocache(req);
|
||||
if (!data)
|
||||
return -ENOMEM;
|
||||
|
||||
data = req->async_data;
|
||||
data->req = req;
|
||||
data->flags = flags;
|
||||
|
||||
|
@ -16,26 +16,6 @@
|
||||
#include "rsrc.h"
|
||||
#include "uring_cmd.h"
|
||||
|
||||
static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req)
|
||||
{
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_uring_cmd_data *cache;
|
||||
|
||||
cache = io_alloc_cache_get(&ctx->uring_cache);
|
||||
if (cache) {
|
||||
cache->op_data = NULL;
|
||||
req->flags |= REQ_F_ASYNC_DATA;
|
||||
req->async_data = cache;
|
||||
return cache;
|
||||
}
|
||||
if (!io_alloc_async_data(req)) {
|
||||
cache = req->async_data;
|
||||
cache->op_data = NULL;
|
||||
return cache;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
||||
@ -130,7 +110,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
|
||||
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
||||
unsigned int flags = IO_URING_F_COMPLETE_DEFER;
|
||||
|
||||
if (current->flags & (PF_EXITING | PF_KTHREAD))
|
||||
if (io_should_terminate_tw())
|
||||
flags |= IO_URING_F_TASK_DEAD;
|
||||
|
||||
/* task_work executor checks the deffered list completion */
|
||||
@ -188,14 +168,22 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
|
||||
|
||||
static void io_uring_cmd_init_once(void *obj)
|
||||
{
|
||||
struct io_uring_cmd_data *data = obj;
|
||||
|
||||
data->op_data = NULL;
|
||||
}
|
||||
|
||||
static int io_uring_cmd_prep_setup(struct io_kiocb *req,
|
||||
const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
||||
struct io_uring_cmd_data *cache;
|
||||
|
||||
cache = io_uring_async_get(req);
|
||||
if (unlikely(!cache))
|
||||
cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req,
|
||||
io_uring_cmd_init_once);
|
||||
if (!cache)
|
||||
return -ENOMEM;
|
||||
|
||||
if (!(req->flags & REQ_F_FORCE_ASYNC)) {
|
||||
|
@ -303,10 +303,10 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
|
||||
struct io_waitid_async *iwa;
|
||||
int ret;
|
||||
|
||||
if (io_alloc_async_data(req))
|
||||
iwa = io_uring_alloc_async_data_nocache(req);
|
||||
if (!iwa)
|
||||
return -ENOMEM;
|
||||
|
||||
iwa = req->async_data;
|
||||
iwa->req = req;
|
||||
|
||||
ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,
|
||||
|
Loading…
Reference in New Issue
Block a user