for-6.16/io_uring-20250523

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmgwnDgQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpgHZEADA1ym0ihHRjU2kTlXXOdkLLOl+o1RCHUjr
 KNf6sELGgyDC5FL/hAWdsjonInY4MLbJW0eNHEuuK8iFcn3wSHuHPXhRJXx/4cOs
 GGVLTd+Jm8ih4UL/GeLrBe3ehW9UUOtz1TCYzho0bdXHQWjruCFTqB5OzPQFMGQW
 R/lwXVNfjgGno5JhBnsrwz3ZnAfAnJhxqmc0GFHaa/nVF1OREYW/HS75EPFNiFgp
 Aevilw5QyrA2gDlZ+zCUwaGKAEl32yZCI6LZpI4kMtPK1reEbgFTrzIaCZ/OZCYM
 DVdBVEeuOmcBYIKbitD/+fcLNXHMrSJSWvUSXR4GuRNVkCTIAcEMKM2bX8VY7gmJ
 7ZQIo0EL2mSwmewHIYnvf9w/qrNYR0NyUt2v4U4rA2wj6e5w1EYMriP94wKdBGvD
 RNxja429N3fg3aBIkdQ6iYSVJRgE7DCo7dnKrEqglZPb32LOiNoOoou9shI5tb25
 8X7u0HzbpwKY/XByXZ2IaX7PYK2iFqkJjFYlGehtF97W85LGEvkDFU6fcBdjBO8r
 umgeE5O+lR+cf68JTJ6P34A7bBg71AXO3ytIuWunG56/0yu/FHDCjhBWE5ZjEhGR
 u2YhAGPRDQsJlSlxx8TXoKyYWP55NqdeyxYrmku/fZLn5WNVXOFeRlUDAZsF7mU7
 nuiOt9j4WA==
 =k8SF
 -----END PGP SIGNATURE-----

Merge tag 'for-6.16/io_uring-20250523' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - Avoid indirect function calls in io-wq for executing and freeing
   work.

   The design of io-wq is such that it can be a generic mechanism, but
   as it's only used by io_uring now, we may as well avoid these
   indirect calls.

 - Clean up registered buffers for networking

 - Add support for IORING_OP_PIPE. Pretty straightforward, this allows
   creating pipes with io_uring, particularly useful for having these be
   instantiated as direct descriptors

 - Clean up the coalescing support for registered buffers

 - Add support for multiple interface queues for zero-copy rx
   networking. When this feature was merged for 6.15, it supported just
   a single ifq per ring

 - Clean up the eventfd support

 - Add dma-buf support to zero-copy rx

 - Clean up and improve the request draining support

 - Clean up provided buffer support, most notably with an eye toward
   making the legacy support less intrusive

 - Minor fdinfo cleanups, dropping support for dumping what credentials
   are registered

 - Improve support for overflow CQE handling, getting rid of GFP_ATOMIC
   for allocating overflow entries where possible

 - Improve detection of cases where io-wq doesn't need to spawn a new
   worker

 - Various little cleanups

* tag 'for-6.16/io_uring-20250523' of git://git.kernel.dk/linux: (59 commits)
  io_uring/cmd: warn on reg buf imports by ineligible cmds
  io_uring/io-wq: only create a new worker if it can make progress
  io_uring/io-wq: ignore non-busy worker going to sleep
  io_uring/io-wq: move hash helpers to the top
  trace/io_uring: fix io_uring_local_work_run ctx documentation
  io_uring: finish IOU_OK -> IOU_COMPLETE transition
  io_uring: add new helpers for posting overflows
  io_uring: pass in struct io_big_cqe to io_alloc_ocqe()
  io_uring: make io_alloc_ocqe() take a struct io_cqe pointer
  io_uring: split alloc and add of overflow
  io_uring: open code io_req_cqe_overflow()
  io_uring/fdinfo: get rid of dumping credentials
  io_uring/fdinfo: only compile if CONFIG_PROC_FS is set
  io_uring/kbuf: unify legacy buf provision and removal
  io_uring/kbuf: refactor __io_remove_buffers
  io_uring/kbuf: don't compute size twice on prep
  io_uring/kbuf: drop extra vars in io_register_pbuf_ring
  io_uring/kbuf: use mem_is_zero()
  io_uring/kbuf: account ring io_buffer_list memory
  io_uring: drain based on allocates reqs
  ...
Merged by Linus Torvalds, 2025-05-26 12:13:22 -07:00
commit 49fffac983
45 changed files with 966 additions and 722 deletions


@ -40,8 +40,6 @@ enum io_uring_cmd_flags {
IO_URING_F_TASK_DEAD = (1 << 13),
};
struct io_zcrx_ifq;
struct io_wq_work_node {
struct io_wq_work_node *next;
};
@ -343,7 +341,6 @@ struct io_ring_ctx {
unsigned cached_cq_tail;
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
unsigned cq_extra;
void *cq_wait_arg;
size_t cq_wait_size;
@ -394,7 +391,8 @@ struct io_ring_ctx {
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
struct io_zcrx_ifq *ifq;
/* Stores zcrx object pointers of type struct io_zcrx_ifq */
struct xarray zcrx_ctxs;
u32 pers_next;
struct xarray personalities;
@ -418,6 +416,7 @@ struct io_ring_ctx {
struct callback_head poll_wq_task_work;
struct list_head defer_list;
unsigned nr_drained;
struct io_alloc_cache msg_cache;
spinlock_t msg_lock;
@ -436,6 +435,7 @@ struct io_ring_ctx {
/* protected by ->completion_lock */
unsigned evfd_last_cq_tail;
unsigned nr_req_allocated;
/*
* Protection for resize vs mmap races - both the mmap and resize
@ -448,8 +448,6 @@ struct io_ring_ctx {
struct io_mapped_region ring_region;
/* used for optimised request parameter and wait argument passing */
struct io_mapped_region param_region;
/* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */
struct io_mapped_region zcrx_region;
};
/*
@ -653,8 +651,7 @@ struct io_kiocb {
u8 iopoll_completed;
/*
* Can be either a fixed buffer index, or used with provided buffers.
* For the latter, before issue it points to the buffer group ID,
* and after selection it points to the buffer ID itself.
* For the latter, it points to the selected buffer ID.
*/
u16 buf_index;
@ -713,7 +710,7 @@ struct io_kiocb {
const struct cred *creds;
struct io_wq_work work;
struct {
struct io_big_cqe {
u64 extra1;
u64 extra2;
} big_cqe;


@ -645,7 +645,7 @@ TRACE_EVENT(io_uring_short_write,
/*
* io_uring_local_work_run - ran ring local task work
*
* @tctx: pointer to a io_uring_ctx
* @ctx: pointer to an io_ring_ctx
* @count: how many functions it ran
* @loops: how many loops it ran
*


@ -73,6 +73,7 @@ struct io_uring_sqe {
__u32 futex_flags;
__u32 install_fd_flags;
__u32 nop_flags;
__u32 pipe_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@ -287,6 +288,7 @@ enum io_uring_op {
IORING_OP_EPOLL_WAIT,
IORING_OP_READV_FIXED,
IORING_OP_WRITEV_FIXED,
IORING_OP_PIPE,
/* this goes last, obviously */
IORING_OP_LAST,
@ -992,12 +994,16 @@ struct io_uring_zcrx_offsets {
__u64 __resv[2];
};
enum io_uring_zcrx_area_flags {
IORING_ZCRX_AREA_DMABUF = 1,
};
struct io_uring_zcrx_area_reg {
__u64 addr;
__u64 len;
__u64 rq_area_token;
__u32 flags;
__u32 __resv1;
__u32 dmabuf_fd;
__u64 __resv2[2];
};
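
For reference, a minimal userspace sketch of how the new dma-buf fields above might be filled when describing a zero-copy rx area. This is an assumption-laden illustration: the enclosing IORING_REGISTER_ZCRX_IFQ registration call, the dmabuf_size and dmabuf_fd variables, and the use of addr as an offset into the dma-buf are not part of this diff.

/* Hypothetical sketch: a dma-buf backed zcrx area using the fields added
 * above. Error handling and the surrounding ifq registration are omitted. */
struct io_uring_zcrx_area_reg area = {
	.addr      = 0,                       /* assumed: offset into the dma-buf */
	.len       = dmabuf_size,             /* bytes of the dma-buf to expose */
	.flags     = IORING_ZCRX_AREA_DMABUF,
	.dmabuf_fd = dmabuf_fd,               /* fd from the exporting driver */
};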


@ -7,11 +7,11 @@ GCOV_PROFILE := y
endif
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
tctx.o filetable.o rw.o net.o poll.o \
tctx.o filetable.o rw.o poll.o \
eventfd.o uring_cmd.o openclose.o \
sqpoll.o xattr.o nop.o fs.o splice.o \
sync.o msg_ring.o advise.o openclose.o \
statx.o timeout.o fdinfo.o cancel.o \
statx.o timeout.o cancel.o \
waitid.o register.o truncate.o \
memmap.o alloc_cache.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
@ -19,3 +19,5 @@ obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_EPOLL) += epoll.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
obj-$(CONFIG_NET) += net.o cmd_net.o
obj-$(CONFIG_PROC_FS) += fdinfo.o


@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
#else
return -EOPNOTSUPP;
#endif
@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}


@ -229,7 +229,7 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
static int __io_sync_cancel(struct io_uring_task *tctx,

io_uring/cmd_net.c (new file)

@ -0,0 +1,83 @@
#include <asm/ioctls.h>
#include <linux/io_uring/net.h>
#include <net/sock.h>
#include "uring_cmd.h"
static inline int io_uring_cmd_getsockopt(struct socket *sock,
struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
const struct io_uring_sqe *sqe = cmd->sqe;
bool compat = !!(issue_flags & IO_URING_F_COMPAT);
int optlen, optname, level, err;
void __user *optval;
level = READ_ONCE(sqe->level);
if (level != SOL_SOCKET)
return -EOPNOTSUPP;
optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
optname = READ_ONCE(sqe->optname);
optlen = READ_ONCE(sqe->optlen);
err = do_sock_getsockopt(sock, compat, level, optname,
USER_SOCKPTR(optval),
KERNEL_SOCKPTR(&optlen));
if (err)
return err;
/* On success, return optlen */
return optlen;
}
static inline int io_uring_cmd_setsockopt(struct socket *sock,
struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
const struct io_uring_sqe *sqe = cmd->sqe;
bool compat = !!(issue_flags & IO_URING_F_COMPAT);
int optname, optlen, level;
void __user *optval;
sockptr_t optval_s;
optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
optname = READ_ONCE(sqe->optname);
optlen = READ_ONCE(sqe->optlen);
level = READ_ONCE(sqe->level);
optval_s = USER_SOCKPTR(optval);
return do_sock_setsockopt(sock, compat, level, optname, optval_s,
optlen);
}
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct socket *sock = cmd->file->private_data;
struct sock *sk = sock->sk;
struct proto *prot = READ_ONCE(sk->sk_prot);
int ret, arg = 0;
if (!prot || !prot->ioctl)
return -EOPNOTSUPP;
switch (cmd->cmd_op) {
case SOCKET_URING_OP_SIOCINQ:
ret = prot->ioctl(sk, SIOCINQ, &arg);
if (ret)
return ret;
return arg;
case SOCKET_URING_OP_SIOCOUTQ:
ret = prot->ioctl(sk, SIOCOUTQ, &arg);
if (ret)
return ret;
return arg;
case SOCKET_URING_OP_GETSOCKOPT:
return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
case SOCKET_URING_OP_SETSOCKOPT:
return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
default:
return -EOPNOTSUPP;
}
}
EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
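
For context, a hedged userspace sketch of driving these socket commands; it assumes liburing's io_uring_prep_cmd_sock() helper and an already initialized ring, neither of which is defined in this diff.

/* Hedged example: ask how many bytes are queued for reading on a socket
 * (SOCKET_URING_OP_SIOCINQ). cqe->res is the byte count on success, or a
 * negative errno on failure, mirroring the return values above. */
#include <errno.h>
#include <liburing.h>

static int queued_bytes(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_cmd_sock(sqe, SOCKET_URING_OP_SIOCINQ, sockfd, 0, 0, NULL, 0);
	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret;
}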


@ -61,7 +61,7 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@ -88,5 +88,5 @@ int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}


@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
io_eventfd_put(ev_fd);
}
static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
{
if (put_ref)
io_eventfd_put(ev_fd);
rcu_read_unlock();
}
/*
* Returns true if the caller should put the ev_fd reference, false if not.
*/
@ -72,63 +65,34 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
/*
* Trigger if eventfd_async isn't set, or if it's set and the caller is
* an async worker. If ev_fd isn't valid, obviously return false.
* an async worker.
*/
static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
{
if (ev_fd)
return !ev_fd->eventfd_async || io_wq_current_is_worker();
return false;
return !ev_fd->eventfd_async || io_wq_current_is_worker();
}
/*
* On success, returns with an ev_fd reference grabbed and the RCU read
* lock held.
*/
static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
{
bool skip = false;
struct io_ev_fd *ev_fd;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return NULL;
return;
rcu_read_lock();
/*
* rcu_dereference ctx->io_ev_fd once and use it for both for checking
* and eventfd_signal
*/
guard(rcu)();
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
* Check again if ev_fd exists in case an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
return ev_fd;
rcu_read_unlock();
return NULL;
}
void io_eventfd_signal(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = io_eventfd_grab(ctx);
if (ev_fd)
io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
}
void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = io_eventfd_grab(ctx);
if (ev_fd) {
bool skip, put_ref = true;
if (!ev_fd)
return;
if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs))
return;
if (cqe_event) {
/*
* Eventfd should only get triggered when at least one event
* has been posted. Some applications rely on the eventfd
@ -142,12 +106,10 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
ev_fd->last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
if (!skip)
put_ref = __io_eventfd_signal(ev_fd);
io_eventfd_release(ev_fd, put_ref);
}
if (skip || __io_eventfd_signal(ev_fd))
io_eventfd_put(ev_fd);
}
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,


@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async);
int io_eventfd_unregister(struct io_ring_ctx *ctx);
void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
void io_eventfd_signal(struct io_ring_ctx *ctx);
void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event);


@ -15,37 +15,6 @@
#include "cancel.h"
#include "rsrc.h"
#ifdef CONFIG_PROC_FS
static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
const struct cred *cred)
{
struct user_namespace *uns = seq_user_ns(m);
struct group_info *gi;
kernel_cap_t cap;
int g;
seq_printf(m, "%5d\n", id);
seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
seq_puts(m, "\n\tGroups:\t");
gi = cred->group_info;
for (g = 0; g < gi->ngroups; g++) {
seq_put_decimal_ull(m, g ? " " : "",
from_kgid_munged(uns, gi->gid[g]));
}
seq_puts(m, "\n\tCapEff:\t");
cap = cred->cap_effective;
seq_put_hex_ll(m, NULL, cap.val, 16);
seq_putc(m, '\n');
return 0;
}
#ifdef CONFIG_NET_RX_BUSY_POLL
static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
struct seq_file *m,
@ -214,14 +183,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
else
seq_printf(m, "%5u: <none>\n", i);
}
if (!xa_empty(&ctx->personalities)) {
unsigned long index;
const struct cred *cred;
seq_printf(m, "Personalities:\n");
xa_for_each(&ctx->personalities, index, cred)
io_uring_show_cred(m, index, cred);
}
seq_puts(m, "PollList:\n");
for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
@ -264,4 +225,3 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
mutex_unlock(&ctx->uring_lock);
}
}
#endif


@ -90,7 +90,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
void io_renameat_cleanup(struct io_kiocb *req)
@ -141,7 +141,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
void io_unlinkat_cleanup(struct io_kiocb *req)
@ -185,7 +185,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
void io_mkdirat_cleanup(struct io_kiocb *req)
@ -235,7 +235,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@ -281,7 +281,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
void io_link_cleanup(struct io_kiocb *req)


@ -234,7 +234,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
kfree(futexv);
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
return IOU_OK;
return IOU_COMPLETE;
}
/*
@ -311,7 +311,7 @@ done:
req_set_fail(req);
io_req_set_res(req, ret, 0);
kfree(ifd);
return IOU_OK;
return IOU_COMPLETE;
}
int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
@ -328,5 +328,5 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}


@ -114,9 +114,6 @@ enum {
struct io_wq {
unsigned long state;
free_work_fn *free_work;
io_wq_work_fn *do_work;
struct io_wq_hash *hash;
atomic_t worker_refs;
@ -153,6 +150,16 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq,
static void create_worker_cb(struct callback_head *cb);
static void io_wq_cancel_tw_create(struct io_wq *wq);
static inline unsigned int __io_get_work_hash(unsigned int work_flags)
{
return work_flags >> IO_WQ_HASH_SHIFT;
}
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
return __io_get_work_hash(atomic_read(&work->flags));
}
static bool io_worker_get(struct io_worker *worker)
{
return refcount_inc_not_zero(&worker->ref);
@ -412,6 +419,30 @@ fail:
return false;
}
/* Defer if current and next work are both hashed to the same chain */
static bool io_wq_hash_defer(struct io_wq_work *work, struct io_wq_acct *acct)
{
unsigned int hash, work_flags;
struct io_wq_work *next;
lockdep_assert_held(&acct->lock);
work_flags = atomic_read(&work->flags);
if (!__io_wq_is_hashed(work_flags))
return false;
/* should not happen, io_acct_run_queue() said we had work */
if (wq_list_empty(&acct->work_list))
return true;
hash = __io_get_work_hash(work_flags);
next = container_of(acct->work_list.first, struct io_wq_work, list);
work_flags = atomic_read(&next->flags);
if (!__io_wq_is_hashed(work_flags))
return false;
return hash == __io_get_work_hash(work_flags);
}
static void io_wq_dec_running(struct io_worker *worker)
{
struct io_wq_acct *acct = io_wq_get_acct(worker);
@ -422,8 +453,14 @@ static void io_wq_dec_running(struct io_worker *worker)
if (!atomic_dec_and_test(&acct->nr_running))
return;
if (!worker->cur_work)
return;
if (!io_acct_run_queue(acct))
return;
if (io_wq_hash_defer(worker->cur_work, acct)) {
raw_spin_unlock(&acct->lock);
return;
}
raw_spin_unlock(&acct->lock);
atomic_inc(&acct->nr_running);
@ -457,16 +494,6 @@ static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker)
}
}
static inline unsigned int __io_get_work_hash(unsigned int work_flags)
{
return work_flags >> IO_WQ_HASH_SHIFT;
}
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
return __io_get_work_hash(atomic_read(&work->flags));
}
static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
{
bool ret = false;
@ -612,10 +639,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
if (do_kill &&
(work_flags & IO_WQ_WORK_UNBOUND))
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
wq->do_work(work);
io_wq_submit_work(work);
io_assign_current_work(worker, NULL);
linked = wq->free_work(work);
linked = io_wq_free_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
@ -934,8 +961,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
{
do {
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
wq->do_work(work);
work = wq->free_work(work);
io_wq_submit_work(work);
work = io_wq_free_work(work);
} while (work);
}
@ -1195,8 +1222,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret, i;
struct io_wq *wq;
if (WARN_ON_ONCE(!data->free_work || !data->do_work))
return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(!bounded))
return ERR_PTR(-EINVAL);
@ -1206,8 +1231,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
refcount_inc(&data->hash->refs);
wq->hash = data->hash;
wq->free_work = data->free_work;
wq->do_work = data->do_work;
ret = -ENOMEM;


@ -21,9 +21,6 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_hash {
refcount_t refs;
unsigned long map;
@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash)
struct io_wq_data {
struct io_wq_hash *hash;
struct task_struct *task;
io_wq_work_fn *do_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);


@ -129,7 +129,6 @@
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
u32 seq;
};
/* requests with any of those set should undergo io_disarm_next() */
@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool is_sqpoll_thread);
static void io_queue_sqe(struct io_kiocb *req);
static void __io_req_caches_free(struct io_ring_ctx *ctx);
static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_HLIST_HEAD(&ctx->waitid_list);
xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC);
#ifdef CONFIG_FUTEX
INIT_HLIST_HEAD(&ctx->futex_list);
#endif
@ -380,25 +381,6 @@ err:
return NULL;
}
static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
ctx->cq_extra--;
}
static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
struct io_ring_ctx *ctx = req->ctx;
return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
}
return false;
}
static void io_clean_op(struct io_kiocb *req)
{
if (unlikely(req->flags & REQ_F_BUFFER_SELECTED))
@ -537,20 +519,37 @@ void io_req_queue_iowq(struct io_kiocb *req)
io_req_task_work_add(req);
}
static unsigned io_linked_nr(struct io_kiocb *req)
{
struct io_kiocb *tmp;
unsigned nr = 0;
io_for_each_link(tmp, req)
nr++;
return nr;
}
static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
{
spin_lock(&ctx->completion_lock);
bool drain_seen = false, first = true;
lockdep_assert_held(&ctx->uring_lock);
__io_req_caches_free(ctx);
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
if (req_need_defer(de->req, de->seq))
break;
drain_seen |= de->req->flags & REQ_F_IO_DRAIN;
if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained)
return;
list_del_init(&de->list);
ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue(de->req);
kfree(de);
first = false;
}
spin_unlock(&ctx->completion_lock);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@ -559,10 +558,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
if (ctx->drain_active)
io_queue_deferred(ctx);
if (ctx->has_evfd)
io_eventfd_flush_signal(ctx);
io_eventfd_signal(ctx, true);
}
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
@ -701,27 +698,20 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
}
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags, u64 extra1, u64 extra2)
static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx,
struct io_overflow_cqe *ocqe)
{
struct io_overflow_cqe *ocqe;
size_t ocq_size = sizeof(struct io_overflow_cqe);
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
lockdep_assert_held(&ctx->completion_lock);
if (is_cqe32)
ocq_size += sizeof(struct io_uring_cqe);
ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
if (!ocqe) {
struct io_rings *r = ctx->rings;
/*
* If we're in ring overflow flush mode, or in task cancel mode,
* or cannot allocate an overflow entry, then we need to drop it
* on the floor.
*/
io_account_cq_overflow(ctx);
WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
return false;
}
@ -730,23 +720,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
ocqe->cqe.flags = cflags;
if (is_cqe32) {
ocqe->cqe.big_cqe[0] = extra1;
ocqe->cqe.big_cqe[1] = extra2;
}
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
return true;
}
static void io_req_cqe_overflow(struct io_kiocb *req)
static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
struct io_cqe *cqe,
struct io_big_cqe *big_cqe, gfp_t gfp)
{
io_cqring_event_overflow(req->ctx, req->cqe.user_data,
req->cqe.res, req->cqe.flags,
req->big_cqe.extra1, req->big_cqe.extra2);
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
struct io_overflow_cqe *ocqe;
size_t ocq_size = sizeof(struct io_overflow_cqe);
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
if (is_cqe32)
ocq_size += sizeof(struct io_uring_cqe);
ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
if (ocqe) {
ocqe->cqe.user_data = cqe->user_data;
ocqe->cqe.res = cqe->res;
ocqe->cqe.flags = cqe->flags;
if (is_cqe32 && big_cqe) {
ocqe->cqe.big_cqe[0] = big_cqe->extra1;
ocqe->cqe.big_cqe[1] = big_cqe->extra2;
}
}
if (big_cqe)
big_cqe->extra1 = big_cqe->extra2 = 0;
return ocqe;
}
/*
@ -791,13 +793,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
{
struct io_uring_cqe *cqe;
ctx->cq_extra++;
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
if (likely(io_get_cqe(ctx, &cqe))) {
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
@ -814,14 +809,43 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false;
}
static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags)
{
return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags };
}
static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe,
struct io_big_cqe *big_cqe)
{
struct io_overflow_cqe *ocqe;
ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL);
spin_lock(&ctx->completion_lock);
io_cqring_add_overflow(ctx, ocqe);
spin_unlock(&ctx->completion_lock);
}
static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx,
struct io_cqe *cqe,
struct io_big_cqe *big_cqe)
{
struct io_overflow_cqe *ocqe;
ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC);
return io_cqring_add_overflow(ctx, ocqe);
}
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
bool filled;
io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
if (!filled)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
if (unlikely(!filled)) {
struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
filled = io_cqe_overflow_locked(ctx, &cqe, NULL);
}
io_cq_unlock_post(ctx);
return filled;
}
@ -832,10 +856,13 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
*/
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
lockdep_assert_held(&ctx->uring_lock);
lockdep_assert(ctx->lockless_cq);
if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
spin_lock(&ctx->completion_lock);
io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
spin_unlock(&ctx->completion_lock);
struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
io_cqe_overflow(ctx, &cqe, NULL);
}
ctx->submit_state.cq_flush = true;
}
@ -924,22 +951,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
io_req_complete_defer(req);
}
/*
* Don't initialise the fields below on every allocation, but do that in
* advance and keep them valid across allocations.
*/
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
req->ctx = ctx;
req->buf_node = NULL;
req->file_node = NULL;
req->link = NULL;
req->async_data = NULL;
/* not necessary, but safer to zero */
memset(&req->cqe, 0, sizeof(req->cqe));
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}
/*
* A request might get retired back into the request caches even before opcode
* handlers and io_issue_sqe() are done with it, e.g. inline completion path.
@ -949,7 +960,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO;
void *reqs[IO_REQ_ALLOC_BATCH];
int ret;
@ -967,10 +978,11 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
}
percpu_ref_get_many(&ctx->refs, ret);
ctx->nr_req_allocated += ret;
while (ret--) {
struct io_kiocb *req = reqs[ret];
io_preinit_req(req, ctx);
io_req_add_to_cache(req, ctx);
}
return true;
@ -1192,7 +1204,7 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (ctx->has_evfd)
io_eventfd_signal(ctx);
io_eventfd_signal(ctx, false);
}
nr_wait = atomic_read(&ctx->cq_wait_nr);
@ -1384,6 +1396,16 @@ void io_queue_next(struct io_kiocb *req)
io_req_task_queue(nxt);
}
static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
{
if (req->file_node) {
io_put_rsrc_node(req->ctx, req->file_node);
req->file_node = NULL;
}
if (req->flags & REQ_F_BUF_NODE)
io_put_rsrc_node(req->ctx, req->buf_node);
}
static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
@ -1444,13 +1466,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
*/
if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
if (ctx->lockless_cq) {
spin_lock(&ctx->completion_lock);
io_req_cqe_overflow(req);
spin_unlock(&ctx->completion_lock);
} else {
io_req_cqe_overflow(req);
}
if (ctx->lockless_cq)
io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
else
io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
}
}
__io_cq_unlock_post(ctx);
@ -1459,6 +1478,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
io_free_batch_list(ctx, state->compl_reqs.first);
INIT_WQ_LIST(&state->compl_reqs);
}
if (unlikely(ctx->drain_active))
io_queue_deferred(ctx);
ctx->submit_state.cq_flush = false;
}
@ -1646,56 +1669,28 @@ io_req_flags_t io_file_get_flags(struct file *file)
return res;
}
static u32 io_get_sequence(struct io_kiocb *req)
{
u32 seq = req->ctx->cached_sq_head;
struct io_kiocb *cur;
/* need original cached_sq_head, but it was increased for each req */
io_for_each_link(cur, req)
seq--;
return seq;
}
static __cold void io_drain_req(struct io_kiocb *req)
__must_hold(&ctx->uring_lock)
{
struct io_ring_ctx *ctx = req->ctx;
bool drain = req->flags & IOSQE_IO_DRAIN;
struct io_defer_entry *de;
int ret;
u32 seq = io_get_sequence(req);
/* Still need defer if there is pending req in defer list. */
spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
queue:
ctx->drain_active = false;
io_req_task_queue(req);
de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT);
if (!de) {
io_req_defer_failed(req, -ENOMEM);
return;
}
spin_unlock(&ctx->completion_lock);
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de) {
ret = -ENOMEM;
io_req_defer_failed(req, ret);
return;
}
spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
kfree(de);
goto queue;
}
trace_io_uring_defer(req);
de->req = req;
de->seq = seq;
ctx->nr_drained += io_linked_nr(req);
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock(&ctx->completion_lock);
io_queue_deferred(ctx);
if (!drain && list_empty(&ctx->defer_list))
ctx->drain_active = false;
}
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
@ -1757,7 +1752,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
ret = __io_issue_sqe(req, issue_flags, def);
if (ret == IOU_OK) {
if (ret == IOU_COMPLETE) {
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_defer(req);
else
@ -1816,7 +1811,7 @@ void io_wq_submit_work(struct io_wq_work *work)
bool needs_poll = false;
int ret = 0, err = -ECANCELED;
/* one will be dropped by ->io_wq_free_work() after returning to io-wq */
/* one will be dropped by io_wq_free_work() after returning to io-wq */
if (!(req->flags & REQ_F_REFCOUNT))
__io_req_set_refcount(req, 2);
else
@ -1914,7 +1909,8 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
io_ring_submit_lock(ctx, issue_flags);
node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
if (node) {
io_req_assign_rsrc_node(&req->file_node, node);
node->refs++;
req->file_node = node;
req->flags |= io_slot_flags(node);
file = io_slot_file(node);
}
@ -2047,7 +2043,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
int personality;
u8 opcode;
/* req is partially pre-initialised, see io_preinit_req() */
req->ctx = ctx;
req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
sqe_flags = READ_ONCE(sqe->flags);
@ -2278,10 +2274,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
(!(ctx->flags & IORING_SETUP_NO_SQARRAY))) {
head = READ_ONCE(ctx->sq_array[head]);
if (unlikely(head >= ctx->sq_entries)) {
/* drop invalid entries */
spin_lock(&ctx->completion_lock);
ctx->cq_extra--;
spin_unlock(&ctx->completion_lock);
WRITE_ONCE(ctx->rings->sq_dropped,
READ_ONCE(ctx->rings->sq_dropped) + 1);
return false;
@ -2699,21 +2691,26 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
return off;
}
static void io_req_caches_free(struct io_ring_ctx *ctx)
static __cold void __io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
int nr = 0;
mutex_lock(&ctx->uring_lock);
while (!io_req_cache_empty(ctx)) {
req = io_extract_req(ctx);
kmem_cache_free(req_cachep, req);
nr++;
}
if (nr)
if (nr) {
ctx->nr_req_allocated -= nr;
percpu_ref_put_many(&ctx->refs, nr);
mutex_unlock(&ctx->uring_lock);
}
}
static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
{
guard(mutex)(&ctx->uring_lock);
__io_req_caches_free(ctx);
}
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
@ -2749,6 +2746,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
io_req_caches_free(ctx);
WARN_ON_ONCE(ctx->nr_req_allocated);
if (ctx->hash_map)
io_wq_put_hash(ctx->hash_map);
io_napi_free(ctx);
@ -2883,7 +2883,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
io_cqring_overflow_kill(ctx);
mutex_unlock(&ctx->uring_lock);
}
if (ctx->ifq) {
if (!xa_empty(&ctx->zcrx_ctxs)) {
mutex_lock(&ctx->uring_lock);
io_shutdown_zcrx_ifqs(ctx);
mutex_unlock(&ctx->uring_lock);
@ -3015,20 +3015,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
struct io_defer_entry *de;
LIST_HEAD(list);
spin_lock(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
if (io_match_task_safe(de->req, tctx, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
}
spin_unlock(&ctx->completion_lock);
if (list_empty(&list))
return false;
while (!list_empty(&list)) {
de = list_first_entry(&list, struct io_defer_entry, list);
list_del_init(&de->list);
ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue_fail(de->req, -ECANCELED);
kfree(de);
}
@ -3103,8 +3102,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0;
ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
mutex_lock(&ctx->uring_lock);
ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
ret |= io_poll_remove_all(ctx, tctx, cancel_all);
ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
ret |= io_futex_remove_all(ctx, tctx, cancel_all);


@ -19,7 +19,6 @@
#endif
enum {
IOU_OK = 0, /* deprecated, use IOU_COMPLETE */
IOU_COMPLETE = 0,
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
@ -196,7 +195,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
{
io_lockdep_assert_cq_locked(ctx);
ctx->cq_extra++;
ctx->submit_state.cq_flush = true;
return io_get_cqe(ctx, cqe_ret);
}
@ -414,7 +412,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
if (unlikely(ctx->off_timeout_used ||
ctx->has_evfd || ctx->poll_activated))
__io_commit_cqring_flush(ctx);
}


@ -92,7 +92,6 @@ void io_kbuf_drop_legacy(struct io_kiocb *req)
{
if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED)))
return;
req->buf_index = req->kbuf->bgid;
req->flags &= ~REQ_F_BUFFER_SELECTED;
kfree(req->kbuf);
req->kbuf = NULL;
@ -110,7 +109,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
req->buf_index = buf->bgid;
io_ring_submit_unlock(ctx, issue_flags);
return true;
@ -193,7 +191,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
}
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned int issue_flags)
unsigned buf_group, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
@ -201,7 +199,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
io_ring_submit_lock(req->ctx, issue_flags);
bl = io_buffer_get_list(ctx, req->buf_index);
bl = io_buffer_get_list(ctx, buf_group);
if (likely(bl)) {
if (bl->flags & IOBL_BUF_RING)
ret = io_ring_buffer_select(req, len, bl, issue_flags);
@ -302,7 +300,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
int ret = -ENOENT;
io_ring_submit_lock(ctx, issue_flags);
bl = io_buffer_get_list(ctx, req->buf_index);
bl = io_buffer_get_list(ctx, arg->buf_group);
if (unlikely(!bl))
goto out_unlock;
@ -335,7 +333,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
lockdep_assert_held(&ctx->uring_lock);
bl = io_buffer_get_list(ctx, req->buf_index);
bl = io_buffer_get_list(ctx, arg->buf_group);
if (unlikely(!bl))
return -ENOENT;
@ -355,10 +353,9 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
struct io_buffer_list *bl = req->buf_list;
bool ret = true;
if (bl) {
if (bl)
ret = io_kbuf_commit(req, bl, len, nr);
req->buf_index = bl->bgid;
}
req->flags &= ~REQ_F_BUFFER_RING;
return ret;
}
@ -379,45 +376,33 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs)
return ret;
}
static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
struct io_buffer_list *bl,
unsigned long nbufs)
{
unsigned i = 0;
/* shouldn't happen */
if (!nbufs)
return 0;
if (bl->flags & IOBL_BUF_RING) {
i = bl->buf_ring->tail - bl->head;
io_free_region(ctx, &bl->region);
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
bl->flags &= ~IOBL_BUF_RING;
return i;
}
unsigned long i = 0;
struct io_buffer *nxt;
/* protects io_buffers_cache */
lockdep_assert_held(&ctx->uring_lock);
WARN_ON_ONCE(bl->flags & IOBL_BUF_RING);
while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) {
nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
kfree(nxt);
if (++i == nbufs)
return i;
cond_resched();
}
return i;
}
static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
__io_remove_buffers(ctx, bl, -1U);
if (bl->flags & IOBL_BUF_RING)
io_free_region(ctx, &bl->region);
else
io_remove_buffers_legacy(ctx, bl, -1U);
kfree(bl);
}
@ -465,30 +450,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = 0;
io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
bl = io_buffer_get_list(ctx, p->bgid);
if (bl) {
ret = -EINVAL;
/* can't use provide/remove buffers command on mapped buffers */
if (!(bl->flags & IOBL_BUF_RING))
ret = __io_remove_buffers(ctx, bl, p->nbufs);
}
io_ring_submit_unlock(ctx, issue_flags);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
unsigned long size, tmp_check;
@ -512,8 +473,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EOVERFLOW;
if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
return -EOVERFLOW;
size = (unsigned long)p->len * p->nbufs;
if (!access_ok(u64_to_user_ptr(p->addr), size))
return -EFAULT;
@ -552,49 +511,56 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
return i ? 0 : -ENOMEM;
}
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
static int __io_manage_buffers_legacy(struct io_kiocb *req,
struct io_buffer_list *bl)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
int ret;
if (!bl) {
if (req->opcode != IORING_OP_PROVIDE_BUFFERS)
return -ENOENT;
bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
if (!bl)
return -ENOMEM;
INIT_LIST_HEAD(&bl->buf_list);
ret = io_buffer_add_list(req->ctx, bl, p->bgid);
if (ret) {
kfree(bl);
return ret;
}
}
/* can't use provide/remove buffers command on mapped buffers */
if (bl->flags & IOBL_BUF_RING)
return -EINVAL;
if (req->opcode == IORING_OP_PROVIDE_BUFFERS)
return io_add_buffers(req->ctx, p, bl);
return io_remove_buffers_legacy(req->ctx, bl, p->nbufs);
}
int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = 0;
int ret;
io_ring_submit_lock(ctx, issue_flags);
bl = io_buffer_get_list(ctx, p->bgid);
if (unlikely(!bl)) {
bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
if (!bl) {
ret = -ENOMEM;
goto err;
}
INIT_LIST_HEAD(&bl->buf_list);
ret = io_buffer_add_list(ctx, bl, p->bgid);
if (ret) {
kfree(bl);
goto err;
}
}
/* can't add buffers via this command for a mapped buffer ring */
if (bl->flags & IOBL_BUF_RING) {
ret = -EINVAL;
goto err;
}
ret = io_add_buffers(ctx, p, bl);
err:
ret = __io_manage_buffers_legacy(req, bl);
io_ring_submit_unlock(ctx, issue_flags);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL;
struct io_buffer_list *bl;
struct io_uring_region_desc rd;
struct io_uring_buf_ring *br;
unsigned long mmap_offset;
@ -605,8 +571,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
if (!mem_is_zero(reg.resv, sizeof(reg.resv)))
return -EINVAL;
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL;
@ -624,7 +589,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
io_destroy_bl(ctx, bl);
}
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
if (!bl)
return -ENOMEM;
@ -669,7 +634,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0;
fail:
io_free_region(ctx, &bl->region);
kfree(free_bl);
kfree(bl);
return ret;
}
@ -682,9 +647,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (reg.flags)
if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags)
return -EINVAL;
bl = io_buffer_get_list(ctx, reg.bgid);
@ -704,14 +667,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_status buf_status;
struct io_buffer_list *bl;
int i;
if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
return -EFAULT;
for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
if (buf_status.resv[i])
return -EINVAL;
if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv)))
return -EINVAL;
bl = io_buffer_get_list(ctx, buf_status.buf_group);
if (!bl)


@ -55,20 +55,19 @@ struct buf_sel_arg {
size_t max_len;
unsigned short nr_iovs;
unsigned short mode;
unsigned buf_group;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned int issue_flags);
unsigned buf_group, unsigned int issue_flags);
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
unsigned int issue_flags);
int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg);
void io_destroy_buffers(struct io_ring_ctx *ctx);
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
@ -94,7 +93,6 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
* to monopolize the buffer.
*/
if (req->buf_list) {
req->buf_index = req->buf_list->bgid;
req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT);
return true;
}


@ -13,6 +13,7 @@
#include "memmap.h"
#include "kbuf.h"
#include "rsrc.h"
#include "zcrx.h"
static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
size_t size, gfp_t gfp)
@ -258,7 +259,8 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
loff_t pgoff)
{
loff_t offset = pgoff << PAGE_SHIFT;
unsigned int bgid;
unsigned int id;
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
@ -267,12 +269,13 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
case IORING_OFF_SQES:
return &ctx->sq_region;
case IORING_OFF_PBUF_RING:
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
return io_pbuf_get_region(ctx, bgid);
id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
return io_pbuf_get_region(ctx, id);
case IORING_MAP_OFF_PARAM_REGION:
return &ctx->param_region;
case IORING_MAP_OFF_ZCRX_REGION:
return &ctx->zcrx_region;
id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT;
return io_zcrx_get_region(ctx, id);
}
return NULL;
}


@ -4,7 +4,9 @@
#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
#define IORING_OFF_ZCRX_SHIFT 16
struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages);
#ifndef CONFIG_MMU
unsigned int io_uring_nommu_mmap_capabilities(struct file *file);


@ -328,7 +328,7 @@ done:
req_set_fail(req);
}
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)


@ -18,7 +18,6 @@
#include "rsrc.h"
#include "zcrx.h"
#if defined(CONFIG_NET)
struct io_shutdown {
struct file *file;
int how;
@ -129,7 +128,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_shutdown_sock(sock, shutdown->how);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
static bool io_net_retry(struct socket *sock, int flags)
@ -190,7 +189,6 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req,
sr->done_io = 0;
sr->retry = false;
sr->len = 0; /* get from the provided buffer */
req->buf_index = sr->buf_group;
}
static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
@ -359,15 +357,13 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kmsg->msg.msg_name = &kmsg->addr;
kmsg->msg.msg_namelen = addr_len;
}
if (sr->flags & IORING_RECVSEND_FIXED_BUF)
if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
req->flags |= REQ_F_IMPORT_BUFFER;
return 0;
if (!io_do_buffer_select(req)) {
ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
&kmsg->msg.msg_iter);
if (unlikely(ret < 0))
return ret;
}
return 0;
if (req->flags & REQ_F_BUFFER_SELECT)
return 0;
return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
}
static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@ -409,13 +405,12 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
if (req->flags & REQ_F_BUFFER_SELECT)
sr->buf_group = req->buf_index;
if (sr->flags & IORING_RECVSEND_BUNDLE) {
if (req->opcode == IORING_OP_SENDMSG)
return -EINVAL;
if (!(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
sr->msg_flags |= MSG_WAITALL;
sr->buf_group = req->buf_index;
req->buf_list = NULL;
req->flags |= REQ_F_MULTISHOT;
}
@ -507,7 +502,7 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret,
/* Otherwise stop bundle and use the current result. */
finish:
io_req_set_res(req, *ret, cflags);
*ret = IOU_OK;
*ret = IOU_COMPLETE;
return true;
}
@ -558,7 +553,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
else if (sr->done_io)
ret = sr->done_io;
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
@ -571,6 +566,7 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
.iovs = &kmsg->fast_iov,
.max_len = min_not_zero(sr->len, INT_MAX),
.nr_iovs = 1,
.buf_group = sr->buf_group,
};
if (kmsg->vec.iovec) {
@ -723,7 +719,6 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg;
int ret;
kmsg = io_msg_alloc_async(req);
if (unlikely(!kmsg))
@ -739,13 +734,10 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req)
kmsg->msg.msg_iocb = NULL;
kmsg->msg.msg_ubuf = NULL;
if (!io_do_buffer_select(req)) {
ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
&kmsg->msg.msg_iter);
if (unlikely(ret))
return ret;
}
return 0;
if (req->flags & REQ_F_BUFFER_SELECT)
return 0;
return import_ubuf(ITER_DEST, sr->buf, sr->len,
&kmsg->msg.msg_iter);
}
return io_recvmsg_copy_hdr(req, kmsg);
@ -991,7 +983,7 @@ retry_multishot:
void __user *buf;
size_t len = sr->len;
buf = io_buffer_select(req, &len, issue_flags);
buf = io_buffer_select(req, &len, sr->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
@ -1069,6 +1061,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
.iovs = &kmsg->fast_iov,
.nr_iovs = 1,
.mode = KBUF_MODE_EXPAND,
.buf_group = sr->buf_group,
};
if (kmsg->vec.iovec) {
@ -1101,7 +1094,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
void __user *buf;
*len = sr->len;
buf = io_buffer_select(req, len, issue_flags);
buf = io_buffer_select(req, len, sr->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
sr->buf = buf;
@ -1197,16 +1190,14 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
unsigned ifq_idx;
if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr ||
sqe->addr3))
if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
return -EINVAL;
ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
if (ifq_idx != 0)
return -EINVAL;
zc->ifq = req->ctx->ifq;
zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
if (!zc->ifq)
return -EINVAL;
zc->len = READ_ONCE(sqe->len);
zc->flags = READ_ONCE(sqe->ioprio);
zc->msg_flags = READ_ONCE(sqe->msg_flags);
@ -1327,8 +1318,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -ENOMEM;
if (req->opcode == IORING_OP_SEND_ZC) {
if (zc->flags & IORING_RECVSEND_FIXED_BUF)
req->flags |= REQ_F_IMPORT_BUFFER;
ret = io_send_setup(req, sqe);
} else {
if (unlikely(sqe->addr2 || sqe->file_index))
@ -1476,7 +1465,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
return IOU_OK;
return IOU_COMPLETE;
}
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
@ -1547,7 +1536,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
return IOU_OK;
return IOU_COMPLETE;
}
void io_sendrecv_fail(struct io_kiocb *req)
@ -1711,7 +1700,7 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags)
sock->file_slot);
}
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@ -1778,7 +1767,7 @@ out:
req_set_fail(req);
io_req_msg_cleanup(req, issue_flags);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@ -1852,4 +1841,3 @@ void io_netmsg_cache_free(const void *entry)
io_vec_free(&kmsg->vec);
kfree(kmsg);
}
#endif


@ -68,5 +68,5 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, nop->result, 0);
return IOU_OK;
return IOU_COMPLETE;
}


@ -112,6 +112,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
if (unlikely(!io_alloc_req(ctx, &notif)))
return NULL;
notif->ctx = ctx;
notif->opcode = IORING_OP_NOP;
notif->flags = 0;
notif->file = NULL;


@ -333,13 +333,13 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.iopoll = 1,
.prep = io_provide_buffers_prep,
.issue = io_provide_buffers,
.issue = io_manage_buffers_legacy,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
.iopoll = 1,
.prep = io_remove_buffers_prep,
.issue = io_remove_buffers,
.issue = io_manage_buffers_legacy,
},
[IORING_OP_TEE] = {
.needs_file = 1,
@ -569,6 +569,10 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_prep_writev_fixed,
.issue = io_write,
},
[IORING_OP_PIPE] = {
.prep = io_pipe_prep,
.issue = io_pipe,
},
};
const struct io_cold_def io_cold_defs[] = {
@ -815,6 +819,9 @@ const struct io_cold_def io_cold_defs[] = {
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_PIPE] = {
.name = "PIPE",
},
};
const char *io_uring_get_opcode(u8 opcode)

View File

@ -6,6 +6,8 @@
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/namei.h>
#include <linux/pipe_fs_i.h>
#include <linux/watch_queue.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
@ -169,7 +171,7 @@ err:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_openat(struct io_kiocb *req, unsigned int issue_flags)
@ -257,7 +259,7 @@ err:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@ -300,5 +302,136 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
struct io_pipe {
struct file *file;
int __user *fds;
int flags;
int file_slot;
unsigned long nofile;
};
int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
if (sqe->fd || sqe->off || sqe->addr3)
return -EINVAL;
p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr));
p->flags = READ_ONCE(sqe->pipe_flags);
if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
return -EINVAL;
p->file_slot = READ_ONCE(sqe->file_index);
p->nofile = rlimit(RLIMIT_NOFILE);
return 0;
}
static int io_pipe_fixed(struct io_kiocb *req, struct file **files,
unsigned int issue_flags)
{
struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
struct io_ring_ctx *ctx = req->ctx;
int ret, fds[2] = { -1, -1 };
int slot = p->file_slot;
if (p->flags & O_CLOEXEC)
return -EINVAL;
io_ring_submit_lock(ctx, issue_flags);
ret = __io_fixed_fd_install(ctx, files[0], slot);
if (ret < 0)
goto err;
fds[0] = ret;
files[0] = NULL;
/*
* If a specific slot is given, next one will be used for
* the write side.
*/
if (slot != IORING_FILE_INDEX_ALLOC)
slot++;
ret = __io_fixed_fd_install(ctx, files[1], slot);
if (ret < 0)
goto err;
fds[1] = ret;
files[1] = NULL;
io_ring_submit_unlock(ctx, issue_flags);
if (!copy_to_user(p->fds, fds, sizeof(fds)))
return 0;
ret = -EFAULT;
io_ring_submit_lock(ctx, issue_flags);
err:
if (fds[0] != -1)
io_fixed_fd_remove(ctx, fds[0]);
if (fds[1] != -1)
io_fixed_fd_remove(ctx, fds[1]);
io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
static int io_pipe_fd(struct io_kiocb *req, struct file **files)
{
struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
int ret, fds[2] = { -1, -1 };
ret = __get_unused_fd_flags(p->flags, p->nofile);
if (ret < 0)
goto err;
fds[0] = ret;
ret = __get_unused_fd_flags(p->flags, p->nofile);
if (ret < 0)
goto err;
fds[1] = ret;
if (!copy_to_user(p->fds, fds, sizeof(fds))) {
fd_install(fds[0], files[0]);
fd_install(fds[1], files[1]);
return 0;
}
ret = -EFAULT;
err:
if (fds[0] != -1)
put_unused_fd(fds[0]);
if (fds[1] != -1)
put_unused_fd(fds[1]);
return ret;
}
int io_pipe(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
struct file *files[2];
int ret;
ret = create_pipe_files(files, p->flags);
if (ret)
return ret;
files[0]->f_mode |= FMODE_NOWAIT;
files[1]->f_mode |= FMODE_NOWAIT;
if (!!p->file_slot)
ret = io_pipe_fixed(req, files, issue_flags);
else
ret = io_pipe_fd(req, files);
io_req_set_res(req, ret, 0);
if (!ret)
return IOU_COMPLETE;
req_set_fail(req);
if (files[0])
fput(files[0]);
if (files[1])
fput(files[1]);
return ret;
}
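A minimal userspace sketch of the new opcode, assuming uapi headers from this series (IORING_OP_PIPE and the sqe->pipe_flags field) and plain liburing for ring setup; no dedicated liburing prep helper is implied here, so the SQE is filled by hand. On success the CQE res is 0 and the descriptor pair is written to the user array passed in sqe->addr; setting sqe->file_index instead would install the pair as direct descriptors, as io_pipe_fixed() above does.

#include <liburing.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fds[2] = { -1, -1 };

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_PIPE;			/* assumed available in this series' uapi */
	sqe->addr = (unsigned long long)(uintptr_t)fds;	/* kernel writes the fd pair here */
	/* sqe->pipe_flags = O_NONBLOCK; */		/* optional pipe2()-style flags */

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	printf("res=%d read fd=%d write fd=%d\n", cqe->res, fds[0], fds[1]);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}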


@@ -13,5 +13,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_close(struct io_kiocb *req, unsigned int issue_flags);
int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_pipe(struct io_kiocb *req, unsigned int issue_flags);
int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags);


@@ -893,7 +893,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
if (ret > 0) {
io_req_set_res(req, ipt.result_mask, 0);
return IOU_OK;
return IOU_COMPLETE;
}
return ret ?: IOU_ISSUE_SKIP_COMPLETE;
}
@@ -948,5 +948,5 @@ out:
}
/* complete update request, we're done with it */
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}


@@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
return 0;
}
int io_buffer_validate(struct iovec *iov)
int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
unsigned long tmp, base = (unsigned long)uaddr;
unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
/* arbitrary limit, but we need something */
if (ulen > SZ_1G || !ulen)
return -EFAULT;
if (check_add_overflow(base, acct_len, &tmp))
return -EOVERFLOW;
return 0;
}
static int io_buffer_validate(struct iovec *iov)
{
/*
* Don't impose further limits on the size and buffer
* constraints here, we'll -EINVAL later when IO is
@@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov)
*/
if (!iov->iov_base)
return iov->iov_len ? -EFAULT : 0;
if (!iov->iov_len)
return -EFAULT;
/* arbitrary limit, but we need something */
if (iov->iov_len > SZ_1G)
return -EFAULT;
if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
return -EOVERFLOW;
return 0;
return io_validate_user_buf_range((unsigned long)iov->iov_base,
iov->iov_len);
}
static void io_release_ubuf(void *priv)
@@ -497,7 +500,7 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
@@ -685,38 +688,34 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data)
{
struct page **page_array = *pages, **new_array = NULL;
int nr_pages_left = *nr_pages, i, j;
int nr_folios = data->nr_folios;
unsigned nr_pages_left = *nr_pages;
unsigned nr_folios = data->nr_folios;
unsigned i, j;
/* Store head pages only*/
new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
GFP_KERNEL);
new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
if (!new_array)
return false;
new_array[0] = compound_head(page_array[0]);
/*
* The pages are bound to the folio, it doesn't
* actually unpin them but drops all but one reference,
* which is usually put down by io_buffer_unmap().
* Note, needs a better helper.
*/
if (data->nr_pages_head > 1)
unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
for (i = 0, j = 0; i < nr_folios; i++) {
struct page *p = compound_head(page_array[j]);
struct folio *folio = page_folio(p);
unsigned int nr;
j = data->nr_pages_head;
nr_pages_left -= data->nr_pages_head;
for (i = 1; i < nr_folios; i++) {
unsigned int nr_unpin;
WARN_ON_ONCE(i > 0 && p != page_array[j]);
new_array[i] = page_array[j];
nr_unpin = min_t(unsigned int, nr_pages_left - 1,
data->nr_pages_mid - 1);
if (nr_unpin)
unpin_user_pages(&page_array[j+1], nr_unpin);
j += data->nr_pages_mid;
nr_pages_left -= data->nr_pages_mid;
nr = i ? data->nr_pages_mid : data->nr_pages_head;
nr = min(nr, nr_pages_left);
/* Drop all but one ref, the entire folio will remain pinned. */
if (nr > 1)
unpin_user_folio(folio, nr - 1);
j += nr;
nr_pages_left -= nr;
new_array[i] = p;
}
WARN_ON_ONCE(j != *nr_pages);
kvfree(page_array);
*pages = new_array;
*nr_pages = nr_folios;
@@ -1062,8 +1061,6 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
size_t offset;
int ret;
if (WARN_ON_ONCE(!imu))
return -EFAULT;
ret = validate_fixed_range(buf_addr, len, imu);
if (unlikely(ret))
return ret;
@@ -1110,13 +1107,19 @@ inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
if (req->flags & REQ_F_BUF_NODE)
return req->buf_node;
req->flags |= REQ_F_BUF_NODE;
io_ring_submit_lock(ctx, issue_flags);
node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
if (node)
io_req_assign_buf_node(req, node);
if (node) {
node->refs++;
req->buf_node = node;
io_ring_submit_unlock(ctx, issue_flags);
return node;
}
req->flags &= ~REQ_F_BUF_NODE;
io_ring_submit_unlock(ctx, issue_flags);
return node;
return NULL;
}
int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,


@@ -83,7 +83,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned size, unsigned type);
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
int io_buffer_validate(struct iovec *iov);
int io_validate_user_buf_range(u64 uaddr, u64 ulen);
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
struct io_imu_folio_data *data);
@@ -115,32 +115,6 @@ static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
return true;
}
static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
{
if (req->file_node) {
io_put_rsrc_node(req->ctx, req->file_node);
req->file_node = NULL;
}
if (req->flags & REQ_F_BUF_NODE) {
io_put_rsrc_node(req->ctx, req->buf_node);
req->buf_node = NULL;
}
}
static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node,
struct io_rsrc_node *node)
{
node->refs++;
*dst_node = node;
}
static inline void io_req_assign_buf_node(struct io_kiocb *req,
struct io_rsrc_node *node)
{
io_req_assign_rsrc_node(&req->buf_node, node);
req->flags |= REQ_F_BUF_NODE;
}
int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);


@@ -119,7 +119,7 @@ static int __io_import_rw_buffer(int ddir, struct io_kiocb *req,
return io_import_vec(ddir, req, io, buf, sqe_len);
if (io_do_buffer_select(req)) {
buf = io_buffer_select(req, &sqe_len, issue_flags);
buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
rw->addr = (unsigned long) buf;
@@ -253,16 +253,19 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
int ddir)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct io_async_rw *io;
unsigned ioprio;
u64 attr_type_mask;
int ret;
if (io_rw_alloc_async(req))
return -ENOMEM;
io = req->async_data;
rw->kiocb.ki_pos = READ_ONCE(sqe->off);
/* used for fixed read/write too - just read unconditionally */
req->buf_index = READ_ONCE(sqe->buf_index);
io->buf_group = req->buf_index;
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
@@ -658,7 +661,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
io_req_io_end(req);
io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags));
io_req_rw_cleanup(req, issue_flags);
return IOU_OK;
return IOU_COMPLETE;
} else {
io_rw_done(req, ret);
}
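For context, a small userspace fragment showing the provided-buffer path this change feeds: the request names a buffer group in the SQE, which is what the new io->buf_group field above now records at prep time. This is a sketch only; it assumes an already initialized ring, an open fd, and a buffer group 0 registered beforehand.

struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

io_uring_prep_read(sqe, fd, NULL, 4096, 0);	/* no buffer passed: the kernel selects one */
sqe->flags |= IOSQE_BUFFER_SELECT;		/* pick from a provided-buffer group */
sqe->buf_group = 0;				/* group id, recorded into io->buf_group at prep */
io_uring_submit(&ring);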


@@ -16,6 +16,8 @@ struct io_async_rw {
struct iov_iter iter;
struct iov_iter_state iter_state;
struct iovec fast_iov;
unsigned buf_group;
/*
* wpq is for buffered io, while meta fields are used with
* direct io


@@ -103,7 +103,7 @@ done:
if (ret != sp->len)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -144,5 +144,5 @@ done:
if (ret != sp->len)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}


@@ -59,7 +59,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags)
ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
void io_statx_cleanup(struct io_kiocb *req)


@@ -47,7 +47,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
ret = sync_file_range(req->file, sync->off, sync->len, sync->flags);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -79,7 +79,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX,
sync->flags & IORING_FSYNC_DATASYNC);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -108,5 +108,5 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
if (ret >= 0)
fsnotify_modify(req->file);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}


@@ -35,8 +35,6 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
data.hash = hash;
data.task = task;
data.free_work = io_wq_free_work;
data.do_work = io_wq_submit_work;
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());


@@ -35,6 +35,9 @@ struct io_timeout_rem {
bool ltimeout;
};
static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
struct io_kiocb *link);
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
@@ -218,7 +221,9 @@ void io_disarm_next(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
raw_spin_lock_irq(&ctx->timeout_lock);
link = io_disarm_linked_timeout(req);
if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT)
link = __io_disarm_linked_timeout(req, req->link);
raw_spin_unlock_irq(&ctx->timeout_lock);
if (link)
io_req_queue_tw_complete(link, -ECANCELED);
@@ -228,8 +233,8 @@ void io_disarm_next(struct io_kiocb *req)
io_fail_links(req);
}
struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
struct io_kiocb *link)
static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
struct io_kiocb *link)
__must_hold(&req->ctx->completion_lock)
__must_hold(&req->ctx->timeout_lock)
{
@@ -500,7 +505,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
static int __io_timeout_prep(struct io_kiocb *req,


@@ -8,19 +8,6 @@ struct io_timeout_data {
u32 flags;
};
struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
struct io_kiocb *link);
static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
{
struct io_kiocb *link = req->link;
if (link && link->opcode == IORING_OP_LINK_TIMEOUT)
return __io_disarm_linked_timeout(req, link);
return NULL;
}
__cold void io_flush_timeouts(struct io_ring_ctx *ctx);
struct io_cancel_data;
int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd);


@@ -44,5 +44,5 @@ int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags)
ret = do_ftruncate(req->file, ft->len, 1);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}


@@ -3,13 +3,10 @@
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring/cmd.h>
#include <linux/io_uring/net.h>
#include <linux/security.h>
#include <linux/nospec.h>
#include <net/sock.h>
#include <uapi/linux/io_uring.h>
#include <asm/ioctls.h>
#include "io_uring.h"
#include "alloc_cache.h"
@@ -268,7 +265,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
io_req_uring_cleanup(req, issue_flags);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -278,6 +275,9 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED)))
return -EINVAL;
return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
@@ -292,6 +292,9 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
struct io_async_cmd *ac = req->async_data;
int ret;
if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED)))
return -EINVAL;
ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs);
if (ret)
return ret;
@@ -307,83 +310,3 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
io_req_queue_iowq(req);
}
static inline int io_uring_cmd_getsockopt(struct socket *sock,
struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
const struct io_uring_sqe *sqe = cmd->sqe;
bool compat = !!(issue_flags & IO_URING_F_COMPAT);
int optlen, optname, level, err;
void __user *optval;
level = READ_ONCE(sqe->level);
if (level != SOL_SOCKET)
return -EOPNOTSUPP;
optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
optname = READ_ONCE(sqe->optname);
optlen = READ_ONCE(sqe->optlen);
err = do_sock_getsockopt(sock, compat, level, optname,
USER_SOCKPTR(optval),
KERNEL_SOCKPTR(&optlen));
if (err)
return err;
/* On success, return optlen */
return optlen;
}
static inline int io_uring_cmd_setsockopt(struct socket *sock,
struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
const struct io_uring_sqe *sqe = cmd->sqe;
bool compat = !!(issue_flags & IO_URING_F_COMPAT);
int optname, optlen, level;
void __user *optval;
sockptr_t optval_s;
optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
optname = READ_ONCE(sqe->optname);
optlen = READ_ONCE(sqe->optlen);
level = READ_ONCE(sqe->level);
optval_s = USER_SOCKPTR(optval);
return do_sock_setsockopt(sock, compat, level, optname, optval_s,
optlen);
}
#if defined(CONFIG_NET)
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct socket *sock = cmd->file->private_data;
struct sock *sk = sock->sk;
struct proto *prot = READ_ONCE(sk->sk_prot);
int ret, arg = 0;
if (!prot || !prot->ioctl)
return -EOPNOTSUPP;
switch (cmd->cmd_op) {
case SOCKET_URING_OP_SIOCINQ:
ret = prot->ioctl(sk, SIOCINQ, &arg);
if (ret)
return ret;
return arg;
case SOCKET_URING_OP_SIOCOUTQ:
ret = prot->ioctl(sk, SIOCOUTQ, &arg);
if (ret)
return ret;
return arg;
case SOCKET_URING_OP_GETSOCKOPT:
return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
case SOCKET_URING_OP_SETSOCKOPT:
return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
default:
return -EOPNOTSUPP;
}
}
EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
#endif


@@ -323,5 +323,5 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
return IOU_COMPLETE;
}


@@ -109,7 +109,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = file_getxattr(req->file, &ix->ctx);
io_xattr_finish(req, ret);
return IOU_OK;
return IOU_COMPLETE;
}
int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
@@ -122,7 +122,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx);
ix->filename = NULL;
io_xattr_finish(req, ret);
return IOU_OK;
return IOU_COMPLETE;
}
static int __io_setxattr_prep(struct io_kiocb *req,
@@ -190,7 +190,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = file_setxattr(req->file, &ix->ctx);
io_xattr_finish(req, ret);
return IOU_OK;
return IOU_COMPLETE;
}
int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
@@ -203,5 +203,5 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx);
ix->filename = NULL;
io_xattr_finish(req, ret);
return IOU_OK;
return IOU_COMPLETE;
}


@@ -26,27 +26,205 @@
#include "zcrx.h"
#include "rsrc.h"
#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
return pp->mp_priv;
}
#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
struct net_iov_area *owner = net_iov_owner(niov);
return container_of(owner, struct io_zcrx_area, nia);
}
static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
return area->mem.pages[net_iov_idx(niov)];
}
static void io_release_dmabuf(struct io_zcrx_mem *mem)
{
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
return;
if (mem->sgt)
dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
DMA_FROM_DEVICE);
if (mem->attach)
dma_buf_detach(mem->dmabuf, mem->attach);
if (mem->dmabuf)
dma_buf_put(mem->dmabuf);
mem->sgt = NULL;
mem->attach = NULL;
mem->dmabuf = NULL;
}
static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
struct io_zcrx_mem *mem,
struct io_uring_zcrx_area_reg *area_reg)
{
unsigned long off = (unsigned long)area_reg->addr;
unsigned long len = (unsigned long)area_reg->len;
unsigned long total_size = 0;
struct scatterlist *sg;
int dmabuf_fd = area_reg->dmabuf_fd;
int i, ret;
if (WARN_ON_ONCE(!ifq->dev))
return -EFAULT;
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
return -EINVAL;
mem->is_dmabuf = true;
mem->dmabuf = dma_buf_get(dmabuf_fd);
if (IS_ERR(mem->dmabuf)) {
ret = PTR_ERR(mem->dmabuf);
mem->dmabuf = NULL;
goto err;
}
mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
if (IS_ERR(mem->attach)) {
ret = PTR_ERR(mem->attach);
mem->attach = NULL;
goto err;
}
mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
if (IS_ERR(mem->sgt)) {
ret = PTR_ERR(mem->sgt);
mem->sgt = NULL;
goto err;
}
for_each_sgtable_dma_sg(mem->sgt, sg, i)
total_size += sg_dma_len(sg);
if (total_size < off + len)
return -EINVAL;
mem->dmabuf_offset = off;
mem->size = len;
return 0;
err:
io_release_dmabuf(mem);
return ret;
}
static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
unsigned long off = area->mem.dmabuf_offset;
struct scatterlist *sg;
unsigned i, niov_idx = 0;
if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
return -EINVAL;
for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
dma_addr_t dma = sg_dma_address(sg);
unsigned long sg_len = sg_dma_len(sg);
unsigned long sg_off = min(sg_len, off);
off -= sg_off;
sg_len -= sg_off;
dma += sg_off;
while (sg_len && niov_idx < area->nia.num_niovs) {
struct net_iov *niov = &area->nia.niovs[niov_idx];
if (net_mp_niov_set_dma_addr(niov, dma))
return 0;
sg_len -= PAGE_SIZE;
dma += PAGE_SIZE;
niov_idx++;
}
}
return niov_idx;
}
static int io_import_umem(struct io_zcrx_ifq *ifq,
struct io_zcrx_mem *mem,
struct io_uring_zcrx_area_reg *area_reg)
{
struct page **pages;
int nr_pages;
if (area_reg->dmabuf_fd)
return -EINVAL;
if (!area_reg->addr)
return -EFAULT;
pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
&nr_pages);
if (IS_ERR(pages))
return PTR_ERR(pages);
mem->pages = pages;
mem->nr_folios = nr_pages;
mem->size = area_reg->len;
return 0;
}
static void io_release_area_mem(struct io_zcrx_mem *mem)
{
if (mem->is_dmabuf) {
io_release_dmabuf(mem);
return;
}
if (mem->pages) {
unpin_user_pages(mem->pages, mem->nr_folios);
kvfree(mem->pages);
}
}
static int io_import_area(struct io_zcrx_ifq *ifq,
struct io_zcrx_mem *mem,
struct io_uring_zcrx_area_reg *area_reg)
{
int ret;
ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
if (ret)
return ret;
if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
return -EINVAL;
if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
return io_import_dmabuf(ifq, mem, area_reg);
return io_import_umem(ifq, mem, area_reg);
}
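A hedged userspace sketch of describing a dmabuf-backed area to io_import_area() above. The field names follow the struct io_uring_zcrx_area_reg accesses in this diff; the surrounding ifq registration flow (passing the area descriptor along with io_uring_zcrx_ifq_reg) is unchanged and not shown, and area_len/dmabuf_fd are placeholders. For the dmabuf case, addr and len are interpreted as a page-aligned offset and length inside the exported buffer rather than a user address.

struct io_uring_zcrx_area_reg area = {
	.addr		= 0,				/* page-aligned offset into the dmabuf */
	.len		= area_len,			/* page-aligned length within the export */
	.flags		= IORING_ZCRX_AREA_DMABUF,	/* take the io_import_dmabuf() path */
	.dmabuf_fd	= dmabuf_fd,			/* fd from the exporting device driver */
};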
static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
struct io_zcrx_area *area, int nr_mapped)
{
int i;
for (i = 0; i < nr_mapped; i++) {
netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);
dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
DMA_FROM_DEVICE, IO_DMA_ATTR);
}
}
static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
struct io_zcrx_area *area, int nr_mapped)
{
int i;
for (i = 0; i < nr_mapped; i++) {
struct net_iov *niov = &area->nia.niovs[i];
dma_addr_t dma;
if (area->mem.is_dmabuf)
io_release_dmabuf(&area->mem);
else
io_zcrx_unmap_umem(ifq, area, nr_mapped);
dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
DMA_FROM_DEVICE, IO_DMA_ATTR);
net_mp_niov_set_dma_addr(niov, 0);
}
for (i = 0; i < area->nia.num_niovs; i++)
net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
}
static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
@@ -58,20 +236,16 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *are
area->is_mapped = false;
}
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
int i;
guard(mutex)(&ifq->dma_lock);
if (area->is_mapped)
return 0;
for (i = 0; i < area->nia.num_niovs; i++) {
struct net_iov *niov = &area->nia.niovs[i];
dma_addr_t dma;
dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE,
DMA_FROM_DEVICE, IO_DMA_ATTR);
dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
if (dma_mapping_error(ifq->dev, dma))
break;
if (net_mp_niov_set_dma_addr(niov, dma)) {
@@ -80,9 +254,24 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
break;
}
}
return i;
}
if (i != area->nia.num_niovs) {
__io_zcrx_unmap_area(ifq, area, i);
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
unsigned nr;
guard(mutex)(&ifq->dma_lock);
if (area->is_mapped)
return 0;
if (area->mem.is_dmabuf)
nr = io_zcrx_map_area_dmabuf(ifq, area);
else
nr = io_zcrx_map_area_umem(ifq, area);
if (nr != area->nia.num_niovs) {
__io_zcrx_unmap_area(ifq, area, nr);
return -EINVAL;
}
@@ -118,13 +307,6 @@ struct io_zcrx_args {
static const struct memory_provider_ops io_uring_pp_zc_ops;
static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
struct net_iov_area *owner = net_iov_owner(niov);
return container_of(owner, struct io_zcrx_area, nia);
}
static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
@@ -147,17 +329,12 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
atomic_inc(io_get_user_counter(niov));
}
static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
return area->pages[net_iov_idx(niov)];
}
static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_ifq_reg *reg,
struct io_uring_region_desc *rd)
struct io_uring_region_desc *rd,
u32 id)
{
u64 mmap_offset;
size_t off, size;
void *ptr;
int ret;
@@ -167,12 +344,14 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
if (size > rd->size)
return -EINVAL;
ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
IORING_MAP_OFF_ZCRX_REGION);
mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
mmap_offset += id << IORING_OFF_PBUF_SHIFT;
ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
if (ret < 0)
return ret;
ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
ptr = io_region_get_ptr(&ifq->region);
ifq->rq_ring = (struct io_uring *)ptr;
ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
return 0;
@@ -180,7 +359,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
io_free_region(ifq->ctx, &ifq->region);
ifq->rq_ring = NULL;
ifq->rqes = NULL;
}
@@ -188,53 +367,44 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
static void io_zcrx_free_area(struct io_zcrx_area *area)
{
io_zcrx_unmap_area(area->ifq, area);
io_release_area_mem(&area->mem);
kvfree(area->freelist);
kvfree(area->nia.niovs);
kvfree(area->user_refs);
if (area->pages) {
unpin_user_pages(area->pages, area->nr_folios);
kvfree(area->pages);
}
kfree(area);
}
#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF)
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
struct io_zcrx_area **res,
struct io_uring_zcrx_area_reg *area_reg)
{
struct io_zcrx_area *area;
int i, ret, nr_pages, nr_iovs;
struct iovec iov;
unsigned nr_iovs;
int i, ret;
if (area_reg->flags || area_reg->rq_area_token)
if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
return -EINVAL;
if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
if (area_reg->rq_area_token)
return -EINVAL;
if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
if (area_reg->__resv2[0] || area_reg->__resv2[1])
return -EINVAL;
iov.iov_base = u64_to_user_ptr(area_reg->addr);
iov.iov_len = area_reg->len;
ret = io_buffer_validate(&iov);
if (ret)
return ret;
ret = -ENOMEM;
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (!area)
goto err;
area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
&nr_pages);
if (IS_ERR(area->pages)) {
ret = PTR_ERR(area->pages);
area->pages = NULL;
ret = io_import_area(ifq, &area->mem, area_reg);
if (ret)
goto err;
}
area->nr_folios = nr_iovs = nr_pages;
nr_iovs = area->mem.size >> PAGE_SHIFT;
area->nia.num_niovs = nr_iovs;
ret = -ENOMEM;
area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->nia.niovs)
@@ -245,9 +415,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
if (!area->freelist)
goto err;
for (i = 0; i < nr_iovs; i++)
area->freelist[i] = i;
area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->user_refs)
@@ -341,6 +508,16 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
kfree(ifq);
}
struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
unsigned int id)
{
struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);
lockdep_assert_held(&ctx->mmap_lock);
return ifq ? &ifq->region : NULL;
}
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
@@ -350,6 +527,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_region_desc rd;
struct io_zcrx_ifq *ifq;
int ret;
u32 id;
/*
* 1. Interface queue allocation.
@@ -362,8 +540,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
ctx->flags & IORING_SETUP_CQE32))
return -EINVAL;
if (ctx->ifq)
return -EBUSY;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
@@ -386,29 +562,37 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ifq = io_zcrx_ifq_alloc(ctx);
if (!ifq)
return -ENOMEM;
ifq->rq_entries = reg.rq_entries;
ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
scoped_guard(mutex, &ctx->mmap_lock) {
/* preallocate id */
ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
if (ret)
goto ifq_free;
}
ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
if (ret)
goto err;
ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
&ifq->netdev_tracker, GFP_KERNEL);
if (!ifq->netdev) {
ret = -ENODEV;
goto err;
}
ifq->dev = ifq->netdev->dev.parent;
if (!ifq->dev) {
ret = -EOPNOTSUPP;
goto err;
}
get_device(ifq->dev);
ret = io_zcrx_create_area(ifq, &ifq->area, &area);
if (ret)
goto err;
ifq->rq_entries = reg.rq_entries;
ret = -ENODEV;
ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
&ifq->netdev_tracker, GFP_KERNEL);
if (!ifq->netdev)
goto err;
ifq->dev = ifq->netdev->dev.parent;
ret = -EOPNOTSUPP;
if (!ifq->dev)
goto err;
get_device(ifq->dev);
mp_param.mp_ops = &io_uring_pp_zc_ops;
mp_param.mp_priv = ifq;
ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
@@ -419,6 +603,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
reg.offsets.rqes = sizeof(struct io_uring);
reg.offsets.head = offsetof(struct io_uring, head);
reg.offsets.tail = offsetof(struct io_uring, tail);
reg.zcrx_id = id;
scoped_guard(mutex, &ctx->mmap_lock) {
/* publish ifq */
ret = -ENOMEM;
if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
goto err;
}
if (copy_to_user(arg, &reg, sizeof(reg)) ||
copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
@@ -426,24 +618,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ret = -EFAULT;
goto err;
}
ctx->ifq = ifq;
return 0;
err:
scoped_guard(mutex, &ctx->mmap_lock)
xa_erase(&ctx->zcrx_ctxs, id);
ifq_free:
io_zcrx_ifq_free(ifq);
return ret;
}
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
struct io_zcrx_ifq *ifq = ctx->ifq;
struct io_zcrx_ifq *ifq;
unsigned long id;
lockdep_assert_held(&ctx->uring_lock);
if (!ifq)
return;
while (1) {
scoped_guard(mutex, &ctx->mmap_lock) {
ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
if (ifq)
xa_erase(&ctx->zcrx_ctxs, id);
}
if (!ifq)
break;
io_zcrx_ifq_free(ifq);
}
ctx->ifq = NULL;
io_zcrx_ifq_free(ifq);
xa_destroy(&ctx->zcrx_ctxs);
}
static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
@@ -500,12 +702,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
struct io_zcrx_ifq *ifq;
unsigned long index;
lockdep_assert_held(&ctx->uring_lock);
if (!ctx->ifq)
return;
io_zcrx_scrub(ctx->ifq);
io_close_queue(ctx->ifq);
xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
io_zcrx_scrub(ifq);
io_close_queue(ifq);
}
}
static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
@@ -742,6 +947,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
size_t copied = 0;
int ret = 0;
if (area->mem.is_dmabuf)
return -EFAULT;
while (len) {
size_t copy_size = min_t(size_t, PAGE_SIZE, len);
const int dst_off = 0;


@@ -3,10 +3,24 @@
#define IOU_ZC_RX_H
#include <linux/io_uring_types.h>
#include <linux/dma-buf.h>
#include <linux/socket.h>
#include <net/page_pool/types.h>
#include <net/net_trackers.h>
struct io_zcrx_mem {
unsigned long size;
bool is_dmabuf;
struct page **pages;
unsigned long nr_folios;
struct dma_buf_attachment *attach;
struct dma_buf *dmabuf;
struct sg_table *sgt;
unsigned long dmabuf_offset;
};
struct io_zcrx_area {
struct net_iov_area nia;
struct io_zcrx_ifq *ifq;
@@ -14,13 +28,13 @@ struct io_zcrx_area {
bool is_mapped;
u16 area_id;
struct page **pages;
unsigned long nr_folios;
/* freelist */
spinlock_t freelist_lock ____cacheline_aligned_in_smp;
u32 free_count;
u32 *freelist;
struct io_zcrx_mem mem;
};
struct io_zcrx_ifq {
@@ -39,6 +53,7 @@ struct io_zcrx_ifq {
netdevice_tracker netdev_tracker;
spinlock_t lock;
struct mutex dma_lock;
struct io_mapped_region region;
};
#if defined(CONFIG_IO_URING_ZCRX)
@@ -49,6 +64,8 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
struct socket *sock, unsigned int flags,
unsigned issue_flags, unsigned int *len);
struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
unsigned int id);
#else
static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
@@ -67,6 +84,11 @@ static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
{
return -EOPNOTSUPP;
}
static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
unsigned int id)
{
return NULL;
}
#endif
int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);