for-6.8/io_uring-2024-01-08

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmWcOk0QHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpq2wEACgP1Gvfb2cR65B/6tkLJ6neKnYOPVSYx9F
 4Mv6KS306vmOsE67LqynhtAe/Hgfv0NKADN+mKLLV8IdwhneLAwFxRoHl8QPO2dW
 DIxohDMLIqzY/SOUBqBHMS8b5lEr79UVh8vXjAuuYCBTcHBndi4hj0S0zkwf5YiK
 Vma6ty+TnzmOv+c6+OCgZZlwFhkxD1pQBM5iOlFRAkdt3Er/2e8FqthK0/od7Fgy
 Ii9xdxYZU9KTsM4R+IWhqZ1rj1qUdtMPSGzFRbzFt3b6NdANRZT9MUlxvSkuHPIE
 P/9anpmgu41gd2JbHuyVnw+4jdZMhhZUPUYtphmQ5n35rcFZjv1gnJalH/Nw/DTE
 78bDxxP0+35bkuq5MwHfvA3imVsGnvnFx5MlZBoqvv+VK4S3Q6E2+ARQZdiG+2Is
 8Uyjzzt0nq2waUO3H1JLkgiM9J9LFqaYQLE68u569m4NCfRSpoZva9KVmNpre2K0
 9WdGy2gfzaYAQkGud2TULzeLiEbWfvZAL0o43jYXkVPRKmnffCEphf2X6C2IDV/3
 5ZR4bandRRC6DVnE+8n1MB06AYtpJPB/w3rplON+l/V5Gnb7wRNwiUrBr/F15OOy
 OPbAnP6k56wY/JRpgzdbNqwGi8EvGWX6t3Kqjp2mczhs2as8M193FKS2xppl1TYl
 BPyTsAfbdQ==
 =5v/U
 -----END PGP SIGNATURE-----

Merge tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:
 "Mostly just come fixes and cleanups, but one feature as well. In
  detail:

   - Harden the check for handling IOPOLL based on return (Pavel)

   - Various minor optimizations (Pavel)

   - Drop remnants of SCM_RIGHTS fd passing support, now that it's no
     longer supported since 6.7 (me)

   - Fix for a case where bytes_done wasn't initialized properly on a
     failure condition for read/write requests (me)

   - Move the register related code to a separate file (me)

   - Add support for returning the provided ring buffer head (me)

   - Add support for adding a direct descriptor to the normal file table
     (me, Christian Brauner)

   - Fix for ensuring pending task_work for a ring with DEFER_TASKRUN is
     run even if we timeout waiting (me)"

* tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux:
  io_uring: ensure local task_work is run on wait timeout
  io_uring/kbuf: add method for returning provided buffer ring head
  io_uring/rw: ensure io->bytes_done is always initialized
  io_uring: drop any code related to SCM_RIGHTS
  io_uring/unix: drop usage of io_uring socket
  io_uring/register: move io_uring_register(2) related code to register.c
  io_uring/openclose: add support for IORING_OP_FIXED_FD_INSTALL
  io_uring/cmd: inline io_uring_cmd_get_task
  io_uring/cmd: inline io_uring_cmd_do_in_task_lazy
  io_uring: split out cmd api into a separate header
  io_uring: optimise ltimeout for inline execution
  io_uring: don't check iopoll if request completes
This commit is contained in:
Linus Torvalds 2024-01-11 14:19:23 -08:00
commit 4c72e2b8c4
26 changed files with 890 additions and 947 deletions

View file

@ -11142,6 +11142,7 @@ L: io-uring@vger.kernel.org
S: Maintained S: Maintained
T: git git://git.kernel.dk/linux-block T: git git://git.kernel.dk/linux-block
T: git git://git.kernel.dk/liburing T: git git://git.kernel.dk/liburing
F: include/linux/io_uring/
F: include/linux/io_uring.h F: include/linux/io_uring.h
F: include/linux/io_uring_types.h F: include/linux/io_uring_types.h
F: include/trace/events/io_uring.h F: include/trace/events/io_uring.h

View file

@ -36,7 +36,7 @@
#include <linux/sched/mm.h> #include <linux/sched/mm.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/cdev.h> #include <linux/cdev.h>
#include <linux/io_uring.h> #include <linux/io_uring/cmd.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/mm.h> #include <linux/mm.h>

View file

@ -5,7 +5,7 @@
*/ */
#include <linux/ptrace.h> /* for force_successful_syscall_return */ #include <linux/ptrace.h> /* for force_successful_syscall_return */
#include <linux/nvme_ioctl.h> #include <linux/nvme_ioctl.h>
#include <linux/io_uring.h> #include <linux/io_uring/cmd.h>
#include "nvme.h" #include "nvme.h"
enum { enum {

View file

@ -6,66 +6,13 @@
#include <linux/xarray.h> #include <linux/xarray.h>
#include <uapi/linux/io_uring.h> #include <uapi/linux/io_uring.h>
enum io_uring_cmd_flags {
IO_URING_F_COMPLETE_DEFER = 1,
IO_URING_F_UNLOCKED = 2,
/* the request is executed from poll, it should not be freed */
IO_URING_F_MULTISHOT = 4,
/* executed by io-wq */
IO_URING_F_IOWQ = 8,
/* int's last bit, sign checks are usually faster than a bit test */
IO_URING_F_NONBLOCK = INT_MIN,
/* ctx state flags, for URING_CMD */
IO_URING_F_SQE128 = (1 << 8),
IO_URING_F_CQE32 = (1 << 9),
IO_URING_F_IOPOLL = (1 << 10),
/* set when uring wants to cancel a previously issued command */
IO_URING_F_CANCEL = (1 << 11),
IO_URING_F_COMPAT = (1 << 12),
};
/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
#define IORING_URING_CMD_CANCELABLE (1U << 30)
struct io_uring_cmd {
struct file *file;
const struct io_uring_sqe *sqe;
/* callback to defer completions to task context */
void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
u32 cmd_op;
u32 flags;
u8 pdu[32]; /* available inline for free use */
};
static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
{
return sqe->cmd;
}
#if defined(CONFIG_IO_URING) #if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd);
void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
unsigned issue_flags);
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all); void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk); void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void); void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode); const char *io_uring_get_opcode(u8 opcode);
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
void (*task_work_cb)(struct io_uring_cmd *, unsigned), bool io_is_uring_fops(struct file *file);
unsigned flags);
/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned));
static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
}
static inline void io_uring_files_cancel(void) static inline void io_uring_files_cancel(void)
{ {
@ -84,32 +31,7 @@ static inline void io_uring_free(struct task_struct *tsk)
if (tsk->io_uring) if (tsk->io_uring)
__io_uring_free(tsk); __io_uring_free(tsk);
} }
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags);
struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
#else #else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd)
{
return -EOPNOTSUPP;
}
static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
ssize_t ret2, unsigned issue_flags)
{
}
static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
}
static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
}
static inline struct sock *io_uring_get_socket(struct file *file)
{
return NULL;
}
static inline void io_uring_task_cancel(void) static inline void io_uring_task_cancel(void)
{ {
} }
@ -128,13 +50,9 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
{ {
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, static inline bool io_is_uring_fops(struct file *file)
unsigned int issue_flags)
{ {
} return false;
static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
{
return NULL;
} }
#endif #endif

View file

@ -0,0 +1,77 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_IO_URING_CMD_H
#define _LINUX_IO_URING_CMD_H
#include <uapi/linux/io_uring.h>
#include <linux/io_uring_types.h>
/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
#define IORING_URING_CMD_CANCELABLE (1U << 30)
struct io_uring_cmd {
struct file *file;
const struct io_uring_sqe *sqe;
/* callback to defer completions to task context */
void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
u32 cmd_op;
u32 flags;
u8 pdu[32]; /* available inline for free use */
};
static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
{
return sqe->cmd;
}
#if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd);
void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
unsigned issue_flags);
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned),
unsigned flags);
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags);
#else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd)
{
return -EOPNOTSUPP;
}
static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
ssize_t ret2, unsigned issue_flags)
{
}
static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned),
unsigned flags)
{
}
static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
}
#endif
/* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
}
static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
}
static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
{
return cmd_to_io_kiocb(cmd)->task;
}
#endif /* _LINUX_IO_URING_CMD_H */

View file

@ -7,6 +7,37 @@
#include <linux/llist.h> #include <linux/llist.h>
#include <uapi/linux/io_uring.h> #include <uapi/linux/io_uring.h>
enum {
/*
* A hint to not wake right away but delay until there are enough of
* tw's queued to match the number of CQEs the task is waiting for.
*
* Must not be used with requests generating more than one CQE.
* It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
*/
IOU_F_TWQ_LAZY_WAKE = 1,
};
enum io_uring_cmd_flags {
IO_URING_F_COMPLETE_DEFER = 1,
IO_URING_F_UNLOCKED = 2,
/* the request is executed from poll, it should not be freed */
IO_URING_F_MULTISHOT = 4,
/* executed by io-wq */
IO_URING_F_IOWQ = 8,
/* int's last bit, sign checks are usually faster than a bit test */
IO_URING_F_NONBLOCK = INT_MIN,
/* ctx state flags, for URING_CMD */
IO_URING_F_SQE128 = (1 << 8),
IO_URING_F_CQE32 = (1 << 9),
IO_URING_F_IOPOLL = (1 << 10),
/* set when uring wants to cancel a previously issued command */
IO_URING_F_CANCEL = (1 << 11),
IO_URING_F_COMPAT = (1 << 12),
};
struct io_wq_work_node { struct io_wq_work_node {
struct io_wq_work_node *next; struct io_wq_work_node *next;
}; };
@ -358,9 +389,6 @@ struct io_ring_ctx {
struct wait_queue_head rsrc_quiesce_wq; struct wait_queue_head rsrc_quiesce_wq;
unsigned rsrc_quiesce; unsigned rsrc_quiesce;
#if defined(CONFIG_UNIX)
struct socket *ring_sock;
#endif
/* hashed buffered write serialization */ /* hashed buffered write serialization */
struct io_wq_hash *hash_map; struct io_wq_hash *hash_map;

View file

@ -71,6 +71,7 @@ struct io_uring_sqe {
__u32 uring_cmd_flags; __u32 uring_cmd_flags;
__u32 waitid_flags; __u32 waitid_flags;
__u32 futex_flags; __u32 futex_flags;
__u32 install_fd_flags;
}; };
__u64 user_data; /* data to be passed back at completion time */ __u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */ /* pack this to avoid bogus arm OABI complaints */
@ -253,6 +254,7 @@ enum io_uring_op {
IORING_OP_FUTEX_WAIT, IORING_OP_FUTEX_WAIT,
IORING_OP_FUTEX_WAKE, IORING_OP_FUTEX_WAKE,
IORING_OP_FUTEX_WAITV, IORING_OP_FUTEX_WAITV,
IORING_OP_FIXED_FD_INSTALL,
/* this goes last, obviously */ /* this goes last, obviously */
IORING_OP_LAST, IORING_OP_LAST,
@ -386,6 +388,13 @@ enum {
/* Pass through the flags from sqe->file_index to cqe->flags */ /* Pass through the flags from sqe->file_index to cqe->flags */
#define IORING_MSG_RING_FLAGS_PASS (1U << 1) #define IORING_MSG_RING_FLAGS_PASS (1U << 1)
/*
* IORING_OP_FIXED_FD_INSTALL flags (sqe->install_fd_flags)
*
* IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC
*/
#define IORING_FIXED_FD_NO_CLOEXEC (1U << 0)
/* /*
* IO completion data structure (Completion Queue Entry) * IO completion data structure (Completion Queue Entry)
*/ */
@ -558,6 +567,9 @@ enum {
/* register a range of fixed file slots for automatic slot allocation */ /* register a range of fixed file slots for automatic slot allocation */
IORING_REGISTER_FILE_ALLOC_RANGE = 25, IORING_REGISTER_FILE_ALLOC_RANGE = 25,
/* return status information for a buffer group */
IORING_REGISTER_PBUF_STATUS = 26,
/* this goes last */ /* this goes last */
IORING_REGISTER_LAST, IORING_REGISTER_LAST,
@ -684,6 +696,13 @@ struct io_uring_buf_reg {
__u64 resv[3]; __u64 resv[3];
}; };
/* argument for IORING_REGISTER_PBUF_STATUS */
struct io_uring_buf_status {
__u32 buf_group; /* input */
__u32 head; /* output */
__u32 resv[8];
};
/* /*
* io_uring_restriction->opcode values * io_uring_restriction->opcode values
*/ */

View file

@ -8,6 +8,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
statx.o net.o msg_ring.o timeout.o \ statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \ sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o \ cancel.o kbuf.o rsrc.o rw.o opdef.o \
notif.o waitid.o notif.o waitid.o register.o
obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_FUTEX) += futex.o

View file

@ -87,13 +87,10 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
io_file_bitmap_clear(&ctx->file_table, slot_index); io_file_bitmap_clear(&ctx->file_table, slot_index);
} }
ret = io_scm_file_account(ctx, file); *io_get_tag_slot(ctx->file_data, slot_index) = 0;
if (!ret) { io_fixed_file_set(file_slot, file);
*io_get_tag_slot(ctx->file_data, slot_index) = 0; io_file_bitmap_set(&ctx->file_table, slot_index);
io_fixed_file_set(file_slot, file); return 0;
io_file_bitmap_set(&ctx->file_table, slot_index);
}
return ret;
} }
int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,

View file

@ -60,7 +60,6 @@
#include <linux/net.h> #include <linux/net.h>
#include <net/sock.h> #include <net/sock.h>
#include <net/af_unix.h> #include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h> #include <linux/anon_inodes.h>
#include <linux/sched/mm.h> #include <linux/sched/mm.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
@ -70,6 +69,7 @@
#include <linux/fadvise.h> #include <linux/fadvise.h>
#include <linux/task_work.h> #include <linux/task_work.h>
#include <linux/io_uring.h> #include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>
#include <linux/audit.h> #include <linux/audit.h>
#include <linux/security.h> #include <linux/security.h>
#include <asm/shmparam.h> #include <asm/shmparam.h>
@ -85,6 +85,7 @@
#include "opdef.h" #include "opdef.h"
#include "refs.h" #include "refs.h"
#include "tctx.h" #include "tctx.h"
#include "register.h"
#include "sqpoll.h" #include "sqpoll.h"
#include "fdinfo.h" #include "fdinfo.h"
#include "kbuf.h" #include "kbuf.h"
@ -103,9 +104,6 @@
#define IORING_MAX_ENTRIES 32768 #define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC) IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@ -129,11 +127,6 @@ enum {
IO_CHECK_CQ_DROPPED_BIT, IO_CHECK_CQ_DROPPED_BIT,
}; };
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
IO_EVENTFD_OP_FREE_BIT,
};
struct io_defer_entry { struct io_defer_entry {
struct list_head list; struct list_head list;
struct io_kiocb *req; struct io_kiocb *req;
@ -177,19 +170,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = {
}; };
#endif #endif
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
if (io_is_uring_fops(file)) {
struct io_ring_ctx *ctx = file->private_data;
return ctx->ring_sock->sk;
}
#endif
return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);
static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{ {
if (!wq_list_empty(&ctx->submit_state.compl_reqs) || if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
@ -554,8 +534,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
} }
} }
void io_eventfd_ops(struct rcu_head *rcu)
static void io_eventfd_ops(struct rcu_head *rcu)
{ {
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
int ops = atomic_xchg(&ev_fd->ops, 0); int ops = atomic_xchg(&ev_fd->ops, 0);
@ -1898,14 +1877,19 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
io_req_complete_defer(req); io_req_complete_defer(req);
else else
io_req_complete_post(req, issue_flags); io_req_complete_post(req, issue_flags);
} else if (ret != IOU_ISSUE_SKIP_COMPLETE)
return ret;
/* If the op doesn't have a file, we're not polling for it */ return 0;
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) }
io_iopoll_req_issued(req, issue_flags);
return 0; if (ret == IOU_ISSUE_SKIP_COMPLETE) {
ret = 0;
io_arm_ltimeout(req);
/* If the op doesn't have a file, we're not polling for it */
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
io_iopoll_req_issued(req, issue_flags);
}
return ret;
} }
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
@ -2076,9 +2060,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
* We async punt it if the file wasn't marked NOWAIT, or if the file * We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts * doesn't support non-blocking read/write attempts
*/ */
if (likely(!ret)) if (unlikely(ret))
io_arm_ltimeout(req);
else
io_queue_async(req, ret); io_queue_async(req, ret);
} }
@ -2633,8 +2615,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
atomic_set(&ctx->cq_wait_nr, 0); atomic_set(&ctx->cq_wait_nr, 0);
if (ret < 0)
break;
/* /*
* Run task_work after scheduling and before io_should_wake(). * Run task_work after scheduling and before io_should_wake().
* If we got woken because of task_work being processed, run it * If we got woken because of task_work being processed, run it
@ -2644,6 +2624,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
if (!llist_empty(&ctx->work_llist)) if (!llist_empty(&ctx->work_llist))
io_run_local_work(ctx); io_run_local_work(ctx);
/*
* Non-local task_work will be run on exit to userspace, but
* if we're using DEFER_TASKRUN, then we could have waited
* with a timeout for a number of requests. If the timeout
* hits, we could have some requests ready to process. Ensure
* this break is _after_ we have run task_work, to avoid
* deferring running potentially pending requests until the
* next time we wait for events.
*/
if (ret < 0)
break;
check_cq = READ_ONCE(ctx->check_cq); check_cq = READ_ONCE(ctx->check_cq);
if (unlikely(check_cq)) { if (unlikely(check_cq)) {
/* let the caller flush overflows, retry */ /* let the caller flush overflows, retry */
@ -2831,61 +2823,6 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
return off; return off;
} }
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async)
{
struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
if (!ev_fd)
return -ENOMEM;
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd);
kfree(ev_fd);
return ret;
}
spin_lock(&ctx->completion_lock);
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true;
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
atomic_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
return 0;
}
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd) {
ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
call_rcu(&ev_fd->rcu, io_eventfd_ops);
return 0;
}
return -ENXIO;
}
static void io_req_caches_free(struct io_ring_ctx *ctx) static void io_req_caches_free(struct io_ring_ctx *ctx)
{ {
struct io_kiocb *req; struct io_kiocb *req;
@ -2938,13 +2875,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_rsrc_node_destroy(ctx, ctx->rsrc_node); io_rsrc_node_destroy(ctx, ctx->rsrc_node);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
ctx->ring_sock->file = NULL; /* so that iput() is called */
sock_release(ctx->ring_sock);
}
#endif
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
@ -2984,7 +2914,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb)
percpu_ref_put(&ctx->refs); percpu_ref_put(&ctx->refs);
} }
static __cold void io_activate_pollwq(struct io_ring_ctx *ctx) __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{ {
spin_lock(&ctx->completion_lock); spin_lock(&ctx->completion_lock);
/* already activated or in progress */ /* already activated or in progress */
@ -3043,19 +2973,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
return mask; return mask;
} }
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
const struct cred *creds;
creds = xa_erase(&ctx->personalities, id);
if (creds) {
put_cred(creds);
return 0;
}
return -EINVAL;
}
struct io_tctx_exit { struct io_tctx_exit {
struct callback_head task_work; struct callback_head task_work;
struct completion completion; struct completion completion;
@ -3866,32 +3783,12 @@ static int io_uring_install_fd(struct file *file)
/* /*
* Allocate an anonymous fd, this is what constitutes the application * Allocate an anonymous fd, this is what constitutes the application
* visible backing of an io_uring instance. The application mmaps this * visible backing of an io_uring instance. The application mmaps this
* fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, * fd to gain access to the SQ/CQ ring details.
* we have to tie this fd to a socket for file garbage collection purposes.
*/ */
static struct file *io_uring_get_file(struct io_ring_ctx *ctx) static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{ {
struct file *file; return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
#if defined(CONFIG_UNIX)
int ret;
ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
&ctx->ring_sock);
if (ret)
return ERR_PTR(ret);
#endif
file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC, NULL); O_RDWR | O_CLOEXEC, NULL);
#if defined(CONFIG_UNIX)
if (IS_ERR(file)) {
sock_release(ctx->ring_sock);
ctx->ring_sock = NULL;
} else {
ctx->ring_sock->file = file;
}
#endif
return file;
} }
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
@ -4158,506 +4055,6 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params); return io_uring_setup(entries, params);
} }
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
int i, ret;
size = struct_size(p, ops, nr_args);
if (size == SIZE_MAX)
return -EOVERFLOW;
p = kzalloc(size, GFP_KERNEL);
if (!p)
return -ENOMEM;
ret = -EFAULT;
if (copy_from_user(p, arg, size))
goto out;
ret = -EINVAL;
if (memchr_inv(p, 0, size))
goto out;
p->last_op = IORING_OP_LAST - 1;
if (nr_args > IORING_OP_LAST)
nr_args = IORING_OP_LAST;
for (i = 0; i < nr_args; i++) {
p->ops[i].op = i;
if (!io_issue_defs[i].not_supported)
p->ops[i].flags = IO_URING_OP_SUPPORTED;
}
p->ops_len = i;
ret = 0;
if (copy_to_user(arg, p, size))
ret = -EFAULT;
out:
kfree(p);
return ret;
}
static int io_register_personality(struct io_ring_ctx *ctx)
{
const struct cred *creds;
u32 id;
int ret;
creds = get_current_cred();
ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
if (ret < 0) {
put_cred(creds);
return ret;
}
return id;
}
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
void __user *arg, unsigned int nr_args)
{
struct io_uring_restriction *res;
size_t size;
int i, ret;
/* Restrictions allowed only if rings started disabled */
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EBADFD;
/* We allow only a single restrictions registration */
if (ctx->restrictions.registered)
return -EBUSY;
if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
return -EINVAL;
size = array_size(nr_args, sizeof(*res));
if (size == SIZE_MAX)
return -EOVERFLOW;
res = memdup_user(arg, size);
if (IS_ERR(res))
return PTR_ERR(res);
ret = 0;
for (i = 0; i < nr_args; i++) {
switch (res[i].opcode) {
case IORING_RESTRICTION_REGISTER_OP:
if (res[i].register_op >= IORING_REGISTER_LAST) {
ret = -EINVAL;
goto out;
}
__set_bit(res[i].register_op,
ctx->restrictions.register_op);
break;
case IORING_RESTRICTION_SQE_OP:
if (res[i].sqe_op >= IORING_OP_LAST) {
ret = -EINVAL;
goto out;
}
__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
break;
case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
break;
case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
break;
default:
ret = -EINVAL;
goto out;
}
}
out:
/* Reset all restrictions if an error happened */
if (ret != 0)
memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
else
ctx->restrictions.registered = true;
kfree(res);
return ret;
}
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EBADFD;
if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
/*
* Lazy activation attempts would fail if it was polled before
* submitter_task is set.
*/
if (wq_has_sleeper(&ctx->poll_wq))
io_activate_pollwq(ctx);
}
if (ctx->restrictions.registered)
ctx->restricted = 1;
ctx->flags &= ~IORING_SETUP_R_DISABLED;
if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
return 0;
}
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
cpumask_var_t new_mask)
{
int ret;
if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_wq_cpu_affinity(current->io_uring, new_mask);
} else {
mutex_unlock(&ctx->uring_lock);
ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
mutex_lock(&ctx->uring_lock);
}
return ret;
}
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
void __user *arg, unsigned len)
{
cpumask_var_t new_mask;
int ret;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
return -ENOMEM;
cpumask_clear(new_mask);
if (len > cpumask_size())
len = cpumask_size();
if (in_compat_syscall()) {
ret = compat_get_bitmap(cpumask_bits(new_mask),
(const compat_ulong_t __user *)arg,
len * 8 /* CHAR_BIT */);
} else {
ret = copy_from_user(new_mask, arg, len);
}
if (ret) {
free_cpumask_var(new_mask);
return -EFAULT;
}
ret = __io_register_iowq_aff(ctx, new_mask);
free_cpumask_var(new_mask);
return ret;
}
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
return __io_register_iowq_aff(ctx, NULL);
}
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
void __user *arg)
__must_hold(&ctx->uring_lock)
{
struct io_tctx_node *node;
struct io_uring_task *tctx = NULL;
struct io_sq_data *sqd = NULL;
__u32 new_count[2];
int i, ret;
if (copy_from_user(new_count, arg, sizeof(new_count)))
return -EFAULT;
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i] > INT_MAX)
return -EINVAL;
if (ctx->flags & IORING_SETUP_SQPOLL) {
sqd = ctx->sq_data;
if (sqd) {
/*
* Observe the correct sqd->lock -> ctx->uring_lock
* ordering. Fine to drop uring_lock here, we hold
* a ref to the ctx.
*/
refcount_inc(&sqd->refs);
mutex_unlock(&ctx->uring_lock);
mutex_lock(&sqd->lock);
mutex_lock(&ctx->uring_lock);
if (sqd->thread)
tctx = sqd->thread->io_uring;
}
} else {
tctx = current->io_uring;
}
BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i])
ctx->iowq_limits[i] = new_count[i];
ctx->iowq_limits_set = true;
if (tctx && tctx->io_wq) {
ret = io_wq_max_workers(tctx->io_wq, new_count);
if (ret)
goto err;
} else {
memset(new_count, 0, sizeof(new_count));
}
if (sqd) {
mutex_unlock(&sqd->lock);
io_put_sq_data(sqd);
}
if (copy_to_user(arg, new_count, sizeof(new_count)))
return -EFAULT;
/* that's it for SQPOLL, only the SQPOLL task creates requests */
if (sqd)
return 0;
/* now propagate the restriction to all registered users */
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
if (WARN_ON_ONCE(!tctx->io_wq))
continue;
for (i = 0; i < ARRAY_SIZE(new_count); i++)
new_count[i] = ctx->iowq_limits[i];
/* ignore errors, it always returns zero anyway */
(void)io_wq_max_workers(tctx->io_wq, new_count);
}
return 0;
err:
if (sqd) {
mutex_unlock(&sqd->lock);
io_put_sq_data(sqd);
}
return ret;
}
/*
 * Dispatch a single io_uring_register(2) opcode. Called with uring_lock
 * held; some handlers (e.g. the iowq ones) temporarily drop and retake it,
 * hence the __releases/__acquires annotations. Per-opcode arg/nr_args
 * validation is done inline before calling the handler.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	/* SINGLE_ISSUER-style restriction: only the submitter may register */
	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		/* array_index_nospec() clamps opcode against speculation */
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		/* same as above, but flags the eventfd as async-only */
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		/* nr_args doubles as the personality id here */
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		/* nr_args is the byte length of the user cpumask */
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
/*
 * io_uring_register(2) entry point: resolve the ring file (either via a
 * normal fd lookup, or from the task-private registered-ring table when
 * IORING_REGISTER_USE_REGISTERED_RING is set in opcode), then run the
 * requested registration op under uring_lock.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	/* the flag lives in the high bits of opcode; strip it before range check */
	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	/* only drop the ref we took with fget(); registered rings hold their own */
	if (!use_registered_ring)
		fput(file);
	return ret;
}
static int __init io_uring_init(void) static int __init io_uring_init(void)
{ {
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \

View file

@ -15,16 +15,6 @@
#include <trace/events/io_uring.h> #include <trace/events/io_uring.h>
#endif #endif
enum {
/*
* A hint to not wake right away but delay until there are enough of
* tw's queued to match the number of CQEs the task is waiting for.
*
* Must not be used wirh requests generating more than one CQE.
* It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
*/
IOU_F_TWQ_LAZY_WAKE = 1,
};
enum { enum {
IOU_OK = 0, IOU_OK = 0,
@ -54,7 +44,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags); unsigned issue_flags);
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req); bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
@ -89,6 +78,14 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
void *io_mem_alloc(size_t size); void *io_mem_alloc(size_t size);
void io_mem_free(void *ptr); void io_mem_free(void *ptr);
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
IO_EVENTFD_OP_FREE_BIT,
};
void io_eventfd_ops(struct rcu_head *rcu);
void io_activate_pollwq(struct io_ring_ctx *ctx);
#if defined(CONFIG_PROVE_LOCKING) #if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{ {

View file

@ -750,6 +750,32 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0; return 0;
} }
/*
 * IORING_REGISTER_PBUF_STATUS: report the kernel's current head for a
 * provided buffer ring, letting the app tell how many ring buffers have
 * been consumed.
 */
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_status buf_status;
	struct io_buffer_list *bl;
	int i;

	if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
		return -EFAULT;

	/* reserved fields must be zeroed, for future extensibility */
	for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
		if (buf_status.resv[i])
			return -EINVAL;

	bl = io_buffer_get_list(ctx, buf_status.buf_group);
	if (!bl)
		return -ENOENT;
	/* only valid for ring-mapped provided buffer groups */
	if (!bl->is_mapped)
		return -EINVAL;

	buf_status.head = bl->head;
	if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
		return -EFAULT;

	return 0;
}
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{ {
struct io_buffer_list *bl; struct io_buffer_list *bl;

View file

@ -53,6 +53,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);

View file

@ -469,6 +469,12 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep, .prep = io_eopnotsupp_prep,
#endif #endif
}, },
[IORING_OP_FIXED_FD_INSTALL] = {
.needs_file = 1,
.audit_skip = 1,
.prep = io_install_fixed_fd_prep,
.issue = io_install_fixed_fd,
},
}; };
const struct io_cold_def io_cold_defs[] = { const struct io_cold_def io_cold_defs[] = {
@ -704,6 +710,9 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_FUTEX_WAITV] = { [IORING_OP_FUTEX_WAITV] = {
.name = "FUTEX_WAITV", .name = "FUTEX_WAITV",
}, },
[IORING_OP_FIXED_FD_INSTALL] = {
.name = "FIXED_FD_INSTALL",
},
}; };
const char *io_uring_get_opcode(u8 opcode) const char *io_uring_get_opcode(u8 opcode)

View file

@ -31,6 +31,11 @@ struct io_close {
u32 file_slot; u32 file_slot;
}; };
struct io_fixed_install {
struct file *file;
unsigned int o_flags;
};
static bool io_openat_force_async(struct io_open *open) static bool io_openat_force_async(struct io_open *open)
{ {
/* /*
@ -254,3 +259,42 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags)
io_req_set_res(req, ret, 0); io_req_set_res(req, ret, 0);
return IOU_OK; return IOU_OK;
} }
/*
 * Prep for IORING_OP_FIXED_FD_INSTALL: validate the SQE and stash the
 * O_* flags to use when installing the fixed file into the normal
 * file table at issue time.
 */
int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_fixed_install *ifi;
	unsigned int flags;

	/* unused SQE fields must be zero */
	if (sqe->off || sqe->addr || sqe->len || sqe->buf_index ||
	    sqe->splice_fd_in || sqe->addr3)
		return -EINVAL;

	/* must be a fixed file */
	if (!(req->flags & REQ_F_FIXED_FILE))
		return -EBADF;

	flags = READ_ONCE(sqe->install_fd_flags);
	if (flags & ~IORING_FIXED_FD_NO_CLOEXEC)
		return -EINVAL;

	/* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */
	ifi = io_kiocb_to_cmd(req, struct io_fixed_install);
	ifi->o_flags = O_CLOEXEC;
	if (flags & IORING_FIXED_FD_NO_CLOEXEC)
		ifi->o_flags = 0;

	return 0;
}
/*
 * Issue for IORING_OP_FIXED_FD_INSTALL: hand the fixed file to the
 * task's regular file table via receive_fd(). The CQE res is the new
 * fd on success, or a negative errno.
 */
int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_fixed_install *ifi;
	int ret;

	ifi = io_kiocb_to_cmd(req, struct io_fixed_install);
	ret = receive_fd(req->file, NULL, ifi->o_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

View file

@ -12,3 +12,6 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_close(struct io_kiocb *req, unsigned int issue_flags); int io_close(struct io_kiocb *req, unsigned int issue_flags);
int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags);

605
io_uring/register.c Normal file
View file

@ -0,0 +1,605 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Code related to the io_uring_register() syscall
*
* Copyright (C) 2023 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
/*
 * IORING_REGISTER_EVENTFD[_ASYNC]: attach an eventfd that is signalled
 * on CQE completions. @eventfd_async is stored on the context; presumably
 * it limits signalling to async completions -- the actual signalling
 * policy is applied at completion time, elsewhere.
 */
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	/* only one eventfd can be registered per ring at a time */
	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);

		kfree(ev_fd);
		return ret;
	}

	/*
	 * Record the current CQ tail under completion_lock -- presumably so
	 * that already-posted CQEs don't cause a signal; confirm against the
	 * eventfd signalling path.
	 */
	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	/* publish for RCU readers; uring_lock serializes writers */
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}
/*
 * IORING_UNREGISTER_EVENTFD: detach the registered eventfd, if any.
 * The io_ev_fd itself is freed via RCU (io_eventfd_ops), and only if the
 * FREE bit wasn't already set by a concurrent op. Returns -ENXIO when
 * no eventfd is registered.
 */
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		/* schedule the free unless someone already claimed it */
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}
/*
 * IORING_REGISTER_PROBE: fill in a struct io_uring_probe telling the app
 * which opcodes this kernel supports, for up to @nr_args ops entries.
 * The user must pass in a zeroed structure.
 */
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	/* struct_size() returns SIZE_MAX on overflow */
	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	/* reject input that isn't fully zeroed */
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}
/*
 * IORING_UNREGISTER_PERSONALITY: drop the credentials registered under
 * @id. Returns 0 on success, -EINVAL if no such personality exists.
 */
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *old = xa_erase(&ctx->personalities, id);

	if (!old)
		return -EINVAL;

	put_cred(old);
	return 0;
}
/*
 * IORING_REGISTER_PERSONALITY: snapshot the caller's credentials and
 * store them in the ctx personalities xarray. Returns the allocated id
 * (>= 0) on success, or a negative errno; drops the cred ref on failure.
 */
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds = get_current_cred();
	int ret;
	u32 id;

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			      XA_LIMIT(0, USHRT_MAX), &ctx->pers_next,
			      GFP_KERNEL);
	if (ret >= 0)
		return id;

	put_cred(creds);
	return ret;
}
/*
 * IORING_REGISTER_RESTRICTIONS: install a one-shot set of restrictions
 * (allowed register opcodes, allowed SQE opcodes, allowed/required SQE
 * flags). Only permitted while the ring is still disabled
 * (IORING_SETUP_R_DISABLED), and only once per ring. On any parse error
 * the whole restriction state is wiped.
 */
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	/* array_size() returns SIZE_MAX on multiplication overflow */
	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}
			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}
			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}
/*
 * IORING_REGISTER_ENABLE_RINGS: enable a ring that was created with
 * IORING_SETUP_R_DISABLED. Also arms any registered restrictions and,
 * for SINGLE_ISSUER rings, pins the current task as the only submitter.
 */
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		/* WRITE_ONCE pairs with lockless readers of submitter_task */
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	/* kick a waiting SQPOLL thread so it starts processing */
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
/*
 * Apply @new_mask as the io-wq CPU affinity for this ring: either the
 * current task's io-wq, or the SQPOLL thread's. The SQPOLL helper is
 * invoked with uring_lock dropped and retaken around the call.
 */
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					cpumask_var_t new_mask)
{
	int ret;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	} else {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	}

	return ret;
}
/*
 * IORING_REGISTER_IOWQ_AFF: copy a CPU affinity bitmap of @len bytes
 * from userspace (handling compat bitmap layout) and apply it to the
 * ring's io-wq workers.
 */
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	/* never copy more than the kernel-side mask can hold */
	if (len > cpumask_size())
		len = cpumask_size();

	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		ret = -EFAULT;
		goto out_free;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
out_free:
	free_cpumask_var(new_mask);
	return ret;
}
/*
 * IORING_UNREGISTER_IOWQ_AFF: pass a NULL mask -- presumably this
 * restores the default (unrestricted) affinity; confirm against
 * io_wq_cpu_affinity()/io_sqpoll_wq_cpu_affinity().
 */
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: read two __u32 worker limits
 * (bounded, unbounded) from userspace, apply them to the relevant io-wq
 * context(s), and copy the previous limits back to userspace. A zero
 * entry leaves that limit unchanged. Called with uring_lock held.
 */
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	/* limits are handled as int internally; reject values that don't fit */
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	/* for SQPOLL, the limits apply to the SQPOLL thread's io_uring ctx */
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	/* record non-zero limits on the ctx so later-attaching tasks see them */
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		/*
		 * io_wq_max_workers() appears to swap in the new limits and
		 * return the previous ones via new_count (copied back below)
		 * -- see its definition to confirm.
		 */
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		/* no io-wq yet: report zeroes as the "old" limits */
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}
/*
 * Dispatch a single io_uring_register(2) opcode. Called with uring_lock
 * held; some handlers (e.g. the iowq ones) temporarily drop and retake it,
 * hence the __releases/__acquires annotations. Per-opcode arg/nr_args
 * validation is done inline before calling the handler.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	/* SINGLE_ISSUER-style restriction: only the submitter may register */
	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		/* array_index_nospec() clamps opcode against speculation */
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		/* same as above, but flags the eventfd as async-only */
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		/* nr_args doubles as the personality id here */
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		/* nr_args is the byte length of the user cpumask */
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		/* report the kernel head of a provided buffer ring */
		ret = io_register_pbuf_status(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
/*
 * io_uring_register(2) entry point: resolve the ring file (either via a
 * normal fd lookup, or from the task-private registered-ring table when
 * IORING_REGISTER_USE_REGISTERED_RING is set in opcode), then run the
 * requested registration op under uring_lock.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	/* the flag lives in the high bits of opcode; strip it before range check */
	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	/* only drop the ref we took with fget(); registered rings hold their own */
	if (!use_registered_ring)
		fput(file);
	return ret;
}

8
io_uring/register.h Normal file
View file

@ -0,0 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef IORING_REGISTER_H
#define IORING_REGISTER_H
int io_eventfd_unregister(struct io_ring_ctx *ctx);
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
#endif

View file

@ -24,7 +24,6 @@ struct io_rsrc_update {
}; };
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu, struct io_mapped_ubuf **pimu,
struct page **last_hpage); struct page **last_hpage);
@ -157,7 +156,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node)
switch (node->type) { switch (node->type) {
case IORING_RSRC_FILE: case IORING_RSRC_FILE:
io_rsrc_file_put(node->ctx, prsrc); fput(prsrc->file);
break; break;
case IORING_RSRC_BUFFER: case IORING_RSRC_BUFFER:
io_rsrc_buf_put(node->ctx, prsrc); io_rsrc_buf_put(node->ctx, prsrc);
@ -402,23 +401,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
break; break;
} }
/* /*
* Don't allow io_uring instances to be registered. If * Don't allow io_uring instances to be registered.
* UNIX isn't enabled, then this causes a reference
* cycle and this instance can never get freed. If UNIX
* is enabled we'll handle it just fine, but there's
* still no point in allowing a ring fd as it doesn't
* support regular read/write anyway.
*/ */
if (io_is_uring_fops(file)) { if (io_is_uring_fops(file)) {
fput(file); fput(file);
err = -EBADF; err = -EBADF;
break; break;
} }
err = io_scm_file_account(ctx, file);
if (err) {
fput(file);
break;
}
*io_get_tag_slot(data, i) = tag; *io_get_tag_slot(data, i) = tag;
io_fixed_file_set(file_slot, file); io_fixed_file_set(file_slot, file);
io_file_bitmap_set(&ctx->file_table, i); io_file_bitmap_set(&ctx->file_table, i);
@ -675,22 +664,12 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
for (i = 0; i < ctx->nr_user_files; i++) { for (i = 0; i < ctx->nr_user_files; i++) {
struct file *file = io_file_from_index(&ctx->file_table, i); struct file *file = io_file_from_index(&ctx->file_table, i);
/* skip scm accounted files, they'll be freed by ->ring_sock */ if (!file)
if (!file || io_file_need_scm(file))
continue; continue;
io_file_bitmap_clear(&ctx->file_table, i); io_file_bitmap_clear(&ctx->file_table, i);
fput(file); fput(file);
} }
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff *skb;
while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
kfree_skb(skb);
}
#endif
io_free_file_tables(&ctx->file_table); io_free_file_tables(&ctx->file_table);
io_file_table_set_alloc_range(ctx, 0, 0); io_file_table_set_alloc_range(ctx, 0, 0);
io_rsrc_data_free(ctx->file_data); io_rsrc_data_free(ctx->file_data);
@ -718,137 +697,6 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx)
return ret; return ret;
} }
/*
* Ensure the UNIX gc is aware of our file set, so we are certain that
* the io_uring can be safely unregistered on process exit, even if we have
* loops in the file referencing. We account only files that can hold other
* files because otherwise they can't form a loop and so are not interesting
* for GC.
*/
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
struct sock *sk = ctx->ring_sock->sk;
struct sk_buff_head *head = &sk->sk_receive_queue;
struct scm_fp_list *fpl;
struct sk_buff *skb;
if (likely(!io_file_need_scm(file)))
return 0;
/*
* See if we can merge this file into an existing skb SCM_RIGHTS
* file set. If there's no room, fall back to allocating a new skb
* and filling it in.
*/
spin_lock_irq(&head->lock);
skb = skb_peek(head);
if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
__skb_unlink(skb, head);
else
skb = NULL;
spin_unlock_irq(&head->lock);
if (!skb) {
fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
if (!fpl)
return -ENOMEM;
skb = alloc_skb(0, GFP_KERNEL);
if (!skb) {
kfree(fpl);
return -ENOMEM;
}
fpl->user = get_uid(current_user());
fpl->max = SCM_MAX_FD;
fpl->count = 0;
UNIXCB(skb).fp = fpl;
skb->sk = sk;
skb->destructor = io_uring_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
fpl = UNIXCB(skb).fp;
fpl->fp[fpl->count++] = get_file(file);
unix_inflight(fpl->user, file);
skb_queue_head(head, skb);
fput(file);
#endif
return 0;
}
static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
struct sk_buff *skb;
int i;
__skb_queue_head_init(&list);
/*
* Find the skb that holds this file in its SCM_RIGHTS. When found,
* remove this entry and rearrange the file array.
*/
skb = skb_dequeue(head);
while (skb) {
struct scm_fp_list *fp;
fp = UNIXCB(skb).fp;
for (i = 0; i < fp->count; i++) {
int left;
if (fp->fp[i] != file)
continue;
unix_notinflight(fp->user, fp->fp[i]);
left = fp->count - 1 - i;
if (left) {
memmove(&fp->fp[i], &fp->fp[i + 1],
left * sizeof(struct file *));
}
fp->count--;
if (!fp->count) {
kfree_skb(skb);
skb = NULL;
} else {
__skb_queue_tail(&list, skb);
}
fput(file);
file = NULL;
break;
}
if (!file)
break;
__skb_queue_tail(&list, skb);
skb = skb_dequeue(head);
}
if (skb_peek(&list)) {
spin_lock_irq(&head->lock);
while ((skb = __skb_dequeue(&list)) != NULL)
__skb_queue_tail(head, skb);
spin_unlock_irq(&head->lock);
}
#endif
}
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
struct file *file = prsrc->file;
if (likely(!io_file_need_scm(file)))
fput(file);
else
io_rsrc_file_scm_put(ctx, file);
}
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags) unsigned nr_args, u64 __user *tags)
{ {
@ -897,21 +745,12 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
goto fail; goto fail;
/* /*
* Don't allow io_uring instances to be registered. If UNIX * Don't allow io_uring instances to be registered.
* isn't enabled, then this causes a reference cycle and this
* instance can never get freed. If UNIX is enabled we'll
* handle it just fine, but there's still no point in allowing
* a ring fd as it doesn't support regular read/write anyway.
*/ */
if (io_is_uring_fops(file)) { if (io_is_uring_fops(file)) {
fput(file); fput(file);
goto fail; goto fail;
} }
ret = io_scm_file_account(ctx, file);
if (ret) {
fput(file);
goto fail;
}
file_slot = io_fixed_file_slot(&ctx->file_table, i); file_slot = io_fixed_file_slot(&ctx->file_table, i);
io_fixed_file_set(file_slot, file); io_fixed_file_set(file_slot, file);
io_file_bitmap_set(&ctx->file_table, i); io_file_bitmap_set(&ctx->file_table, i);

View file

@ -75,21 +75,6 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx);
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags); unsigned nr_args, u64 __user *tags);
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file);
static inline bool io_file_need_scm(struct file *filp)
{
return false;
}
static inline int io_scm_file_account(struct io_ring_ctx *ctx,
struct file *file)
{
if (likely(!io_file_need_scm(file)))
return 0;
return __io_scm_file_account(ctx, file);
}
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args); unsigned nr_args);
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,

View file

@ -10,7 +10,7 @@
#include <linux/poll.h> #include <linux/poll.h>
#include <linux/nospec.h> #include <linux/nospec.h>
#include <linux/compat.h> #include <linux/compat.h>
#include <linux/io_uring.h> #include <linux/io_uring/cmd.h>
#include <uapi/linux/io_uring.h> #include <uapi/linux/io_uring.h>
@ -589,15 +589,19 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
struct iovec *iov; struct iovec *iov;
int ret; int ret;
iorw->bytes_done = 0;
iorw->free_iovec = NULL;
/* submission path, ->uring_lock should already be taken */ /* submission path, ->uring_lock should already be taken */
ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
if (unlikely(ret < 0)) if (unlikely(ret < 0))
return ret; return ret;
iorw->bytes_done = 0; if (iov) {
iorw->free_iovec = iov; iorw->free_iovec = iov;
if (iov)
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
}
return 0; return 0;
} }

View file

@ -2,7 +2,7 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/io_uring.h> #include <linux/io_uring/cmd.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/nospec.h> #include <linux/nospec.h>
@ -52,12 +52,6 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
} }
EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
{
return cmd_to_io_kiocb(cmd)->task;
}
EXPORT_SYMBOL_GPL(io_uring_cmd_get_task);
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{ {
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@ -78,13 +72,6 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
} }
EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task);
void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy);
static inline void io_req_set_cqe32_extra(struct io_kiocb *req, static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
u64 extra1, u64 extra2) u64 extra1, u64 extra2)
{ {

View file

@ -105,7 +105,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
if (fd < 0 || !(file = fget_raw(fd))) if (fd < 0 || !(file = fget_raw(fd)))
return -EBADF; return -EBADF;
/* don't allow io_uring files */ /* don't allow io_uring files */
if (io_uring_get_socket(file)) { if (io_is_uring_fops(file)) {
fput(file); fput(file);
return -EINVAL; return -EINVAL;
} }

View file

@ -35,10 +35,8 @@ struct sock *unix_get_socket(struct file *filp)
/* PF_UNIX ? */ /* PF_UNIX ? */
if (s && ops && ops->family == PF_UNIX) if (s && ops && ops->family == PF_UNIX)
u_sock = s; u_sock = s;
} else {
/* Could be an io_uring instance */
u_sock = io_uring_get_socket(filp);
} }
return u_sock; return u_sock;
} }
EXPORT_SYMBOL(unix_get_socket); EXPORT_SYMBOL(unix_get_socket);

View file

@ -92,7 +92,7 @@
#include <uapi/linux/mount.h> #include <uapi/linux/mount.h>
#include <linux/fsnotify.h> #include <linux/fsnotify.h>
#include <linux/fanotify.h> #include <linux/fanotify.h>
#include <linux/io_uring.h> #include <linux/io_uring/cmd.h>
#include <uapi/linux/lsm.h> #include <uapi/linux/lsm.h>
#include "avc.h" #include "avc.h"

View file

@ -43,7 +43,7 @@
#include <linux/fs_context.h> #include <linux/fs_context.h>
#include <linux/fs_parser.h> #include <linux/fs_parser.h>
#include <linux/watch_queue.h> #include <linux/watch_queue.h>
#include <linux/io_uring.h> #include <linux/io_uring/cmd.h>
#include <uapi/linux/lsm.h> #include <uapi/linux/lsm.h>
#include "smack.h" #include "smack.h"