kernel-hacking-2024-linux-s.../lib/closure.c

298 lines
6.8 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Asynchronous refcounty things
*
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright 2012 Google, Inc.
*/
#include <linux/closure.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
#include <linux/sched/debug.h>
static inline void closure_put_after_sub_checks(int flags)
{
int r = flags & CLOSURE_REMAINING_MASK;
closures: Change BUG_ON() to WARN_ON() If a BUG_ON() can be hit in the wild, it shouldn't be a BUG_ON() For reference, this has popped up once in the CI, and we'll need more info to debug it: 03240 ------------[ cut here ]------------ 03240 kernel BUG at lib/closure.c:21! 03240 kernel BUG at lib/closure.c:21! 03240 Internal error: Oops - BUG: 00000000f2000800 [#1] SMP 03240 Modules linked in: 03240 CPU: 15 PID: 40534 Comm: kworker/u80:1 Not tainted 6.10.0-rc4-ktest-ga56da69799bd #25570 03240 Hardware name: linux,dummy-virt (DT) 03240 Workqueue: btree_update btree_interior_update_work 03240 pstate: 00001005 (nzcv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 03240 pc : closure_put+0x224/0x2a0 03240 lr : closure_put+0x24/0x2a0 03240 sp : ffff0000d12071c0 03240 x29: ffff0000d12071c0 x28: dfff800000000000 x27: ffff0000d1207360 03240 x26: 0000000000000040 x25: 0000000000000040 x24: 0000000000000040 03240 x23: ffff0000c1f20180 x22: 0000000000000000 x21: ffff0000c1f20168 03240 x20: 0000000040000000 x19: ffff0000c1f20140 x18: 0000000000000001 03240 x17: 0000000000003aa0 x16: 0000000000003ad0 x15: 1fffe0001c326974 03240 x14: 0000000000000a1e x13: 0000000000000000 x12: 1fffe000183e402d 03240 x11: ffff6000183e402d x10: dfff800000000000 x9 : ffff6000183e402e 03240 x8 : 0000000000000001 x7 : 00009fffe7c1bfd3 x6 : ffff0000c1f2016b 03240 x5 : ffff0000c1f20168 x4 : ffff6000183e402e x3 : ffff800081391954 03240 x2 : 0000000000000001 x1 : 0000000000000000 x0 : 00000000a8000000 03240 Call trace: 03240 closure_put+0x224/0x2a0 03240 bch2_check_for_deadlock+0x910/0x1028 03240 bch2_six_check_for_deadlock+0x1c/0x30 03240 six_lock_slowpath.isra.0+0x29c/0xed0 03240 six_lock_ip_waiter+0xa8/0xf8 03240 __bch2_btree_node_lock_write+0x14c/0x298 03240 bch2_trans_lock_write+0x6d4/0xb10 03240 __bch2_trans_commit+0x135c/0x5520 03240 btree_interior_update_work+0x1248/0x1c10 03240 process_scheduled_works+0x53c/0xd90 03240 worker_thread+0x370/0x8c8 03240 kthread+0x258/0x2e8 03240 ret_from_fork+0x10/0x20 
03240 Code: aa1303e0 d63f0020 a94363f7 17ffff8c (d4210000) 03240 ---[ end trace 0000000000000000 ]--- 03240 Kernel panic - not syncing: Oops - BUG: Fatal exception 03240 SMP: stopping secondary CPUs 03241 SMP: failed to stop secondary CPUs 13,15 03241 Kernel Offset: disabled 03241 CPU features: 0x00,00000003,80000008,4240500b 03241 Memory Limit: none 03241 ---[ end Kernel panic - not syncing: Oops - BUG: Fatal exception ]--- 03246 ========= FAILED TIMEOUT copygc_torture_no_checksum in 7200s Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-06-20 13:45:09 +00:00
if (WARN(flags & CLOSURE_GUARD_MASK,
"closure has guard bits set: %x (%u)",
flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r)))
r &= ~CLOSURE_GUARD_MASK;
WARN(!r && (flags & ~CLOSURE_DESTRUCTOR),
"closure ref hit 0 with incorrect flags set: %x (%u)",
flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags));
}
static inline void closure_put_after_sub(struct closure *cl, int flags)
{
closure_put_after_sub_checks(flags);
if (!(flags & CLOSURE_REMAINING_MASK)) {
smp_acquire__after_ctrl_dep();
closures: Change BUG_ON() to WARN_ON() If a BUG_ON() can be hit in the wild, it shouldn't be a BUG_ON() For reference, this has popped up once in the CI, and we'll need more info to debug it: 03240 ------------[ cut here ]------------ 03240 kernel BUG at lib/closure.c:21! 03240 kernel BUG at lib/closure.c:21! 03240 Internal error: Oops - BUG: 00000000f2000800 [#1] SMP 03240 Modules linked in: 03240 CPU: 15 PID: 40534 Comm: kworker/u80:1 Not tainted 6.10.0-rc4-ktest-ga56da69799bd #25570 03240 Hardware name: linux,dummy-virt (DT) 03240 Workqueue: btree_update btree_interior_update_work 03240 pstate: 00001005 (nzcv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 03240 pc : closure_put+0x224/0x2a0 03240 lr : closure_put+0x24/0x2a0 03240 sp : ffff0000d12071c0 03240 x29: ffff0000d12071c0 x28: dfff800000000000 x27: ffff0000d1207360 03240 x26: 0000000000000040 x25: 0000000000000040 x24: 0000000000000040 03240 x23: ffff0000c1f20180 x22: 0000000000000000 x21: ffff0000c1f20168 03240 x20: 0000000040000000 x19: ffff0000c1f20140 x18: 0000000000000001 03240 x17: 0000000000003aa0 x16: 0000000000003ad0 x15: 1fffe0001c326974 03240 x14: 0000000000000a1e x13: 0000000000000000 x12: 1fffe000183e402d 03240 x11: ffff6000183e402d x10: dfff800000000000 x9 : ffff6000183e402e 03240 x8 : 0000000000000001 x7 : 00009fffe7c1bfd3 x6 : ffff0000c1f2016b 03240 x5 : ffff0000c1f20168 x4 : ffff6000183e402e x3 : ffff800081391954 03240 x2 : 0000000000000001 x1 : 0000000000000000 x0 : 00000000a8000000 03240 Call trace: 03240 closure_put+0x224/0x2a0 03240 bch2_check_for_deadlock+0x910/0x1028 03240 bch2_six_check_for_deadlock+0x1c/0x30 03240 six_lock_slowpath.isra.0+0x29c/0xed0 03240 six_lock_ip_waiter+0xa8/0xf8 03240 __bch2_btree_node_lock_write+0x14c/0x298 03240 bch2_trans_lock_write+0x6d4/0xb10 03240 __bch2_trans_commit+0x135c/0x5520 03240 btree_interior_update_work+0x1248/0x1c10 03240 process_scheduled_works+0x53c/0xd90 03240 worker_thread+0x370/0x8c8 03240 kthread+0x258/0x2e8 03240 ret_from_fork+0x10/0x20 
03240 Code: aa1303e0 d63f0020 a94363f7 17ffff8c (d4210000) 03240 ---[ end trace 0000000000000000 ]--- 03240 Kernel panic - not syncing: Oops - BUG: Fatal exception 03240 SMP: stopping secondary CPUs 03241 SMP: failed to stop secondary CPUs 13,15 03241 Kernel Offset: disabled 03241 CPU features: 0x00,00000003,80000008,4240500b 03241 Memory Limit: none 03241 ---[ end Kernel panic - not syncing: Oops - BUG: Fatal exception ]--- 03246 ========= FAILED TIMEOUT copygc_torture_no_checksum in 7200s Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-06-20 13:45:09 +00:00
cl->closure_get_happened = false;
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
atomic_set(&cl->remaining,
CLOSURE_REMAINING_INITIALIZER);
closure_queue(cl);
} else {
struct closure *parent = cl->parent;
closure_fn *destructor = cl->fn;
closure_debug_destroy(cl);
if (destructor)
destructor(&cl->work);
if (parent)
closure_put(parent);
}
}
}
/*
 * closure_sub - subtract @v from a closure's ->remaining and handle the result
 * @cl: closure to operate on
 * @v:  amount to subtract (with release ordering)
 *
 * For clearing flags with the same atomic op as a put.
 */
void closure_sub(struct closure *cl, int v)
{
	int flags = atomic_sub_return_release(v, &cl->remaining);

	closure_put_after_sub(cl, flags);
}
EXPORT_SYMBOL(closure_sub);
/*
 * closure_put - decrement a closure's refcount
 * @cl: closure to drop a reference on
 *
 * The decrement uses release ordering; the resulting state word is handed to
 * closure_put_after_sub(), which acts when the count reaches zero.
 */
void closure_put(struct closure *cl)
{
	int flags = atomic_dec_return_release(&cl->remaining);

	closure_put_after_sub(cl, flags);
}
EXPORT_SYMBOL(closure_put);
/*
* closure_wake_up - wake up all closures on a wait list, without memory barrier
*/
void __closure_wake_up(struct closure_waitlist *wait_list)
{
struct llist_node *list;
bcache: use llist_for_each_entry_safe() in __closure_wake_up() Commit 09b3efec ("bcache: Don't reinvent the wheel but use existing llist API") replaces the following while loop by llist_for_each_entry(), - - while (reverse) { - cl = container_of(reverse, struct closure, list); - reverse = llist_next(reverse); - + llist_for_each_entry(cl, reverse, list) { closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } This modification introduces a potential race by iterating a corrupted list. Here is how it happens. In the above modification, closure_sub() may wake up a process which is waiting on reverse list. If this process decides to wait again by calling closure_wait(), its cl->list will be added to another wait list. Then when llist_for_each_entry() continues to iterate next node, it will travel on another new wait list which is added in closure_wait(), not the original reverse list in __closure_wake_up(). It is more probably to happen on UP machine because the waked up process may preempt the process which wakes up it. Use llist_for_each_entry_safe() will fix the issue, the safe version fetch next node before waking up a process. Then the copy of next node will make sure list iteration stays on original reverse list. Fixes: 09b3efec81de ("bcache: Don't reinvent the wheel but use existing llist API") Signed-off-by: Coly Li <colyli@suse.de> Reported-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Byungchul Park <byungchul.park@lge.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-09-26 09:54:12 +00:00
struct closure *cl, *t;
struct llist_node *reverse = NULL;
list = llist_del_all(&wait_list->list);
/* We first reverse the list to preserve FIFO ordering and fairness */
reverse = llist_reverse_order(list);
/* Then do the wakeups */
bcache: use llist_for_each_entry_safe() in __closure_wake_up() Commit 09b3efec ("bcache: Don't reinvent the wheel but use existing llist API") replaces the following while loop by llist_for_each_entry(), - - while (reverse) { - cl = container_of(reverse, struct closure, list); - reverse = llist_next(reverse); - + llist_for_each_entry(cl, reverse, list) { closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } This modification introduces a potential race by iterating a corrupted list. Here is how it happens. In the above modification, closure_sub() may wake up a process which is waiting on reverse list. If this process decides to wait again by calling closure_wait(), its cl->list will be added to another wait list. Then when llist_for_each_entry() continues to iterate next node, it will travel on another new wait list which is added in closure_wait(), not the original reverse list in __closure_wake_up(). It is more probably to happen on UP machine because the waked up process may preempt the process which wakes up it. Use llist_for_each_entry_safe() will fix the issue, the safe version fetch next node before waking up a process. Then the copy of next node will make sure list iteration stays on original reverse list. Fixes: 09b3efec81de ("bcache: Don't reinvent the wheel but use existing llist API") Signed-off-by: Coly Li <colyli@suse.de> Reported-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Byungchul Park <byungchul.park@lge.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-09-26 09:54:12 +00:00
llist_for_each_entry_safe(cl, t, reverse, list) {
closure_set_waiting(cl, 0);
closure_sub(cl, CLOSURE_WAITING + 1);
}
}
EXPORT_SYMBOL(__closure_wake_up);
/**
 * closure_wait - add a closure to a waitlist
 * @waitlist: will own a ref on @cl, which will be released when
 * closure_wake_up() is called on @waitlist.
 * @cl: closure pointer.
 *
 * Return: false if @cl already has CLOSURE_WAITING set (nothing is changed),
 * true once @cl has been added to @waitlist.
 */
bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
{
	bool already_waiting = atomic_read(&cl->remaining) & CLOSURE_WAITING;

	if (!already_waiting) {
		cl->closure_get_happened = true;
		closure_set_waiting(cl, _RET_IP_);
		/* take the waitlist's ref and set the flag in a single atomic op */
		atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
		llist_add(&cl->list, &waitlist->list);
	}

	return !already_waiting;
}
EXPORT_SYMBOL(closure_wait);
/*
* Hand-off state shared between a task sleeping in one of the sync helpers
* in this file and closure_sync_fn(), which wakes it.
*/
struct closure_syncer {
/* task to wake; the sync helpers initialise this to current */
struct task_struct *task;
/* set to 1 by closure_sync_fn(); the waiter loops until it is non-zero */
int done;
};
/*
* closure_sync_fn - closure callback that completes a synchronous wait
*
* Looks up the closure_syncer stored in cl->s, marks it done and wakes the
* task recorded in it.
*/
static CLOSURE_CALLBACK(closure_sync_fn)
{
/* ws is not declared here; presumably supplied by CLOSURE_CALLBACK() - confirm */
struct closure *cl = container_of(ws, struct closure, work);
struct closure_syncer *s = cl->s;
struct task_struct *p;
/*
* NOTE(review): the syncer lives on the waiter's stack (see the sync
* helpers), so it presumably must not be touched once done is set. The
* task pointer is therefore read first, and the wakeup happens inside the
* RCU read-side section - likely to keep the task_struct valid. Confirm.
*/
rcu_read_lock();
p = READ_ONCE(s->task);
s->done = 1;
wake_up_process(p);
rcu_read_unlock();
}
/*
* __closure_sync - sleep until the closure's continuation has run
* @cl: closure to wait on
*
* Installs closure_sync_fn() as the continuation with continue_at() and then
* sleeps in TASK_UNINTERRUPTIBLE until that callback sets s.done.
*/
void __sched __closure_sync(struct closure *cl)
{
/* s lives in this stack frame; closure_sync_fn() reaches it through cl->s */
struct closure_syncer s = { .task = current };
cl->s = &s;
continue_at(cl, closure_sync_fn, NULL);
while (1) {
/* set the task state before testing s.done; sleep only if not yet done */
set_current_state(TASK_UNINTERRUPTIBLE);
if (s.done)
break;
schedule();
}
__set_current_state(TASK_RUNNING);
}
EXPORT_SYMBOL(__closure_sync);
/*
* closure_return_sync - finish running a closure, synchronously (i.e. waiting
* for outstanding get()s to finish) and returning once closure refcount is 0.
*
* Unlike closure_sync() this doesn't reinit the ref to 1; subsequent
* closure_get_not_zero() calls will fail.
*/
void __sched closure_return_sync(struct closure *cl)
{
struct closure_syncer s = { .task = current };
cl->s = &s;
set_closure_fn(cl, closure_sync_fn, NULL);
/*
* One atomic op that subtracts 1 and CLOSURE_RUNNING and adds
* CLOSURE_DESTRUCTOR; presumably this drops our ref, clears the running
* flag and marks the closure for teardown - confirm against the flag
* definitions.
*/
unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR,
&cl->remaining);
closure_put_after_sub_checks(flags);
/* refs still outstanding: sleep until closure_sync_fn() sets s.done */
if (unlikely(flags & CLOSURE_REMAINING_MASK)) {
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (s.done)
break;
schedule();
}
__set_current_state(TASK_RUNNING);
}
/*
* NOTE(review): when the wait path above is taken, the final put made
* elsewhere appears to go through closure_put_after_sub()'s teardown
* branch, which also puts the parent - verify the parent ref is not
* dropped twice for closures that have a parent.
*/
if (cl->parent)
closure_put(cl->parent);
}
EXPORT_SYMBOL(closure_return_sync);
/*
* __closure_sync_timeout - like __closure_sync(), but give up after a timeout
* @cl: closure to wait on
* @timeout: passed to schedule_timeout() (presumably jiffies - confirm)
*
* Return: 0, or -ETIME once the continue_at() has been successfully undone
* after the timeout ran out.
*/
int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout)
{
struct closure_syncer s = { .task = current };
int ret = 0;
cl->s = &s;
continue_at(cl, closure_sync_fn, NULL);
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (s.done)
break;
if (!timeout) {
/*
* Carefully undo the continue_at() - but only if it
* hasn't completed, i.e. the final closure_put() hasn't
* happened yet:
*/
unsigned old, new, v = atomic_read(&cl->remaining);
do {
old = v;
/* already completed, or already marked running again: nothing to undo */
if (!old || (old & CLOSURE_RUNNING))
goto success;
new = old + CLOSURE_REMAINING_INITIALIZER;
} while ((v = atomic_cmpxchg(&cl->remaining, old, new)) != old);
/*
* NOTE(review): there is no break here; the loop runs again and
* presumably leaves through the CLOSURE_RUNNING check above on the
* next pass, returning this -ETIME - confirm.
*/
ret = -ETIME;
}
/* the value returned by schedule_timeout() becomes the new timeout */
timeout = schedule_timeout(timeout);
}
success:
__set_current_state(TASK_RUNNING);
return ret;
}
EXPORT_SYMBOL(__closure_sync_timeout);
#ifdef CONFIG_DEBUG_CLOSURES
/* List of closures registered by closure_debug_create(), linked through ->all */
static LIST_HEAD(closure_list);
/* Protects closure_list; taken with interrupts disabled by all users below */
static DEFINE_SPINLOCK(closure_list_lock);
/*
 * closure_debug_create - register a closure on the global debug list
 * @cl: closure being created
 *
 * BUGs if @cl is already marked CLOSURE_MAGIC_ALIVE, then marks it alive and
 * links it onto closure_list under closure_list_lock.
 */
void closure_debug_create(struct closure *cl)
{
	unsigned long irqflags;

	BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
	cl->magic = CLOSURE_MAGIC_ALIVE;

	spin_lock_irqsave(&closure_list_lock, irqflags);
	list_add(&cl->all, &closure_list);
	spin_unlock_irqrestore(&closure_list_lock, irqflags);
}
EXPORT_SYMBOL(closure_debug_create);
/*
 * closure_debug_destroy - remove a closure from the global debug list
 * @cl: closure being destroyed
 *
 * Closures marked CLOSURE_MAGIC_STACK are ignored.  Any other closure must be
 * CLOSURE_MAGIC_ALIVE (BUG otherwise); it is marked CLOSURE_MAGIC_DEAD and
 * unlinked from closure_list under closure_list_lock.
 */
void closure_debug_destroy(struct closure *cl)
{
	unsigned long irqflags;

	/* stack closures are skipped here and never unlinked */
	if (cl->magic == CLOSURE_MAGIC_STACK)
		return;

	BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
	cl->magic = CLOSURE_MAGIC_DEAD;

	spin_lock_irqsave(&closure_list_lock, irqflags);
	list_del(&cl->all);
	spin_unlock_irqrestore(&closure_list_lock, irqflags);
}
EXPORT_SYMBOL(closure_debug_destroy);
/*
 * debug_show - seq_file show callback that dumps every tracked closure
 * @f:    seq_file to print into
 * @data: unused
 *
 * For each closure on closure_list prints its address, ->ip, ->fn, parent and
 * remaining count, followed by "Q" if its work item is pending and "R" if
 * CLOSURE_RUNNING is set; closures with CLOSURE_WAITING set get an extra
 * " W <waiting_on>" line.  The list is walked under closure_list_lock.
 *
 * Return: always 0.
 */
static int debug_show(struct seq_file *f, void *data)
{
	struct closure *cl;

	spin_lock_irq(&closure_list_lock);

	list_for_each_entry(cl, &closure_list, all) {
		int state = atomic_read(&cl->remaining);
		const char *queued, *running;

		seq_printf(f, "%p: %pS -> %pS p %p r %i ",
			   cl, (void *) cl->ip, cl->fn, cl->parent,
			   state & CLOSURE_REMAINING_MASK);

		queued = test_bit(WORK_STRUCT_PENDING_BIT,
				  work_data_bits(&cl->work)) ? "Q" : "";
		running = state & CLOSURE_RUNNING ? "R" : "";
		seq_printf(f, "%s%s\n", queued, running);

		if (state & CLOSURE_WAITING)
			seq_printf(f, " W %pS\n",
				   (void *) cl->waiting_on);

		seq_puts(f, "\n");
	}

	spin_unlock_irq(&closure_list_lock);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(debug);
/*
 * closure_debug_init - register the "closures" debugfs file
 *
 * Creates a 0400 file named "closures" with a NULL parent (the debugfs root,
 * per the debugfs_create_file() documentation) backed by debug_fops.  The
 * result of debugfs_create_file() is deliberately not checked.
 *
 * Return: always 0.
 */
static int __init closure_debug_init(void)
{
	debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops);
	return 0;
}
late_initcall(closure_debug_init);
#endif