diff --git a/fs/exec.c b/fs/exec.c index ab913243a367..58f16312b983 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1010,6 +1010,7 @@ static int exec_mmap(struct mm_struct *mm) active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; + mm_init_cid(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1822,6 +1823,7 @@ static int bprm_execve(struct linux_binprm *bprm, */ check_unsafe_exec(bprm); current->in_execve = 1; + sched_mm_cid_before_execve(current); file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); @@ -1852,6 +1854,7 @@ static int bprm_execve(struct linux_binprm *bprm, if (retval < 0) goto out; + sched_mm_cid_after_execve(current); /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; @@ -1871,6 +1874,7 @@ static int bprm_execve(struct linux_binprm *bprm, force_fatal_sig(SIGSEGV); out_unmark: + sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index f3f196e4d66d..cf008c26a883 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1976,6 +1976,31 @@ struct zap_details { /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit_signals(struct task_struct *t); +static inline int task_mm_cid(struct task_struct *t) +{ + return t->mm_cid; +} +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline int task_mm_cid(struct task_struct *t) +{ + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return raw_smp_processor_id(); +} +#endif + #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3b8475007734..1c3bf76063d2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -645,7 +645,18 @@ struct mm_struct { * &struct mm_struct is freed. */ atomic_t mm_count; - +#ifdef CONFIG_SCHED_MM_CID + /** + * @cid_lock: Protect cid bitmap updates vs lookups. + * + * Prevent situations where updates to the cid bitmap happen + * concurrently with lookups. Those can lead to situations + * where a lookup cannot find a free bit simply because it was + * unlucky enough to load, non-atomically, bitmap words as they + * were being concurrently updated by the updaters. + */ + raw_spinlock_t cid_lock; +#endif #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif @@ -909,6 +920,36 @@ static inline void vma_iter_init(struct vma_iterator *vmi, vmi->mas.node = MAS_START; } +#ifdef CONFIG_SCHED_MM_CID +/* Accessor for struct mm_struct's cidmask. */ +static inline cpumask_t *mm_cidmask(struct mm_struct *mm) +{ + unsigned long cid_bitmap = (unsigned long)mm; + + cid_bitmap += offsetof(struct mm_struct, cpu_bitmap); + /* Skip cpu_bitmap */ + cid_bitmap += cpumask_size(); + return (struct cpumask *)cid_bitmap; +} + +static inline void mm_init_cid(struct mm_struct *mm) +{ + raw_spin_lock_init(&mm->cid_lock); + cpumask_clear(mm_cidmask(mm)); +} + +static inline unsigned int mm_cid_size(void) +{ + return cpumask_size(); +} +#else /* CONFIG_SCHED_MM_CID */ +static inline void mm_init_cid(struct mm_struct *mm) { } +static inline unsigned int mm_cid_size(void) +{ + return 0; +} +#endif /* CONFIG_SCHED_MM_CID */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/sched.h b/include/linux/sched.h index e0bc020a63a9..4df2b3e76b30 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1311,6 +1311,11 @@ struct task_struct { unsigned long rseq_event_mask; #endif +#ifdef CONFIG_SCHED_MM_CID + int mm_cid; /* Current cid in mm */ + int mm_cid_active; /* Whether cid bitmap is active */ +#endif + struct tlbflush_unmap_batch tlb_ubc; union { diff --git a/init/Kconfig b/init/Kconfig index 7e5c3ddc341d..1ce960aa453e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1041,6 +1041,10 @@ config RT_GROUP_SCHED endif #CGROUP_SCHED +config SCHED_MM_CID + def_bool y + depends on SMP && RSEQ + config UCLAMP_TASK_GROUP bool "Utilization clamping per group of tasks" depends on CGROUP_SCHED diff --git a/kernel/fork.c b/kernel/fork.c index 9f7fe3541897..82b2b5846aae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1060,6 +1060,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->reported_split_lock = 0; #endif +#ifdef CONFIG_SCHED_MM_CID + tsk->mm_cid = -1; + tsk->mm_cid_active = 0; +#endif return tsk; free_stack: @@ -1169,6 +1173,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); + mm_init_cid(mm); return mm; fail_pcpu: @@ -1601,6 +1606,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) tsk->mm = mm; tsk->active_mm = mm; + sched_mm_cid_fork(tsk); return 0; } @@ -3034,7 +3040,7 @@ void __init mm_cache_init(void) * dynamically sized based on the maximum CPU number this system * can have, taking hotplug into account (nr_cpu_ids). */ - mm_size = sizeof(struct mm_struct) + cpumask_size(); + mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size(); mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size, ARCH_MIN_MMSTRUCT_ALIGN, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 25b582b6ee5f..75830b7dee8f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5052,6 +5052,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); + switch_mm_cid(prev, next); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -11305,3 +11306,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); } + +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_exit_signals(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + mm_cid_put(mm, t->mm_cid); + t->mm_cid = -1; + t->mm_cid_active = 0; + local_irq_restore(flags); +} + +void sched_mm_cid_before_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + mm_cid_put(mm, t->mm_cid); + t->mm_cid = -1; + t->mm_cid_active = 0; + local_irq_restore(flags); +} + +void sched_mm_cid_after_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm); + + local_irq_save(flags); + t->mm_cid = mm_cid_get(mm); + t->mm_cid_active = 1; + local_irq_restore(flags); + rseq_set_notify_resume(t); +} + +void sched_mm_cid_fork(struct task_struct *t) +{ + WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm || t->mm_cid != -1); + t->mm_cid_active = 1; +} +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3d6e819127c..c2d7467fdde1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3269,4 +3269,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr, cgroup_account_cputime(curr, delta_exec); } +#ifdef CONFIG_SCHED_MM_CID +static inline int __mm_cid_get(struct mm_struct *mm) +{ + struct cpumask *cpumask; + int cid; + + cpumask = mm_cidmask(mm); + cid = cpumask_first_zero(cpumask); + if (cid >= nr_cpu_ids) + return -1; + __cpumask_set_cpu(cid, cpumask); + return cid; +} + +static inline void mm_cid_put(struct mm_struct *mm, int cid) +{ + lockdep_assert_irqs_disabled(); + if (cid < 0) + return; + raw_spin_lock(&mm->cid_lock); + __cpumask_clear_cpu(cid, mm_cidmask(mm)); + raw_spin_unlock(&mm->cid_lock); +} + +static inline int mm_cid_get(struct mm_struct *mm) +{ + int ret; + + lockdep_assert_irqs_disabled(); + raw_spin_lock(&mm->cid_lock); + ret = __mm_cid_get(mm); + raw_spin_unlock(&mm->cid_lock); + return ret; +} + +static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) +{ + if (prev->mm_cid_active) { + if (next->mm_cid_active && next->mm == prev->mm) { + /* + * Context switch between threads in same mm, hand over + * the mm_cid from prev to next. + */ + next->mm_cid = prev->mm_cid; + prev->mm_cid = -1; + return; + } + mm_cid_put(prev->mm, prev->mm_cid); + prev->mm_cid = -1; + } + if (next->mm_cid_active) + next->mm_cid = mm_cid_get(next->mm); +} + +#else +static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } +#endif + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/signal.c b/kernel/signal.c index ae26da61c4d9..8cb28f1df294 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2951,6 +2951,7 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { + sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; @@ -2961,6 +2962,7 @@ void exit_signals(struct task_struct *tsk) * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ + sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk);