From d22cc7f67d55ebf2d5be865453971c783e9fb21a Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 30 Mar 2020 17:30:02 -0400 Subject: locking/percpu-rwsem: Fix a task_struct refcount The following commit: 7f26482a872c ("locking/percpu-rwsem: Remove the embedded rwsem") introduced task_struct memory leaks due to messing up the task_struct refcount. At the beginning of percpu_rwsem_wake_function(), it calls get_task_struct(), but if the trylock fails, the wait-queue entry remains in the waitqueue. percpu_rwsem_wake_function() will then run again for the same entry and call get_task_struct() again to increase the refcount, but put_task_struct() is only called once the trylock has succeeded. Fix it by adjusting percpu_rwsem_wake_function() a bit so that the reference is only taken after the trylock has succeeded. The reference is still needed to guard against percpu_rwsem_wait() observing !private, terminating the wait and doing a quick exit() while percpu_rwsem_wake_function() then does wake_up_process(p), which would be a use-after-free. Fixes: 7f26482a872c ("locking/percpu-rwsem: Remove the embedded rwsem") Suggested-by: Peter Zijlstra Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200330213002.2374-1-cai@lca.pw --- kernel/locking/percpu-rwsem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index a008a1ba21a7..8bbafe3e5203 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -118,14 +118,15 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int wake_flags, void *key) { - struct task_struct *p = get_task_struct(wq_entry->private); bool reader = wq_entry->flags & WQ_FLAG_CUSTOM; struct percpu_rw_semaphore *sem = key; + struct task_struct *p; /* concurrent against percpu_down_write(), can get stolen */ if (!__percpu_rwsem_trylock(sem, reader)) return 1; + p = get_task_struct(wq_entry->private); list_del_init(&wq_entry->entry); smp_store_release(&wq_entry->private, NULL); -- cgit 
v1.2.3 From a13f58a0cafa7b0416a2898bc3b0defbb305d108 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 3 Mar 2020 11:54:27 +0100 Subject: locking/refcount: Document interaction with PID_MAX_LIMIT Document the circumstances under which refcount_t's saturation mechanism works deterministically. Acked-by: Kees Cook Acked-by: Will Deacon Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200303105427.260620-1-jannh@google.com --- include/linux/refcount.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 0ac50cf62d06..0e3ee25eb156 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -38,11 +38,24 @@ * atomic operations, then the count will continue to edge closer to 0. If it * reaches a value of 1 before /any/ of the threads reset it to the saturated * value, then a concurrent refcount_dec_and_test() may erroneously free the - * underlying object. Given the precise timing details involved with the - * round-robin scheduling of each thread manipulating the refcount and the need - * to hit the race multiple times in succession, there doesn't appear to be a - * practical avenue of attack even if using refcount_add() operations with - * larger increments. + * underlying object. + * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently + * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK). 
+ * With the current PID limit, if no batched refcounting operations are used and + * the attacker can't repeatedly trigger kernel oopses in the middle of refcount + * operations, this makes it impossible for a saturated refcount to leave the + * saturation range, even if it is possible for multiple uses of the same + * refcount to nest in the context of a single task: + * + * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT = + * 0x40000000 / 0x400000 = 0x100 = 256 + * + * If hundreds of references are added/removed with a single refcounting + * operation, it may potentially be possible to leave the saturation range; but + * given the precise timing details involved with the round-robin scheduling of + * each thread manipulating the refcount and the need to hit the race multiple + * times in succession, there doesn't appear to be a practical avenue of attack + * even if using refcount_add() operations with larger increments. * * Memory ordering * =============== -- cgit v1.2.3 From 9a019db0b6bebc84d6b64636faf73ed6d64cd4bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 Mar 2020 20:38:12 +0200 Subject: locking/lockdep: Improve 'invalid wait context' splat The 'invalid wait context' splat doesn't print all the information required to reconstruct / validate the error, specifically the irq-context state is missing. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 51 +++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1511690e4de7..ac10db66cc63 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3952,10 +3952,36 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } +static inline short task_wait_context(struct task_struct *curr) +{ + /* + * Set appropriate wait type for the context; for IRQs we have to take + * into account force_irqthread as that is implied by PREEMPT_RT. + */ + if (curr->hardirq_context) { + /* + * Check if force_irqthreads will run us threaded. + */ + if (curr->hardirq_threaded || curr->irq_config) + return LD_WAIT_CONFIG; + + return LD_WAIT_SPIN; + } else if (curr->softirq_context) { + /* + * Softirqs are always threaded. + */ + return LD_WAIT_CONFIG; + } + + return LD_WAIT_MAX; +} + static int print_lock_invalid_wait_context(struct task_struct *curr, struct held_lock *hlock) { + short curr_inner; + if (!debug_locks_off()) return 0; if (debug_locks_silent) @@ -3971,6 +3997,10 @@ print_lock_invalid_wait_context(struct task_struct *curr, print_lock(hlock); pr_warn("other info that might help us debug this:\n"); + + curr_inner = task_wait_context(curr); + pr_warn("context-{%d:%d}\n", curr_inner, curr_inner); + lockdep_print_held_locks(curr); pr_warn("stack backtrace:\n"); @@ -4017,26 +4047,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) } depth++; - /* - * Set appropriate wait type for the context; for IRQs we have to take - * into account force_irqthread as that is implied by PREEMPT_RT. - */ - if (curr->hardirq_context) { - /* - * Check if force_irqthreads will run us threaded. 
- */ - if (curr->hardirq_threaded || curr->irq_config) - curr_inner = LD_WAIT_CONFIG; - else - curr_inner = LD_WAIT_SPIN; - } else if (curr->softirq_context) { - /* - * Softirqs are always threaded. - */ - curr_inner = LD_WAIT_CONFIG; - } else { - curr_inner = LD_WAIT_MAX; - } + curr_inner = task_wait_context(curr); for (; depth < curr->lockdep_depth; depth++) { struct held_lock *prev = curr->held_locks + depth; -- cgit v1.2.3