From 15eb7c888e749fbd1cc0370f3d38de08ad903700 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 31 Aug 2021 08:38:19 +0200 Subject: locking/rwsem: Add missing __init_rwsem() for PREEMPT_RT 730633f0b7f95 became the first direct caller of __init_rwsem() vs the usual init_rwsem(), exposing PREEMPT_RT's lack thereof. Add it. [ tglx: Move it out of line ] Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/50a936b7d8f12277d6ec7ed2ef0421a381056909.camel@gmx.de --- include/linux/rwsem.h | 12 ++---------- kernel/locking/rwsem.c | 10 ++++++---- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 426e98e0b675..352c6127cb90 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -142,22 +142,14 @@ struct rw_semaphore { #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name, +extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); -#else -static inline void __rwsem_init(struct rw_semaphore *rwsem, const char *name, - struct lock_class_key *key) -{ -} -#endif #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ - init_rwbase_rt(&(sem)->rwbase); \ - __rwsem_init((sem), #sem, &__key); \ + __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 9215b4d6a9de..000e8d5a2884 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1376,15 +1376,17 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #include "rwbase_rt.c" -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void __rwsem_init(struct rw_semaphore *sem, const char *name, +void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key) { + init_rwbase_rt(&(sem)->rwbase); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); -} -EXPORT_SYMBOL(__rwsem_init); #endif +} +EXPORT_SYMBOL(__init_rwsem); static inline void __down_read(struct rw_semaphore *sem) { -- cgit v1.2.3 From a974b54036f79dd5e395e9f6c80c3decb4661a14 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 18 Aug 2021 14:18:40 +0100 Subject: futex: Return error code instead of assigning it without effect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check on the rt_waiter and top_waiter->pi_state is assigning an error return code to ret but this later gets re-assigned, hence the check is ineffective. Return -EINVAL rather than assigning it to ret which was the original intent. Fixes: dc7109aaa233 ("futex: Validate waiter correctly in futex_proxy_trylock_atomic()") Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Reviewed-by: André Almeida Link: https://lore.kernel.org/r/20210818131840.34262-1-colin.king@canonical.com --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/futex.c b/kernel/futex.c index e7b4c6121da4..30e7daebaec8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2025,7 +2025,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * and waiting on the 'waitqueue' futex which is always !PI. 
*/ if (!top_waiter->rt_waiter || top_waiter->pi_state) - ret = -EINVAL; + return -EINVAL; /* Ensure we requeue to the expected futex. */ if (!match_futex(top_waiter->requeue_pi_key, key2)) -- cgit v1.2.3 From 4f07ec0d76f242d4ca0f0c0c6f7293c28254a554 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Sep 2021 11:48:48 +0200 Subject: futex: Prevent inconsistent state and exit race The recent rework of the requeue PI code introduced a possibility for going back to user space in inconsistent state: CPU 0 CPU 1 requeue_futex() if (lock_pifutex_user()) { dequeue_waiter(); wake_waiter(task); sched_in(task); return_from_futex_syscall(); ---> Inconsistent state because PI state is not established It becomes worse if the woken up task immediately exits: sys_exit(); attach_pistate(vpid); <--- FAIL Attach the pi state before dequeuing and waking the waiter. If the waiter gets a spurious wakeup before the dequeue operation it will wait in futex_requeue_pi_wakeup_sync() and therefore cannot return and exit. Fixes: 07d91ef510fb ("futex: Prevent requeue_pi() lock nesting issue on RT") Reported-by: syzbot+4d1bd0725ef09168e1a0@syzkaller.appspotmail.com Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210902094414.558914045@linutronix.de --- kernel/futex.c | 98 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 30e7daebaec8..04164d440d98 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1454,8 +1454,23 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, newval |= FUTEX_WAITERS; ret = lock_pi_update_atomic(uaddr, uval, newval); - /* If the take over worked, return 1 */ - return ret < 0 ? ret : 1; + if (ret) + return ret; + + /* + * If the waiter bit was requested the caller also needs PI + * state attached to the new owner of the user space futex. + * + * @task is guaranteed to be alive and it cannot be exiting + * because it is either sleeping or waiting in + * futex_requeue_pi_wakeup_sync(). + */ + if (set_waiters) { + ret = attach_to_pi_owner(uaddr, newval, key, ps, + exiting); + WARN_ON(ret); + } + return 1; } /* @@ -2036,17 +2051,24 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, return -EAGAIN; /* - * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in - * the contended case or if set_waiters is 1. The pi_state is returned - * in ps in contended cases. + * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit + * in the contended case or if @set_waiters is true. + * + * In the contended case PI state is attached to the lock owner. If + * the user space lock can be acquired then PI state is attached to + * the new owner (@top_waiter->task) when @set_waiters is true. */ vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { - /* Dequeue, wake up and update top_waiter::requeue_state */ + /* + * Lock was acquired in user space and PI state was + * attached to @top_waiter->task. That means state is fully + * consistent and the waiter can return to user space + * immediately after the wakeup. 
+ */ requeue_pi_wake_futex(top_waiter, key2, hb2); - return vpid; } else if (ret < 0) { /* Rewind top_waiter::requeue_state */ futex_requeue_pi_complete(top_waiter, ret); @@ -2208,19 +2230,26 @@ retry_private: &exiting, nr_requeue); /* - * At this point the top_waiter has either taken uaddr2 or is - * waiting on it. If the former, then the pi_state will not - * exist yet, look it up one more time to ensure we have a - * reference to it. If the lock was taken, @ret contains the - * VPID of the top waiter task. - * If the lock was not taken, we have pi_state and an initial - * refcount on it. In case of an error we have nothing. + * At this point the top_waiter has either taken uaddr2 or + * is waiting on it. In both cases pi_state has been + * established and an initial refcount on it. In case of an + * error there's nothing. * * The top waiter's requeue_state is up to date: * - * - If the lock was acquired atomically (ret > 0), then + * - If the lock was acquired atomically (ret == 1), then * the state is Q_REQUEUE_PI_LOCKED. * + * The top waiter has been dequeued and woken up and can + * return to user space immediately. The kernel/user + * space state is consistent. In case that there must be + * more waiters requeued the WAITERS bit in the user + * space futex is set so the top waiter task has to go + * into the syscall slowpath to unlock the futex. This + * will block until this requeue operation has been + * completed and the hash bucket locks have been + * dropped. + * * - If the trylock failed with an error (ret < 0) then * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing * happened", or Q_REQUEUE_PI_IGNORE when there was an @@ -2234,36 +2263,20 @@ retry_private: * the same sanity checks for requeue_pi as the loop * below does. */ - if (ret > 0) { - WARN_ON(pi_state); - task_count++; - /* - * If futex_proxy_trylock_atomic() acquired the - * user space futex, then the user space value - * @uaddr2 has been set to the @hb1's top waiter - * task VPID. This task is guaranteed to be alive - * and cannot be exiting because it is either - * sleeping or blocked on @hb2 lock. - * - * The @uaddr2 futex cannot have waiters either as - * otherwise futex_proxy_trylock_atomic() would not - * have succeeded. - * - * In order to requeue waiters to @hb2, pi state is - * required. Hand in the VPID value (@ret) and - * allocate PI state with an initial refcount on - * it. - */ - ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state, - &exiting); - WARN_ON(ret); - } - switch (ret) { case 0: /* We hold a reference on the pi state. */ break; + case 1: + /* + * futex_proxy_trylock_atomic() acquired the user space + * futex. Adjust task_count. + */ + task_count++; + ret = 0; + break; + /* * If the above failed, then pi_state is NULL and * waiter::requeue_state is correct. @@ -2395,9 +2408,8 @@ retry_private: } /* - * We took an extra initial reference to the pi_state either in - * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need - * to drop it here again. + * We took an extra initial reference to the pi_state in + * futex_proxy_trylock_atomic(). We need to drop it here again. */ put_pi_state(pi_state); -- cgit v1.2.3 From 249955e51c8136189b3c66f54e212981a1350a0f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Sep 2021 11:48:50 +0200 Subject: futex: Clarify comment for requeue_pi_wake_futex() It's slightly confusing. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210902094414.618613025@linutronix.de --- kernel/futex.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 04164d440d98..82cd270f25a0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1954,12 +1954,26 @@ static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) * @hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the - * target futex if it is uncontended or via a lock steal. Set the futex_q key - * to the requeue target futex so the waiter can detect the wakeup on the right - * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock - * to protect access to the pi_state to fixup the owner later. Must be called - * with both q->lock_ptr and hb->lock held. + * target futex if it is uncontended or via a lock steal. + * + * 1) Set @q::key to the requeue target futex key so the waiter can detect + * the wakeup on the right futex. + * + * 2) Dequeue @q from the hash bucket. + * + * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock + * acquisition. + * + * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that + * the waiter has to fixup the pi state. + * + * 5) Complete the requeue state so the waiter can make progress. After + * this point the waiter task can return from the syscall immediately in + * case that the pi state does not have to be fixed up. + * + * 6) Wake the waiter task. + * + * Must be called with both q->lock_ptr and hb->lock held. */ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, -- cgit v1.2.3 From 340576590dac4bb58d532a8ad5bfa806d8ab473c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 2 Sep 2021 11:48:51 +0200 Subject: futex: Avoid redundant task lookup No need to do the full VPID based task lookup and validation of the top waiter when the user space futex was acquired on it's behalf during the requeue_pi operation. The task is known already and it cannot go away before requeue_pi_wake_futex() has been invoked. Split out the actual attach code from attach_pi_state_owner() and use that instead of the full blown variant. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210902094414.676104881@linutronix.de --- kernel/futex.c | 67 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 82cd270f25a0..a316dce74c3d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1263,6 +1263,36 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, return -ESRCH; } +static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, + struct futex_pi_state **ps) +{ + /* + * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. 
+ */ + struct futex_pi_state *pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make @p + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = *key; + + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ + pi_state->owner = p; + + *ps = pi_state; +} /* * Lookup the task for the TID provided from user space and attach to * it after doing proper sanity checks. @@ -1272,7 +1302,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, struct task_struct **exiting) { pid_t pid = uval & FUTEX_TID_MASK; - struct futex_pi_state *pi_state; struct task_struct *p; /* @@ -1324,36 +1353,11 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, return ret; } - /* - * No existing pi state. First waiter. [2] - * - * This creates pi_state, we have hb->lock held, this means nothing can - * observe this state, wait_lock is irrelevant. - */ - pi_state = alloc_pi_state(); - - /* - * Initialize the pi_mutex in locked state and make @p - * the owner of it: - */ - rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); - - /* Store the key for possible exit cleanups: */ - pi_state->key = *key; - - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &p->pi_state_list); - /* - * Assignment without holding pi_state->pi_mutex.wait_lock is safe - * because there is no concurrency as the object is not published yet. - */ - pi_state->owner = p; + __attach_to_pi_owner(p, key, ps); raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); - *ps = pi_state; - return 0; } @@ -1464,11 +1468,14 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * @task is guaranteed to be alive and it cannot be exiting * because it is either sleeping or waiting in * futex_requeue_pi_wakeup_sync(). + * + * No need to do the full attach_to_pi_owner() exercise + * because @task is known and valid. */ if (set_waiters) { - ret = attach_to_pi_owner(uaddr, newval, key, ps, - exiting); - WARN_ON(ret); + raw_spin_lock_irq(&task->pi_lock); + __attach_to_pi_owner(task, key, ps); + raw_spin_unlock_irq(&task->pi_lock); } return 1; } -- cgit v1.2.3 From d66e3edee7af87fe212df611ab9846b987a5070f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Sep 2021 22:47:06 +0200 Subject: futex: Remove unused variable 'vpid' in futex_proxy_trylock_atomic() The recent bug fix left the variable 'vpid' and an assignment to it around, but the variable is otherwise unused. clang dose not complain even with W=1, but gcc exposed this. 
Fixes: 4f07ec0d76f2 ("futex: Prevent inconsistent state and exit race") Signed-off-by: Thomas Gleixner --- kernel/futex.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index a316dce74c3d..c15ad276fd15 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2034,7 +2034,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, { struct futex_q *top_waiter = NULL; u32 curval; - int ret, vpid; + int ret; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; @@ -2079,7 +2079,6 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * the user space lock can be acquired then PI state is attached to * the new owner (@top_waiter->task) when @set_waiters is true. */ - vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { -- cgit v1.2.3 From e5480572706da1b2c2dc2c6484eab64f92b9263b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Sep 2021 11:44:11 +0200 Subject: locking/rtmutex: Fix ww_mutex deadlock check Dan reported that rt_mutex_adjust_prio_chain() can be called with .orig_waiter == NULL however commit a055fcc132d4 ("locking/rtmutex: Return success on deadlock for ww_mutex waiters") unconditionally dereferences it. Since both call-sites that have .orig_waiter == NULL don't care for the return value, simply disable the deadlock squash by adding the NULL check. Notably, both callers use the deadlock condition as a termination condition for the iteration; once detected, it is sure that (de)boosting is done. Arguably step [3] would be a more natural termination point, but it's dubious whether adding a third deadlock detection state would improve the code. Fixes: a055fcc132d4 ("locking/rtmutex: Return success on deadlock for ww_mutex waiters") Reported-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/YS9La56fHMiCCo75@hirez.programming.kicks-ass.net --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8eabdc79602b..6bb116c559b4 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -753,7 +753,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * other configuration and we fail to report; also, see * lockdep. */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter->ww_ctx) + if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx) ret = 0; raw_spin_unlock(&lock->wait_lock); -- cgit v1.2.3
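
To make the rwsem change at the top of this series concrete, here is a minimal sketch of the usage pattern that exposed the gap: code that initializes many semaphores from a single helper and therefore calls __init_rwsem() directly with its own lock_class_key, rather than going through the init_rwsem() macro (which mints a static key per call site). struct my_object, my_object_sem_key and my_object_init() are illustrative names, not taken from the patch; only the __init_rwsem() signature comes from the declaration added to include/linux/rwsem.h above.

    #include <linux/lockdep.h>
    #include <linux/rwsem.h>

    struct my_object {
            struct rw_semaphore sem;        /* hypothetical embedded rwsem */
    };

    /* One shared lockdep class for every my_object::sem (name is illustrative). */
    static struct lock_class_key my_object_sem_key;

    static void my_object_init(struct my_object *obj)
    {
            /*
             * init_rwsem() would mint a new static lock_class_key at this
             * call site; passing an explicit key via __init_rwsem() keeps
             * all instances in one lockdep class.  With the patch above,
             * the PREEMPT_RT build also performs init_rwbase_rt() inside
             * __init_rwsem(), so a direct caller like this gets a fully
             * initialized semaphore in both configurations.
             */
            __init_rwsem(&obj->sem, "my_object::sem", &my_object_sem_key);
    }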
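
The dead store fixed by the "futex: Return error code instead of assigning it without effect" patch is easier to see in a stand-alone program. The sketch below is hypothetical, plain userspace C rather than anything derived from kernel/futex.c: buggy() shows the pattern Coverity flagged, where the -EINVAL assignment is unconditionally overwritten before it is ever used, and fixed() returns the error immediately, which is what the one-line change above restores.

    #include <errno.h>
    #include <stdio.h>

    static int do_step(void)
    {
            return 0;                       /* stand-in for the later work */
    }

    static int buggy(int bad_input)
    {
            int ret;

            if (bad_input)
                    ret = -EINVAL;          /* dead store: overwritten below */

            ret = do_step();                /* clobbers the error code */
            return ret;
    }

    static int fixed(int bad_input)
    {
            if (bad_input)
                    return -EINVAL;         /* propagate the error immediately */

            return do_step();
    }

    int main(void)
    {
            /* Prints buggy(1) = 0 but fixed(1) = -22, showing the lost error. */
            printf("buggy(1) = %d, fixed(1) = %d\n", buggy(1), fixed(1));
            return 0;
    }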