summaryrefslogtreecommitdiffstats
path: root/kernel/locking
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-01-11 14:18:38 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2016-01-11 14:18:38 -0800
commit24af98c4cf5f5e69266e270c7f3fb34b82ff6656 (patch)
tree70d71381c841c92b2d28397bf0c5d6a7d9bbbaac /kernel/locking
parent9061cbe62adeccf8c986883bcd40f4aeee59ea75 (diff)
parent337f13046ff03717a9e99675284a817527440a49 (diff)
downloadlinux-0-day-24af98c4cf5f5e69266e270c7f3fb34b82ff6656.tar.gz
linux-0-day-24af98c4cf5f5e69266e270c7f3fb34b82ff6656.tar.xz
Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar: "So we have a laundry list of locking subsystem changes: - continuing barrier API and code improvements - futex enhancements - atomics API improvements - pvqspinlock enhancements: in particular lock stealing and adaptive spinning - qspinlock micro-enhancements" * 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op futex: Cleanup the goto confusion in requeue_pi() futex: Remove pointless put_pi_state calls in requeue() futex: Document pi_state refcounting in requeue code futex: Rename free_pi_state() to put_pi_state() futex: Drop refcount if requeue_pi() acquired the rtmutex locking/barriers, arch: Remove ambiguous statement in the smp_store_mb() documentation lcoking/barriers, arch: Use smp barriers in smp_store_release() locking/cmpxchg, arch: Remove tas() definitions locking/pvqspinlock: Queue node adaptive spinning locking/pvqspinlock: Allow limited lock stealing locking/pvqspinlock: Collect slowpath lock statistics sched/core, locking: Document Program-Order guarantees locking, sched: Introduce smp_cond_acquire() and use it locking/pvqspinlock, x86: Optimize the PV unlock code path locking/qspinlock: Avoid redundant read of next pointer locking/qspinlock: Prefetch the next node cacheline locking/qspinlock: Use _acquire/_release() versions of cmpxchg() & xchg() atomics: Add test for atomic operations with _relaxed variants
Diffstat (limited to 'kernel/locking')
-rw-r--r--kernel/locking/qspinlock.c82
-rw-r--r--kernel/locking/qspinlock_paravirt.h252
-rw-r--r--kernel/locking/qspinlock_stat.h300
3 files changed, 576 insertions, 58 deletions
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a63c5d..393d1874b9e0c 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2013-2014 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
* Peter Zijlstra <peterz@infradead.org>
*/
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
struct __qspinlock *l = (void *)lock;
- return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+ /*
+ * Use release semantics to make sure that the MCS node is properly
+ * initialized before changing the tail code.
+ */
+ return (u32)xchg_release(&l->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
#else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Use release semantics to make sure that the MCS node is
+ * properly initialized before changing the tail code.
+ */
+ old = atomic_cmpxchg_release(&lock->val, val, new);
if (old == val)
break;
@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
*/
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+ struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
- struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
+ struct mcs_spinlock *node)
+ { return 0; }
#define pv_enabled() false
#define pv_init_node __pv_init_node
#define pv_wait_node __pv_wait_node
#define pv_kick_node __pv_kick_node
-#define pv_wait_head __pv_wait_head
+#define pv_wait_head_or_lock __pv_wait_head_or_lock
#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (val == new)
new |= _Q_PENDING_VAL;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Acquire semantic is required here as the function may
+ * return immediately if the lock was free.
+ */
+ old = atomic_cmpxchg_acquire(&lock->val, val, new);
if (old == val)
break;
@@ -382,6 +398,7 @@ queue:
* p,*,* -> n,*,*
*/
old = xchg_tail(lock, tail);
+ next = NULL;
/*
* if there was a previous node; link it and wait until reaching the
@@ -391,8 +408,18 @@ queue:
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);
- pv_wait_node(node);
+ pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked);
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
}
/*
@@ -406,11 +433,22 @@ queue:
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
+ * The PV pv_wait_head_or_lock function, if active, will acquire
+ * the lock and return a non-zero value. So we have to skip the
+ * smp_load_acquire() call. As the next PV queue head hasn't been
+ * designated yet, there is no way for the locked value to become
+ * _Q_SLOW_VAL. So both the set_locked() and the
+ * atomic_cmpxchg_relaxed() calls will be safe.
+ *
+ * If PV isn't active, 0 will be returned instead.
+ *
*/
- pv_wait_head(lock, node);
- while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
- cpu_relax();
+ if ((val = pv_wait_head_or_lock(lock, node)))
+ goto locked;
+ smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+
+locked:
/*
* claim the lock:
*
@@ -422,11 +460,17 @@ queue:
* to grab the lock.
*/
for (;;) {
- if (val != tail) {
+ /* In the PV case we might already have _Q_LOCKED_VAL set */
+ if ((val & _Q_TAIL_MASK) != tail) {
set_locked(lock);
break;
}
- old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+ /*
+ * The smp_load_acquire() call above has provided the necessary
+ * acquire semantics required for locking. At most two
+ * iterations of this loop may be ran.
+ */
+ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
goto release; /* No contention */
@@ -434,10 +478,12 @@ queue:
}
/*
- * contended path; wait for next, release.
+ * contended path; wait for next if not observed yet, release.
*/
- while (!(next = READ_ONCE(node->next)))
- cpu_relax();
+ if (!next) {
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+ }
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
#undef queued_spin_lock_slowpath
#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f0450ff4829b6..87bb235c34480 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,20 @@
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK 0xff
+
+/*
* Queue node uses: vcpu_running & vcpu_halted.
* Queue head uses: vcpu_running & vcpu_hashed.
*/
@@ -41,6 +55,94 @@ struct pv_node {
};
/*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enter the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 1);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 0);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return !READ_ONCE(l->locked) &&
+ (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
+ == _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ int val = atomic_read(&lock->val);
+
+ for (;;) {
+ int old, new;
+
+ if (val & _Q_LOCKED_MASK)
+ break;
+
+ /*
+ * Try to clear pending bit & set locked bit
+ */
+ old = val;
+ new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ val = atomic_cmpxchg(&lock->val, old, new);
+
+ if (val == old)
+ return 1;
+ }
+ return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
* Lock and MCS node addresses hash table for fast lookup
*
* Hashing is done on a per-cacheline basis to minimize the need to access
@@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
struct pv_hash_entry *he;
+ int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
+ hopcnt++;
if (!cmpxchg(&he->lock, NULL, lock)) {
WRITE_ONCE(he->node, node);
+ qstat_hop(hopcnt);
return &he->lock;
}
}
@@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
}
/*
+ * Return true if when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+
+ if ((loop & PV_PREV_CHECK_MASK) != 0)
+ return false;
+
+ return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
* Initialize the PV part of the mcs_spinlock node.
*/
static void pv_init_node(struct mcs_spinlock *node)
@@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
* pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
* behalf.
*/
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct pv_node *pp = (struct pv_node *)prev;
+ int waitcnt = 0;
int loop;
+ bool wait_early;
- for (;;) {
- for (loop = SPIN_THRESHOLD; loop; loop--) {
+ /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+ for (;; waitcnt++) {
+ for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked))
return;
+ if (pv_wait_early(pp, loop)) {
+ wait_early = true;
+ break;
+ }
cpu_relax();
}
@@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
*/
smp_store_mb(pn->state, vcpu_halted);
- if (!READ_ONCE(node->locked))
+ if (!READ_ONCE(node->locked)) {
+ qstat_inc(qstat_pv_wait_node, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
+ qstat_inc(qstat_pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted);
+ }
/*
- * If pv_kick_node() changed us to vcpu_hashed, retain that value
- * so that pv_wait_head() knows to not also try to hash this lock.
+ * If pv_kick_node() changed us to vcpu_hashed, retain that
+ * value so that pv_wait_head_or_lock() knows to not also try
+ * to hash this lock.
*/
cmpxchg(&pn->state, vcpu_halted, vcpu_running);
@@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
+ qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
}
/*
@@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
/*
* Called after setting next->locked = 1 when we're the lock owner.
*
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
*/
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
@@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
}
/*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
* __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
*/
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL;
+ int waitcnt = 0;
int loop;
/*
@@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
if (READ_ONCE(pn->state) == vcpu_hashed)
lp = (struct qspinlock **)1;
- for (;;) {
+ for (;; waitcnt++) {
+ /*
+ * Set correct vCPU state to be used by queue node wait-early
+ * mechanism.
+ */
+ WRITE_ONCE(pn->state, vcpu_running);
+
+ /*
+ * Set the pending bit in the active lock spinning loop to
+ * disable lock stealing before attempting to acquire the lock.
+ */
+ set_pending(lock);
for (loop = SPIN_THRESHOLD; loop; loop--) {
- if (!READ_ONCE(l->locked))
- return;
+ if (trylock_clear_pending(lock))
+ goto gotlock;
cpu_relax();
}
+ clear_pending(lock);
+
if (!lp) { /* ONCE */
lp = pv_hash(lock, pn);
@@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
*
* Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
- if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+ if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
/*
- * The lock is free and _Q_SLOW_VAL has never
- * been set. Therefore we need to unhash before
- * getting the lock.
+ * The lock was free and now we own the lock.
+ * Change the lock value back to _Q_LOCKED_VAL
+ * and unhash the table.
*/
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
- return;
+ goto gotlock;
}
}
+ WRITE_ONCE(pn->state, vcpu_halted);
+ qstat_inc(qstat_pv_wait_head, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
/*
* The unlocker should have freed the lock before kicking the
* CPU. So if the lock is still not free, it is a spurious
- * wakeup and so the vCPU should wait again after spinning for
- * a while.
+ * wakeup or another vCPU has stolen the lock. The current
+ * vCPU should spin again.
*/
+ qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
}
/*
- * Lock is unlocked now; the caller will acquire it without waiting.
- * As with pv_wait_node() we rely on the caller to do a load-acquire
- * for us.
+ * The cmpxchg() or xchg() call before coming here provides the
+ * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+ * here is to indicate to the compiler that the value will always
+ * be nozero to enable better code optimization.
*/
+gotlock:
+ return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
}
/*
- * PV version of the unlock function to be used in stead of
- * queued_spin_unlock().
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
*/
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
- u8 locked;
-
- /*
- * We must not unlock if SLOW, because in that case we must first
- * unhash. Otherwise it would be possible to have multiple @lock
- * entries, which would be BAD.
- */
- locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
- if (likely(locked == _Q_LOCKED_VAL))
- return;
if (unlikely(locked != _Q_SLOW_VAL)) {
WARN(!debug_locks_silent,
@@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* so we need a barrier to order the read of the node data in
* pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
*
- * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
*/
smp_rmb();
@@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* vCPU is harmless other than the additional latency in completing
* the unlock.
*/
+ qstat_inc(qstat_pv_kick_unlock, true);
pv_kick(node->cpu);
}
+
/*
* Include the architecture specific callee-save thunk of the
* __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() near the top of the file to make sure
- * that the callee-save thunk and the real unlock function are close
- * to each other sharing consecutive instruction cachelines.
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
*/
#include <asm/qspinlock_paravirt.h>
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+ u8 locked;
+
+ /*
+ * We must not unlock if SLOW, because in that case we must first
+ * unhash. Otherwise it would be possible to have multiple @lock
+ * entries, which would be BAD.
+ */
+ locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
+ return;
+
+ __pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif /* __pv_queued_spin_unlock */
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 0000000000000..640dcecdd1df7
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,300 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * When queued spinlock statistical counters are enabled, the following
+ * debugfs files will be created for reporting the counter values:
+ *
+ * <debugfs>/qlockstat/
+ * pv_hash_hops - average # of hops per hashing operation
+ * pv_kick_unlock - # of vCPU kicks issued at unlock time
+ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
+ * pv_latency_kick - average latency (ns) of vCPU kick operation
+ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
+ * pv_lock_stealing - # of lock stealing operations
+ * pv_spurious_wakeup - # of spurious wakeups
+ * pv_wait_again - # of vCPU wait's that happened after a vCPU kick
+ * pv_wait_early - # of early vCPU wait's
+ * pv_wait_head - # of vCPU wait's at the queue head
+ * pv_wait_node - # of vCPU wait's at a non-head queue node
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistical counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counters usable even in a production
+ * environment.
+ *
+ * There may be slight difference between pv_kick_wake and pv_kick_unlock.
+ */
+enum qlock_stats {
+ qstat_pv_hash_hops,
+ qstat_pv_kick_unlock,
+ qstat_pv_kick_wake,
+ qstat_pv_latency_kick,
+ qstat_pv_latency_wake,
+ qstat_pv_lock_stealing,
+ qstat_pv_spurious_wakeup,
+ qstat_pv_wait_again,
+ qstat_pv_wait_early,
+ qstat_pv_wait_head,
+ qstat_pv_wait_node,
+ qstat_num, /* Total number of statistical counters */
+ qstat_reset_cnts = qstat_num,
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+
+static const char * const qstat_names[qstat_num + 1] = {
+ [qstat_pv_hash_hops] = "pv_hash_hops",
+ [qstat_pv_kick_unlock] = "pv_kick_unlock",
+ [qstat_pv_kick_wake] = "pv_kick_wake",
+ [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
+ [qstat_pv_latency_kick] = "pv_latency_kick",
+ [qstat_pv_latency_wake] = "pv_latency_wake",
+ [qstat_pv_lock_stealing] = "pv_lock_stealing",
+ [qstat_pv_wait_again] = "pv_wait_again",
+ [qstat_pv_wait_early] = "pv_wait_early",
+ [qstat_pv_wait_head] = "pv_wait_head",
+ [qstat_pv_wait_node] = "pv_wait_node",
+ [qstat_reset_cnts] = "reset_counters",
+};
+
+/*
+ * Per-cpu counters
+ */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Function to read and return the qlock statistical counter values
+ *
+ * The following counters are handled specially:
+ * 1. qstat_pv_latency_kick
+ * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ * 2. qstat_pv_latency_wake
+ * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ * 3. qstat_pv_hash_hops
+ * Average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+static ssize_t qstat_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[64];
+ int cpu, counter, len;
+ u64 stat = 0, kicks = 0;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if (!file->f_inode) {
+ WARN_ON_ONCE(1);
+ return -EBADF;
+ }
+ counter = (long)(file->f_inode->i_private);
+
+ if (counter >= qstat_num)
+ return -EBADF;
+
+ for_each_possible_cpu(cpu) {
+ stat += per_cpu(qstats[counter], cpu);
+ /*
+ * Need to sum additional counter for some of them
+ */
+ switch (counter) {
+
+ case qstat_pv_latency_kick:
+ case qstat_pv_hash_hops:
+ kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+ break;
+
+ case qstat_pv_latency_wake:
+ kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+ break;
+ }
+ }
+
+ if (counter == qstat_pv_hash_hops) {
+ u64 frac;
+
+ frac = 100ULL * do_div(stat, kicks);
+ frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+
+ /*
+ * Return a X.XX decimal number
+ */
+ len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+ } else {
+ /*
+ * Round to the nearest ns
+ */
+ if ((counter == qstat_pv_latency_kick) ||
+ (counter == qstat_pv_latency_wake)) {
+ stat = 0;
+ if (kicks)
+ stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
+ }
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+ }
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Function to handle write request
+ *
+ * When counter = reset_cnts, reset all the counter values.
+ * Since the counter updates aren't atomic, the resetting is done twice
+ * to make sure that the counters are very likely to be all cleared.
+ */
+static ssize_t qstat_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int cpu;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if (!file->f_inode) {
+ WARN_ON_ONCE(1);
+ return -EBADF;
+ }
+ if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
+ return count;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ unsigned long *ptr = per_cpu_ptr(qstats, cpu);
+
+ for (i = 0 ; i < qstat_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ for (i = 0 ; i < qstat_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ }
+ return count;
+}
+
+/*
+ * Debugfs data structures
+ */
+static const struct file_operations fops_qstat = {
+ .read = qstat_read,
+ .write = qstat_write,
+ .llseek = default_llseek,
+};
+
+/*
+ * Initialize debugfs for the qspinlock statistical counters
+ */
+static int __init init_qspinlock_stat(void)
+{
+ struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
+ int i;
+
+ if (!d_qstat) {
+ pr_warn("Could not create 'qlockstat' debugfs directory\n");
+ return 0;
+ }
+
+ /*
+ * Create the debugfs files
+ *
+ * As reading from and writing to the stat files can be slow, only
+ * root is allowed to do the read/write to limit impact to system
+ * performance.
+ */
+ for (i = 0; i < qstat_num; i++)
+ debugfs_create_file(qstat_names[i], 0400, d_qstat,
+ (void *)(long)i, &fops_qstat);
+
+ debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
+ (void *)(long)qstat_reset_cnts, &fops_qstat);
+ return 0;
+}
+fs_initcall(init_qspinlock_stat);
+
+/*
+ * Increment the PV qspinlock statistical counters
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+ if (cond)
+ this_cpu_inc(qstats[stat]);
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void qstat_hop(int hopcnt)
+{
+ this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+ u64 start = sched_clock();
+
+ per_cpu(pv_kick_time, cpu) = start;
+ pv_kick(cpu);
+ this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+ u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+ *pkick_time = 0;
+ pv_wait(ptr, val);
+ if (*pkick_time) {
+ this_cpu_add(qstats[qstat_pv_latency_wake],
+ sched_clock() - *pkick_time);
+ qstat_inc(qstat_pv_kick_wake, true);
+ }
+}
+
+#define pv_kick(c) __pv_kick(c)
+#define pv_wait(p, v) __pv_wait(p, v)
+
+/*
+ * PV unfair trylock count tracking function
+ */
+static inline int qstat_spin_steal_lock(struct qspinlock *lock)
+{
+ int ret = pv_queued_spin_steal_lock(lock);
+
+ qstat_inc(qstat_pv_lock_stealing, ret);
+ return ret;
+}
+#undef queued_spin_trylock
+#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
+static inline void qstat_hop(int hopcnt) { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */