summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- block/blk-core.c 2
-rw-r--r-- block/blk-mq.c 229
-rw-r--r-- block/blk-mq.h 46
-rw-r--r-- block/blk-timeout.c 2
-rw-r--r-- block/blk.h 6
-rw-r--r-- include/linux/blk-mq.h 1
-rw-r--r-- include/linux/blkdev.h 23
7 files changed, 230 insertions, 79 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 2e0d041e2dafb..f843ae4f858de 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
+ seqcount_init(&rq->gstate_seq);
+ u64_stats_init(&rq->aborted_gstate_sync);
}
EXPORT_SYMBOL(blk_rq_init);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f5e57c80a82be..156203876c8c8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -483,6 +483,7 @@ void blk_mq_free_request(struct request *rq)
if (blk_rq_rl(rq))
blk_put_rl(blk_rq_rl(rq));
+ blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
if (rq->tag != -1)
@@ -530,6 +531,8 @@ static void __blk_mq_complete_request(struct request *rq)
bool shared = false;
int cpu;
+ WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq);
if (rq->rq_flags & RQF_STATS) {
@@ -573,6 +576,36 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
*srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
}
+static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+{
+ unsigned long flags;
+
+ /*
+ * blk_mq_rq_aborted_gstate() is used from the completion path and
+ * can thus be called from irq context. u64_stats_fetch in the
+ * middle of update on the same CPU leads to lockup. Disable irq
+ * while updating.
+ */
+ local_irq_save(flags);
+ u64_stats_update_begin(&rq->aborted_gstate_sync);
+ rq->aborted_gstate = gstate;
+ u64_stats_update_end(&rq->aborted_gstate_sync);
+ local_irq_restore(flags);
+}
+
+static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+{
+ unsigned int start;
+ u64 aborted_gstate;
+
+ do {
+ start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+ aborted_gstate = rq->aborted_gstate;
+ } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+
+ return aborted_gstate;
+}
+
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
@@ -590,8 +623,20 @@ void blk_mq_complete_request(struct request *rq)
if (unlikely(blk_should_fake_timeout(q)))
return;
+ /*
+ * If @rq->aborted_gstate equals the current instance, timeout is
+ * claiming @rq and we lost. This is synchronized through
+ * hctx_lock(). See blk_mq_timeout_work() for details.
+ *
+ * Completion path never blocks and we can directly use RCU here
+ * instead of hctx_lock() which can be either RCU or SRCU.
+ * However, that would complicate paths which want to synchronize
+ * against us. Let's stay in sync with the issue path so that
+ * hctx_lock() covers both issue and completion paths.
+ */
hctx_lock(hctx, &srcu_idx);
- if (!blk_mark_rq_complete(rq))
+ if (blk_mq_rq_aborted_gstate(rq) != rq->gstate &&
+ !blk_mark_rq_complete(rq))
__blk_mq_complete_request(rq);
hctx_unlock(hctx, srcu_idx);
}
@@ -617,34 +662,32 @@ void blk_mq_start_request(struct request *rq)
wbt_issue(q->rq_wb, &rq->issue_stat);
}
- blk_add_timer(rq);
-
+ WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
/*
- * Mark us as started and clear complete. Complete might have been
- * set if requeue raced with timeout, which then marked it as
- * complete. So be sure to clear complete again when we start
- * the request, otherwise we'll ignore the completion event.
+ * Mark @rq in-flight which also advances the generation number,
+ * and register for timeout. Protect with a seqcount to allow the
+ * timeout path to read both @rq->gstate and @rq->deadline
+ * coherently.
*
- * Ensure that ->deadline is visible before we set STARTED, such that
- * blk_mq_check_expired() is guaranteed to observe our ->deadline when
- * it observes STARTED.
+ * This is the only place where a request is marked in-flight. If
+ * the timeout path reads an in-flight @rq->gstate, the
+ * @rq->deadline it reads together under @rq->gstate_seq is
+ * guaranteed to be the matching one.
*/
- smp_wmb();
+ preempt_disable();
+ write_seqcount_begin(&rq->gstate_seq);
+
+ blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+ blk_add_timer(rq);
+
+ write_seqcount_end(&rq->gstate_seq);
+ preempt_enable();
+
set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
- /*
- * Coherence order guarantees these consecutive stores to a
- * single variable propagate in the specified order. Thus the
- * clear_bit() is ordered _after_ the set bit. See
- * blk_mq_check_expired().
- *
- * (the bits must be part of the same byte for this to be
- * true).
- */
+ if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
- }
if (q->dma_drain_size && blk_rq_bytes(rq)) {
/*
@@ -677,6 +720,7 @@ static void __blk_mq_requeue_request(struct request *rq)
blk_mq_sched_requeue_request(rq);
if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+ blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
if (q->dma_drain_size && blk_rq_bytes(rq))
rq->nr_phys_segments--;
}
@@ -774,6 +818,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
struct blk_mq_timeout_data {
unsigned long next;
unsigned int next_set;
+ unsigned int nr_expired;
};
void blk_mq_rq_timed_out(struct request *req, bool reserved)
@@ -801,6 +846,12 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
__blk_mq_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
+ /*
+ * As nothing prevents a completion from happening while
+ * ->aborted_gstate is set, this may lead to ignored
+ * completions and further spurious timeouts.
+ */
+ blk_mq_rq_update_aborted_gstate(req, 0);
blk_add_timer(req);
blk_clear_rq_complete(req);
break;
@@ -816,50 +867,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
struct blk_mq_timeout_data *data = priv;
- unsigned long deadline;
+ unsigned long gstate, deadline;
+ int start;
+
+ might_sleep();
if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
return;
- /*
- * Ensures that if we see STARTED we must also see our
- * up-to-date deadline, see blk_mq_start_request().
- */
- smp_rmb();
-
- deadline = READ_ONCE(rq->deadline);
+ /* read coherent snapshots of @rq->gstate and @rq->deadline */
+ while (true) {
+ start = read_seqcount_begin(&rq->gstate_seq);
+ gstate = READ_ONCE(rq->gstate);
+ deadline = rq->deadline;
+ if (!read_seqcount_retry(&rq->gstate_seq, start))
+ break;
+ cond_resched();
+ }
- /*
- * The rq being checked may have been freed and reallocated
- * out already here, we avoid this race by checking rq->deadline
- * and REQ_ATOM_COMPLETE flag together:
- *
- * - if rq->deadline is observed as new value because of
- * reusing, the rq won't be timed out because of timing.
- * - if rq->deadline is observed as previous value,
- * REQ_ATOM_COMPLETE flag won't be cleared in reuse path
- * because we put a barrier between setting rq->deadline
- * and clearing the flag in blk_mq_start_request(), so
- * this rq won't be timed out too.
- */
- if (time_after_eq(jiffies, deadline)) {
- if (!blk_mark_rq_complete(rq)) {
- /*
- * Again coherence order ensures that consecutive reads
- * from the same variable must be in that order. This
- * ensures that if we see COMPLETE clear, we must then
- * see STARTED set and we'll ignore this timeout.
- *
- * (There's also the MB implied by the test_and_clear())
- */
- blk_mq_rq_timed_out(rq, reserved);
- }
+ /* if in-flight && overdue, mark for abortion */
+ if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+ time_after_eq(jiffies, deadline)) {
+ blk_mq_rq_update_aborted_gstate(rq, gstate);
+ data->nr_expired++;
+ hctx->nr_expired++;
} else if (!data->next_set || time_after(data->next, deadline)) {
data->next = deadline;
data->next_set = 1;
}
}
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+ struct request *rq, void *priv, bool reserved)
+{
+ /*
+ * We marked @rq->aborted_gstate and waited for RCU. If there were
+ * completions that we lost to, they would have finished and
+ * updated @rq->gstate by now; otherwise, the completion path is
+ * now guaranteed to see @rq->aborted_gstate and yield. If
+ * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+ */
+ if (READ_ONCE(rq->gstate) == rq->aborted_gstate &&
+ !blk_mark_rq_complete(rq))
+ blk_mq_rq_timed_out(rq, reserved);
+}
+
static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
@@ -867,7 +919,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
struct blk_mq_timeout_data data = {
.next = 0,
.next_set = 0,
+ .nr_expired = 0,
};
+ struct blk_mq_hw_ctx *hctx;
int i;
/* A deadlock might occur if a request is stuck requiring a
@@ -886,14 +940,40 @@ static void blk_mq_timeout_work(struct work_struct *work)
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
+ /* scan for the expired ones and set their ->aborted_gstate */
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
+ if (data.nr_expired) {
+ bool has_rcu = false;
+
+ /*
+ * Wait till everyone sees ->aborted_gstate. The
+ * sequential waits for SRCUs aren't ideal. If this ever
+ * becomes a problem, we can add per-hw_ctx rcu_head and
+ * wait in parallel.
+ */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->nr_expired)
+ continue;
+
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+ has_rcu = true;
+ else
+ synchronize_srcu(hctx->queue_rq_srcu);
+
+ hctx->nr_expired = 0;
+ }
+ if (has_rcu)
+ synchronize_rcu();
+
+ /* terminate the ones we won */
+ blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+ }
+
if (data.next_set) {
data.next = blk_rq_timeout(round_jiffies_up(data.next));
mod_timer(&q->timeout, data.next);
} else {
- struct blk_mq_hw_ctx *hctx;
-
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
if (blk_mq_hw_queue_mapped(hctx))
@@ -1893,6 +1973,22 @@ static size_t order_to_size(unsigned int order)
return (size_t)PAGE_SIZE << order;
}
+static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+ unsigned int hctx_idx, int node)
+{
+ int ret;
+
+ if (set->ops->init_request) {
+ ret = set->ops->init_request(set, rq, hctx_idx, node);
+ if (ret)
+ return ret;
+ }
+
+ seqcount_init(&rq->gstate_seq);
+ u64_stats_init(&rq->aborted_gstate_sync);
+ return 0;
+}
+
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth)
{
@@ -1954,12 +2050,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
struct request *rq = p;
tags->static_rqs[i] = rq;
- if (set->ops->init_request) {
- if (set->ops->init_request(set, rq, hctx_idx,
- node)) {
- tags->static_rqs[i] = NULL;
- goto fail;
- }
+ if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+ tags->static_rqs[i] = NULL;
+ goto fail;
}
p += rq_size;
@@ -2099,9 +2192,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (!hctx->fq)
goto sched_exit_hctx;
- if (set->ops->init_request &&
- set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
- node))
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
goto free_fq;
if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -3019,12 +3110,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
static int __init blk_mq_init(void)
{
- /*
- * See comment in block/blk.h rq_atomic_flags enum
- */
- BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
- (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
-
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
return 0;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6c7c3ff5bf627..cf01f6f8c73dc 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,6 +27,19 @@ struct blk_mq_ctx {
struct kobject kobj;
} ____cacheline_aligned_in_smp;
+/*
+ * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
+ * and the upper bits the generation number.
+ */
+enum mq_rq_state {
+ MQ_RQ_IDLE = 0,
+ MQ_RQ_IN_FLIGHT = 1,
+
+ MQ_RQ_STATE_BITS = 2,
+ MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
+ MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
+};
+
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -85,6 +98,39 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
void blk_mq_release(struct request_queue *q);
+/**
+ * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
+ * @rq: target request.
+ */
+static inline int blk_mq_rq_state(struct request *rq)
+{
+ return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
+}
+
+/**
+ * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
+ * @rq: target request.
+ * @state: new state to set.
+ *
+ * Set @rq's state to @state. The caller is responsible for ensuring that
+ * there are no other updaters. A request can transition into IN_FLIGHT
+ * only from IDLE and doing so increments the generation number.
+ */
+static inline void blk_mq_rq_update_state(struct request *rq,
+ enum mq_rq_state state)
+{
+ u64 old_val = READ_ONCE(rq->gstate);
+ u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
+
+ if (state == MQ_RQ_IN_FLIGHT) {
+ WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
+ new_val += MQ_RQ_GEN_INC;
+ }
+
+ /* avoid exposing interim values */
+ WRITE_ONCE(rq->gstate, new_val);
+}
+
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 764ecf9aeb305..6427be7ac3637 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -208,7 +208,7 @@ void blk_add_timer(struct request *req)
if (!req->timeout)
req->timeout = q->rq_timeout;
- WRITE_ONCE(req->deadline, jiffies + req->timeout);
+ req->deadline = jiffies + req->timeout;
/*
* Only the non-mq case needs to add the request to a protected list.
diff --git a/block/blk.h b/block/blk.h
index 3f1446937aece..9cb2739edb6af 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -123,12 +123,6 @@ void blk_account_io_done(struct request *req);
* Internal atomic flags for request handling
*/
enum rq_atomic_flags {
- /*
- * Keep these two bits first - not because we depend on the
- * value of them, but we do depend on them being in the same
- * byte of storage to ensure ordering on writes. Keeping them
- * first will achieve that nicely.
- */
REQ_ATOM_COMPLETE = 0,
REQ_ATOM_STARTED,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 95c9a5c862e25..460798dbac1fd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -51,6 +51,7 @@ struct blk_mq_hw_ctx {
unsigned int queue_num;
atomic_t nr_active;
+ unsigned int nr_expired;
struct hlist_node cpuhp_dead;
struct kobject kobj;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 46e606f5b44b3..ae563d01b29d7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,6 +27,8 @@
#include <linux/percpu-refcount.h>
#include <linux/scatterlist.h>
#include <linux/blkzoned.h>
+#include <linux/seqlock.h>
+#include <linux/u64_stats_sync.h>
struct module;
struct scsi_ioctl_command;
@@ -230,6 +232,27 @@ struct request {
unsigned short write_hint;
+ /*
+ * On blk-mq, the lower bits of ->gstate (generation number and
+ * state) carry the MQ_RQ_* state value and the upper bits the
+ * generation number which is monotonically incremented and used to
+ * distinguish the reuse instances.
+ *
+ * ->gstate_seq allows updates to ->gstate and other fields
+ * (currently ->deadline) during request start to be read
+ * atomically from the timeout path, so that it can operate on a
+ * coherent set of information.
+ */
+ seqcount_t gstate_seq;
+ u64 gstate;
+
+ /*
+ * ->aborted_gstate is used by the timeout to claim a specific
+ * recycle instance of this request. See blk_mq_timeout_work().
+ */
+ struct u64_stats_sync aborted_gstate_sync;
+ u64 aborted_gstate;
+
unsigned long deadline;
struct list_head timeout_list;