Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r--  block/bfq-iosched.c  478
1 file changed, 237 insertions(+), 241 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 771ae9730ac68..495b9ddb3355c 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -49,9 +49,39 @@
*
* In particular, to provide these low-latency guarantees, BFQ
* explicitly privileges the I/O of two classes of time-sensitive
- * applications: interactive and soft real-time. This feature enables
- * BFQ to provide applications in these classes with a very low
- * latency. Finally, BFQ also features additional heuristics for
+ * applications: interactive and soft real-time. In more detail, BFQ
+ * behaves this way if the low_latency parameter is set (default
+ * configuration). This feature enables BFQ to provide applications in
+ * these classes with a very low latency.
+ *
+ * To implement this feature, BFQ constantly tries to detect whether
+ * the I/O requests in a bfq_queue come from an interactive or a soft
+ * real-time application. For brevity, in these cases, the queue is
+ * said to be interactive or soft real-time. In both cases, BFQ
+ * privileges the service of the queue over that of non-interactive
+ * and non-soft-real-time queues. This privileging is performed
+ * mainly by raising the weight of the queue. So, for brevity, we
+ * simply call weight-raising periods the time periods during which
+ * a queue is privileged because it is deemed interactive or soft
+ * real-time.
+ *
+ * The detection of soft real-time queues/applications is described in
+ * detail in the comments on the function
+ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an
+ * interactive queue works as follows: a queue is deemed interactive
+ * if it is constantly non-empty only for a limited time interval,
+ * after which it finally becomes empty. The queue may be deemed
+ * interactive again (for a limited time) if it restarts being
+ * constantly non-empty, provided that this happens only after the
+ * queue has remained empty for a given minimum idle time.
+ *
+ * By default, BFQ automatically computes the above maximum time
+ * interval, i.e., the time interval after which a constantly
+ * non-empty queue stops being deemed interactive. Since a queue is
+ * weight-raised while it is deemed interactive, this maximum time
+ * interval happens to coincide with the (maximum) duration of the
+ * weight-raising for interactive queues.
+ *
+ * Finally, BFQ also features additional heuristics for
* preserving both a low latency and a high throughput on NCQ-capable,
* rotational or flash-based devices, and to get the job done quickly
* for applications consisting in many I/O-bound processes.
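
As a concrete illustration of the detection policy described in the comment above, here is a stand-alone C sketch (all names are hypothetical, not part of this patch): a queue is treated as interactive only while its current busy period is shorter than the weight-raising window, and only if that busy period was preceded by a long enough idle period.

#include <stdbool.h>

/* Illustrative sketch only; timestamps are abstract time units. */
struct toy_queue {
	unsigned long idle_since;	/* when the queue last became empty */
	unsigned long busy_since;	/* when it became non-empty again */
	bool busy;
};

static bool toy_deemed_interactive(const struct toy_queue *q,
				   unsigned long now,
				   unsigned long max_wr_duration,
				   unsigned long min_idle_time)
{
	if (!q->busy)
		return false;
	/* must have been preceded by a long enough idle period */
	if (q->busy_since - q->idle_since < min_idle_time)
		return false;
	/* stays interactive only for a limited busy interval */
	return now - q->busy_since <= max_wr_duration;
}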
@@ -61,14 +91,14 @@
* all low-latency heuristics for that device, by setting low_latency
* to 0.
*
- * BFQ is described in [1], where also a reference to the initial, more
- * theoretical paper on BFQ can be found. The interested reader can find
- * in the latter paper full details on the main algorithm, as well as
- * formulas of the guarantees and formal proofs of all the properties.
- * With respect to the version of BFQ presented in these papers, this
- * implementation adds a few more heuristics, such as the one that
- * guarantees a low latency to soft real-time applications, and a
- * hierarchical extension based on H-WF2Q+.
+ * BFQ is described in [1], where also a reference to the initial,
+ * more theoretical paper on BFQ can be found. The interested reader
+ * can find in the latter paper full details on the main algorithm, as
+ * well as formulas of the guarantees and formal proofs of all the
+ * properties. With respect to the version of BFQ presented in these
+ * papers, this implementation adds a few more heuristics, such as the
+ * ones that guarantee a low latency to interactive and soft real-time
+ * applications, and a hierarchical extension based on H-WF2Q+.
*
* B-WF2Q+ is based on WF2Q+, which is described in [2], together with
* H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
@@ -218,56 +248,46 @@ static struct kmem_cache *bfq_pool;
#define BFQ_RATE_SHIFT 16
/*
- * By default, BFQ computes the duration of the weight raising for
- * interactive applications automatically, using the following formula:
- * duration = (R / r) * T, where r is the peak rate of the device, and
- * R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see
- * below), and T is a reference time: given the systems that are
- * likely to be installed on the reference device according to its
- * speed class, T is about the maximum time needed, under BFQ and
- * while reading two files in parallel, to load typical large
- * applications on these systems (see the comments on
- * max_service_from_wr below, for more details on how T is obtained).
- * In practice, the slower/faster the device at hand is, the more/less
- * it takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to
- * interactive applications.
- *
- * BFQ uses four different reference pairs (R, T), depending on:
- * . whether the device is rotational or non-rotational;
- * . whether the device is slow, such as old or portable HDDs, as well as
- * SD cards, or fast, such as newer HDDs and SSDs.
+ * When configured for computing the duration of the weight-raising
+ * for interactive queues automatically (see the comments at the
+ * beginning of this file), BFQ does it using the following formula:
+ * duration = (ref_rate / r) * ref_wr_duration,
+ * where r is the peak rate of the device, and ref_rate and
+ * ref_wr_duration are two reference parameters. In particular,
+ * ref_rate is the peak rate of the reference storage device (see
+ * below), and ref_wr_duration is about the maximum time needed, with
+ * BFQ and while reading two files in parallel, to load typical large
+ * applications on the reference device (see the comments on
+ * max_service_from_wr below, for more details on how ref_wr_duration
+ * is obtained). In practice, the slower/faster the device at hand
+ * is, the more/less it takes to load applications with respect to the
+ * reference device. Accordingly, the longer/shorter BFQ grants
+ * weight raising to interactive applications.
*
- * The device's speed class is dynamically (re)detected in
- * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration),
+ * depending on whether the device is rotational or non-rotational.
*
- * In the following definitions, R_slow[0]/R_fast[0] and
- * T_slow[0]/T_fast[0] are the reference values for a slow/fast
- * rotational device, whereas R_slow[1]/R_fast[1] and
- * T_slow[1]/T_fast[1] are the reference values for a slow/fast
- * non-rotational device. Finally, device_speed_thresh are the
- * thresholds used to switch between speed classes. The reference
- * rates are not the actual peak rates of the devices used as a
- * reference, but slightly lower values. The reason for using these
- * slightly lower values is that the peak-rate estimator tends to
- * yield slightly lower values than the actual peak rate (it can yield
- * the actual peak rate only if there is only one process doing I/O,
- * and the process does sequential I/O).
+ * In the following definitions, ref_rate[0] and ref_wr_duration[0]
+ * are the reference values for a rotational device, whereas
+ * ref_rate[1] and ref_wr_duration[1] are the reference values for a
+ * non-rotational device. The reference rates are not the actual peak
+ * rates of the devices used as a reference, but slightly lower
+ * values. The reason for using slightly lower values is that the
+ * peak-rate estimator tends to yield slightly lower values than the
+ * actual peak rate (it can yield the actual peak rate only if there
+ * is only one process doing I/O, and the process does sequential
+ * I/O).
*
- * Both the reference peak rates and the thresholds are measured in
- * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ * The reference peak rates are measured in sectors/usec, left-shifted
+ * by BFQ_RATE_SHIFT.
*/
-static int R_slow[2] = {1000, 10700};
-static int R_fast[2] = {14000, 33000};
+static int ref_rate[2] = {14000, 33000};
/*
- * To improve readability, a conversion function is used to initialize the
- * following arrays, which entails that they can be initialized only in a
- * function.
+ * To improve readability, a conversion function is used to initialize
+ * the following array, which entails that the array can be
+ * initialized only in a function.
*/
-static int T_slow[2];
-static int T_fast[2];
-static int device_speed_thresh[2];
+static int ref_wr_duration[2];
/*
* BFQ uses the above-detailed, time-based weight-raising mechanism to
@@ -487,46 +507,6 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
}
/*
- * See the comments on bfq_limit_depth for the purpose of
- * the depths set in the function.
- */
-static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
-{
- bfqd->sb_shift = bt->sb.shift;
-
- /*
- * In-word depths if no bfq_queue is being weight-raised:
- * leaving 25% of tags only for sync reads.
- *
- * In next formulas, right-shift the value
- * (1U<<bfqd->sb_shift), instead of computing directly
- * (1U<<(bfqd->sb_shift - something)), to be robust against
- * any possible value of bfqd->sb_shift, without having to
- * limit 'something'.
- */
- /* no more than 50% of tags for async I/O */
- bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
- /*
- * no more than 75% of tags for sync writes (25% extra tags
- * w.r.t. async I/O, to prevent async I/O from starving sync
- * writes)
- */
- bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
-
- /*
- * In-word depths in case some bfq_queue is being weight-
- * raised: leaving ~63% of tags for sync reads. This is the
- * highest percentage for which, in our tests, application
- * start-up times didn't suffer from any regression due to tag
- * shortage.
- */
- /* no more than ~18% of tags for async I/O */
- bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
- /* no more than ~37% of tags for sync writes (~20% extra tags) */
- bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
-}
-
-/*
* Async I/O can easily starve sync I/O (both sync reads and sync
* writes), by consuming all tags. Similarly, storms of sync writes,
* such as those that sync(2) may trigger, can starve sync reads.
@@ -535,25 +515,11 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
*/
static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
- struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct bfq_data *bfqd = data->q->elevator->elevator_data;
- struct sbitmap_queue *bt;
if (op_is_sync(op) && !op_is_write(op))
return;
- if (data->flags & BLK_MQ_REQ_RESERVED) {
- if (unlikely(!tags->nr_reserved_tags)) {
- WARN_ON_ONCE(1);
- return;
- }
- bt = &tags->breserved_tags;
- } else
- bt = &tags->bitmap_tags;
-
- if (unlikely(bfqd->sb_shift != bt->sb.shift))
- bfq_update_depths(bfqd, bt);
-
data->shallow_depth =
bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
@@ -906,26 +872,30 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
if (bfqd->bfq_wr_max_time > 0)
return bfqd->bfq_wr_max_time;
- dur = bfqd->RT_prod;
+ dur = bfqd->rate_dur_prod;
do_div(dur, bfqd->peak_rate);
/*
- * Limit duration between 3 and 13 seconds. Tests show that
- * higher values than 13 seconds often yield the opposite of
- * the desired result, i.e., worsen responsiveness by letting
- * non-interactive and non-soft-real-time applications
- * preserve weight raising for a too long time interval.
+ * Limit duration between 3 and 25 seconds. The upper limit
+ * has been conservatively set after the following worst case:
+ * on a QEMU/KVM virtual machine
+ * - running in a slow PC
+ * - with a virtual disk stacked on a slow low-end 5400rpm HDD
+ * - serving a heavy I/O workload, such as the sequential reading
+ * of several files
+ * mplayer took 23 seconds to start, if constantly weight-raised.
+ *
+	 * As for values higher than that needed to accommodate the above
+	 * bad scenario, tests show that such values would often yield the
+	 * opposite of the desired result, i.e., would worsen responsiveness
+	 * by allowing non-interactive applications to preserve weight
+	 * raising for too long.
*
* On the other end, lower values than 3 seconds make it
* difficult for most interactive tasks to complete their jobs
* before weight-raising finishes.
*/
- if (dur > msecs_to_jiffies(13000))
- dur = msecs_to_jiffies(13000);
- else if (dur < msecs_to_jiffies(3000))
- dur = msecs_to_jiffies(3000);
-
- return dur;
+ return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000));
}
/* switch back from soft real-time to interactive weight raising */
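
To make the formula and the new clamp concrete, here is a minimal user-space sketch (hypothetical names; it deliberately ignores BFQ_RATE_SHIFT and the jiffies conversion, and works in plain milliseconds):

/*
 * Weight-raising duration scales inversely with the device peak rate
 * and is clamped to the [3 s, 25 s] range, as in the hunk above.
 */
static unsigned long toy_wr_duration_ms(unsigned long long rate_dur_prod,
					unsigned long long peak_rate)
{
	unsigned long long dur = rate_dur_prod / peak_rate;

	if (dur > 25000)
		dur = 25000;
	else if (dur < 3000)
		dur = 3000;
	return (unsigned long)dur;
}

For example, with ref_rate = 33000 and ref_wr_duration = 2500 ms (so rate_dur_prod = 82500000), a device whose measured peak rate is half the reference (16500) would be granted 5000 ms of weight raising.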
@@ -1393,15 +1363,6 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
}
/*
- * Return the farthest future time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_greatest_from_now(void)
-{
- return jiffies + MAX_JIFFY_OFFSET;
-}
-
-/*
* Return the farthest past time instant according to jiffies
* macros.
*/
@@ -1545,7 +1506,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
in_burst = bfq_bfqq_in_large_burst(bfqq);
soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
!in_burst &&
- time_is_before_jiffies(bfqq->soft_rt_next_start);
+ time_is_before_jiffies(bfqq->soft_rt_next_start) &&
+ bfqq->dispatched == 0;
*interactive = !in_burst && idle_for_long_time;
wr_or_deserves_wr = bfqd->low_latency &&
(bfqq->wr_coeff > 1 ||
@@ -1858,6 +1820,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req,
return ELEVATOR_NO_MERGE;
}
+static struct bfq_queue *bfq_init_rq(struct request *rq);
+
static void bfq_request_merged(struct request_queue *q, struct request *req,
enum elv_merge type)
{
@@ -1866,7 +1830,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
blk_rq_pos(req) <
blk_rq_pos(container_of(rb_prev(&req->rb_node),
struct request, rb_node))) {
- struct bfq_queue *bfqq = RQ_BFQQ(req);
+ struct bfq_queue *bfqq = bfq_init_rq(req);
struct bfq_data *bfqd = bfqq->bfqd;
struct request *prev, *next_rq;
@@ -1891,14 +1855,25 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
}
}
+/*
+ * This function is called to notify the scheduler that the requests
+ * rq and 'next' have been merged, with 'next' going away. BFQ
+ * exploits this hook to address the following issue: if 'next' has a
+ * fifo_time lower that rq, then the fifo_time of rq must be set to
+ * the value of 'next', to not forget the greater age of 'next'.
+ *
+ * NOTE: in this function we assume that rq is in a bfq_queue, based
+ * on the fact that rq is picked from the hash table
+ * q->elevator->hash, which, in turn, is filled only with I/O
+ * requests present in bfq_queues, while BFQ is in use for the
+ * request queue q. In fact, the function that fills this hash table
+ * (elv_rqhash_add) is called only by bfq_insert_request.
+ */
static void bfq_requests_merged(struct request_queue *q, struct request *rq,
struct request *next)
{
- struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
-
- if (!RB_EMPTY_NODE(&rq->rb_node))
- goto end;
- spin_lock_irq(&bfqq->bfqd->lock);
+ struct bfq_queue *bfqq = bfq_init_rq(rq),
+ *next_bfqq = bfq_init_rq(next);
/*
* If next and rq belong to the same bfq_queue and next is older
@@ -1920,11 +1895,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
if (bfqq->next_rq == next)
bfqq->next_rq = rq;
- bfq_remove_request(q, next);
- bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags);
-
- spin_unlock_irq(&bfqq->bfqd->lock);
-end:
bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
}
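
The rule spelled out in the new comment boils down to keeping the older of the two fifo_time values when requests are merged. A minimal sketch with hypothetical types (the real code additionally checks that rq and 'next' belong to the same bfq_queue and uses the kernel's wrap-safe time comparison):

struct toy_request {
	unsigned long fifo_time;
};

/* 'next' goes away; rq inherits its age if 'next' is older. */
static void toy_requests_merged(struct toy_request *rq,
				const struct toy_request *next)
{
	if (next->fifo_time < rq->fifo_time)
		rq->fifo_time = next->fifo_time;
}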
@@ -2506,37 +2476,15 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
/*
* Update parameters related to throughput and responsiveness, as a
* function of the estimated peak rate. See comments on
- * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ * bfq_calc_max_budget(), and on the ref_wr_duration array.
*/
static void update_thr_responsiveness_params(struct bfq_data *bfqd)
{
- int dev_type = blk_queue_nonrot(bfqd->queue);
-
- if (bfqd->bfq_user_max_budget == 0)
+ if (bfqd->bfq_user_max_budget == 0) {
bfqd->bfq_max_budget =
bfq_calc_max_budget(bfqd);
-
- if (bfqd->device_speed == BFQ_BFQD_FAST &&
- bfqd->peak_rate < device_speed_thresh[dev_type]) {
- bfqd->device_speed = BFQ_BFQD_SLOW;
- bfqd->RT_prod = R_slow[dev_type] *
- T_slow[dev_type];
- } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
- bfqd->peak_rate > device_speed_thresh[dev_type]) {
- bfqd->device_speed = BFQ_BFQD_FAST;
- bfqd->RT_prod = R_fast[dev_type] *
- T_fast[dev_type];
+ bfq_log(bfqd, "new max_budget = %d", bfqd->bfq_max_budget);
}
-
- bfq_log(bfqd,
-"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
- dev_type == 0 ? "ROT" : "NONROT",
- bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
- bfqd->device_speed == BFQ_BFQD_FAST ?
- (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
- (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
- (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
- BFQ_RATE_SHIFT);
}
static void bfq_reset_rate_computation(struct bfq_data *bfqd,
@@ -3266,23 +3214,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
bfq_bfqq_softrt_next_start(bfqd, bfqq);
else {
/*
- * The application is still waiting for the
- * completion of one or more requests:
- * prevent it from possibly being incorrectly
- * deemed as soft real-time by setting its
- * soft_rt_next_start to infinity. In fact,
- * without this assignment, the application
- * would be incorrectly deemed as soft
- * real-time if:
- * 1) it issued a new request before the
- * completion of all its in-flight
- * requests, and
- * 2) at that time, its soft_rt_next_start
- * happened to be in the past.
- */
- bfqq->soft_rt_next_start =
- bfq_greatest_from_now();
- /*
* Schedule an update of soft_rt_next_start to when
* the task may be discovered to be isochronous.
*/
@@ -4540,14 +4471,12 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
unsigned int cmd_flags) {}
#endif
-static void bfq_prepare_request(struct request *rq, struct bio *bio);
-
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head)
{
struct request_queue *q = hctx->queue;
struct bfq_data *bfqd = q->elevator->elevator_data;
- struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ struct bfq_queue *bfqq;
bool idle_timer_disabled = false;
unsigned int cmd_flags;
@@ -4562,24 +4491,13 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
blk_mq_sched_request_inserted(rq);
spin_lock_irq(&bfqd->lock);
+ bfqq = bfq_init_rq(rq);
if (at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
list_add(&rq->queuelist, &bfqd->dispatch);
else
list_add_tail(&rq->queuelist, &bfqd->dispatch);
- } else {
- if (WARN_ON_ONCE(!bfqq)) {
- /*
- * This should never happen. Most likely rq is
- * a requeued regular request, being
- * re-inserted without being first
- * re-prepared. Do a prepare, to avoid
- * failure.
- */
- bfq_prepare_request(rq, rq->bio);
- bfqq = RQ_BFQQ(rq);
- }
-
+	} else { /* bfqq is assumed to be non-null here */
idle_timer_disabled = __bfq_insert_request(bfqd, rq);
/*
* Update bfqq, because, if a queue merge has occurred
@@ -4778,8 +4696,8 @@ static void bfq_finish_requeue_request(struct request *rq)
if (rq->rq_flags & RQF_STARTED)
bfqg_stats_update_completion(bfqq_group(bfqq),
- rq_start_time_ns(rq),
- rq_io_start_time_ns(rq),
+ rq->start_time_ns,
+ rq->io_start_time_ns,
rq->cmd_flags);
if (likely(rq->rq_flags & RQF_STARTED)) {
@@ -4922,11 +4840,48 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
}
/*
- * Allocate bfq data structures associated with this request.
+ * Only reset private fields. The actual request preparation will be
+ * performed by bfq_init_rq, when rq is either inserted or merged. See
+ * comments on bfq_init_rq for the reason behind this delayed
+ * preparation.
*/
static void bfq_prepare_request(struct request *rq, struct bio *bio)
{
+ /*
+ * Regardless of whether we have an icq attached, we have to
+ * clear the scheduler pointers, as they might point to
+ * previously allocated bic/bfqq structs.
+ */
+ rq->elv.priv[0] = rq->elv.priv[1] = NULL;
+}
+
+/*
+ * If needed, init rq, allocate bfq data structures associated with
+ * rq, and increment reference counters in the destination bfq_queue
+ * for rq. Return the destination bfq_queue for rq, or NULL if rq is
+ * not associated with any bfq_queue.
+ *
+ * This function is invoked by the functions that perform rq insertion
+ * or merging. One may have expected the above preparation operations
+ * to be performed in bfq_prepare_request, and not delayed to when rq
+ * is inserted or merged. The rationale behind this delayed
+ * preparation is that, after the prepare_request hook is invoked for
+ * rq, rq may still be transformed into a request with no icq, i.e., a
+ * request not associated with any queue. No bfq hook is invoked to
+ * signal this transformation. As a consequence, should these
+ * preparation operations be performed when the prepare_request hook
+ * is invoked, and should rq be transformed one moment later, bfq
+ * would end up in an inconsistent state, because it would have
+ * incremented some queue counters for an rq destined to be
+ * transformed, without any chance to correctly lower these counters
+ * back. In contrast, no transformation can happen any longer for rq
+ * after rq has been inserted or merged. So, it is safe to execute
+ * these preparation operations when rq is finally inserted or merged.
+ */
+static struct bfq_queue *bfq_init_rq(struct request *rq)
+{
struct request_queue *q = rq->q;
+ struct bio *bio = rq->bio;
struct bfq_data *bfqd = q->elevator->elevator_data;
struct bfq_io_cq *bic;
const int is_sync = rq_is_sync(rq);
@@ -4934,20 +4889,21 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio)
bool new_queue = false;
bool bfqq_already_existing = false, split = false;
+ if (unlikely(!rq->elv.icq))
+ return NULL;
+
/*
- * Even if we don't have an icq attached, we should still clear
- * the scheduler pointers, as they might point to previously
- * allocated bic/bfqq structs.
+	 * We assume that elv.priv[1] is set only if everything is set
+	 * for this rq. This holds true, because this function is
+	 * invoked only for insertion or merging, and, after such
+	 * events, a request cannot be manipulated any longer before
+	 * being removed from bfq.
*/
- if (!rq->elv.icq) {
- rq->elv.priv[0] = rq->elv.priv[1] = NULL;
- return;
- }
+ if (rq->elv.priv[1])
+ return rq->elv.priv[1];
bic = icq_to_bic(rq->elv.icq);
- spin_lock_irq(&bfqd->lock);
-
bfq_check_ioprio_change(bic, bio);
bfq_bic_update_cgroup(bic, bio);
@@ -5006,7 +4962,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio)
if (unlikely(bfq_bfqq_just_created(bfqq)))
bfq_handle_burst(bfqd, bfqq);
- spin_unlock_irq(&bfqd->lock);
+ return bfqq;
}
static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
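
The delayed-preparation scheme explained in the comments on bfq_init_rq is essentially a lazy-initialization pattern guarded by a sentinel field. A generic, stand-alone sketch of that pattern (hypothetical names, not BFQ code):

#include <stddef.h>

struct toy_queue_state { int refcount; };
struct toy_rq { struct toy_queue_state *priv; };

static struct toy_queue_state toy_state;	/* stand-in for a bfq_queue */

/* prepare hook: only clear private state; cheap and always safe */
static void toy_prepare(struct toy_rq *rq)
{
	rq->priv = NULL;
}

/* real preparation: runs at most once, on first insertion or merge */
static struct toy_queue_state *toy_init_on_insert_or_merge(struct toy_rq *rq)
{
	if (rq->priv)			/* already prepared, nothing to do */
		return rq->priv;
	toy_state.refcount++;		/* the counted side effect is delayed */
	rq->priv = &toy_state;
	return rq->priv;
}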
@@ -5105,6 +5061,64 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
+/*
+ * See the comments on bfq_limit_depth for the purpose of
+ * the depths set in the function. Return minimum shallow depth we'll use.
+ */
+static unsigned int bfq_update_depths(struct bfq_data *bfqd,
+ struct sbitmap_queue *bt)
+{
+ unsigned int i, j, min_shallow = UINT_MAX;
+
+ /*
+ * In-word depths if no bfq_queue is being weight-raised:
+ * leaving 25% of tags only for sync reads.
+ *
+ * In next formulas, right-shift the value
+ * (1U<<bt->sb.shift), instead of computing directly
+ * (1U<<(bt->sb.shift - something)), to be robust against
+ * any possible value of bt->sb.shift, without having to
+ * limit 'something'.
+ */
+ /* no more than 50% of tags for async I/O */
+ bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
+ /*
+ * no more than 75% of tags for sync writes (25% extra tags
+ * w.r.t. async I/O, to prevent async I/O from starving sync
+ * writes)
+ */
+ bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
+
+ /*
+ * In-word depths in case some bfq_queue is being weight-
+ * raised: leaving ~63% of tags for sync reads. This is the
+ * highest percentage for which, in our tests, application
+ * start-up times didn't suffer from any regression due to tag
+ * shortage.
+ */
+ /* no more than ~18% of tags for async I/O */
+ bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
+ /* no more than ~37% of tags for sync writes (~20% extra tags) */
+ bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < 2; j++)
+ min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
+
+ return min_shallow;
+}
+
+static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
+{
+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ struct blk_mq_tags *tags = hctx->sched_tags;
+ unsigned int min_shallow;
+
+ min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+ return 0;
+}
+
static void bfq_exit_queue(struct elevator_queue *e)
{
struct bfq_data *bfqd = e->elevator_data;
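
To see what the formulas in bfq_update_depths yield in practice, here is a stand-alone worked example (not kernel code) for a hypothetical sbitmap word of 1 << 6 = 64 tags; it prints 32 48 12 24, so the minimum shallow depth would be 12:

#include <stdio.h>

int main(void)
{
	unsigned int shift = 6, word = 1U << shift;
	unsigned int d00 = word >> 1;		/* <= 50% for async I/O    */
	unsigned int d01 = (word * 3) >> 2;	/* <= 75% for sync writes  */
	unsigned int d10 = (word * 3) >> 4;	/* ~18% when weight-raised */
	unsigned int d11 = (word * 6) >> 4;	/* ~37% when weight-raised */

	printf("%u %u %u %u\n", d00, d01, d10, d11);
	return 0;
}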
@@ -5242,14 +5256,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->wr_busy_queues = 0;
/*
- * Begin by assuming, optimistically, that the device is a
- * high-speed one, and that its peak rate is equal to 2/3 of
- * the highest reference rate.
+ * Begin by assuming, optimistically, that the device peak
+ * rate is equal to 2/3 of the highest reference rate.
*/
- bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
- T_fast[blk_queue_nonrot(bfqd->queue)];
- bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
- bfqd->device_speed = BFQ_BFQD_FAST;
+ bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] *
+ ref_wr_duration[blk_queue_nonrot(bfqd->queue)];
+ bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
spin_lock_init(&bfqd->lock);
@@ -5526,6 +5538,7 @@ static struct elevator_type iosched_bfq_mq = {
.requests_merged = bfq_requests_merged,
.request_merged = bfq_request_merged,
.has_work = bfq_has_work,
+ .init_hctx = bfq_init_hctx,
.init_sched = bfq_init_queue,
.exit_sched = bfq_exit_queue,
},
@@ -5556,8 +5569,8 @@ static int __init bfq_init(void)
/*
* Times to load large popular applications for the typical
* systems installed on the reference devices (see the
- * comments before the definitions of the next two
- * arrays). Actually, we use slightly slower values, as the
+ * comments before the definition of the next
+ * array). Actually, we use slightly lower values, as the
* estimated peak rate tends to be smaller than the actual
* peak rate. The reason for this last fact is that estimates
* are computed over much shorter time intervals than the long
@@ -5566,25 +5579,8 @@ static int __init bfq_init(void)
* scheduler cannot rely on a peak-rate-evaluation workload to
* be run for a long time.
*/
- T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
- T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
- T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
- T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
-
- /*
- * Thresholds that determine the switch between speed classes
- * (see the comments before the definition of the array
- * device_speed_thresh). These thresholds are biased towards
- * transitions to the fast class. This is safer than the
- * opposite bias. In fact, a wrong transition to the slow
- * class results in short weight-raising periods, because the
- * speed of the device then tends to be higher that the
- * reference peak rate. On the opposite end, a wrong
- * transition to the fast class tends to increase
- * weight-raising periods, because of the opposite reason.
- */
- device_speed_thresh[0] = (4 * R_slow[0]) / 3;
- device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+ ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+ ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */
ret = elv_register(&iosched_bfq_mq);
if (ret)