From 05e3db95ebfc5c06a29a1d8c7a3e02f46f3a25a7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:04 -0700 Subject: kthread: add a mechanism to store cgroup info kthread usually runs jobs on behalf of other threads. The jobs should be charged to cgroup of original threads. But the jobs run in a kthread, where we lose the cgroup context of original threads. The patch adds a machanism to record cgroup info of original threads in kthread context. Later we can retrieve the cgroup info and attach the cgroup info to jobs. Since this mechanism is only required by kthread, we store the cgroup info in kthread data instead of generic task_struct. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/kthread.h | 11 +++++++++ kernel/kthread.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 82e197eeac91f..bd4369c83dfba 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -3,6 +3,7 @@ /* Simple interface for creating and stopping kernel threads without mess. */ #include #include +#include __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), @@ -198,4 +199,14 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +#ifdef CONFIG_CGROUPS +void kthread_associate_blkcg(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *kthread_blkcg(void); +#else +static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } +static inline struct cgroup_subsys_state *kthread_blkcg(void) +{ + return NULL; +} +#endif #endif /* _LINUX_KTHREAD_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c19edf824272..b011ea08967fc 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,7 +20,6 @@ #include #include #include -#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -47,6 +46,9 @@ struct kthread { void *data; struct completion parked; struct completion exited; +#ifdef CONFIG_CGROUPS + struct cgroup_subsys_state *blkcg_css; +#endif }; enum KTHREAD_BITS { @@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k) void free_kthread_struct(struct task_struct *k) { + struct kthread *kthread; + /* * Can be NULL if this kthread was created by kernel_thread() * or if kmalloc() in kthread() failed. */ - kfree(to_kthread(k)); + kthread = to_kthread(k); +#ifdef CONFIG_CGROUPS + WARN_ON_ONCE(kthread && kthread->blkcg_css); +#endif + kfree(kthread); } /** @@ -216,6 +224,9 @@ static int kthread(void *_create) self->data = data; init_completion(&self->exited); init_completion(&self->parked); +#ifdef CONFIG_CGROUPS + self->blkcg_css = NULL; +#endif current->vfork_done = &self->exited; /* OK, tell user we're spawned, wait for stop or wakeup */ @@ -1154,3 +1165,54 @@ void kthread_destroy_worker(struct kthread_worker *worker) kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); + +#ifdef CONFIG_CGROUPS +/** + * kthread_associate_blkcg - associate blkcg to current kthread + * @css: the cgroup info + * + * Current thread must be a kthread. The thread is running jobs on behalf of + * other threads. In some cases, we expect the jobs attach cgroup info of + * original threads instead of that of current thread. This function stores + * original thread's cgroup info in current kthread context for later + * retrieval. 
+ */ +void kthread_associate_blkcg(struct cgroup_subsys_state *css) +{ + struct kthread *kthread; + + if (!(current->flags & PF_KTHREAD)) + return; + kthread = to_kthread(current); + if (!kthread) + return; + + if (kthread->blkcg_css) { + css_put(kthread->blkcg_css); + kthread->blkcg_css = NULL; + } + if (css) { + css_get(css); + kthread->blkcg_css = css; + } +} +EXPORT_SYMBOL(kthread_associate_blkcg); + +/** + * kthread_blkcg - get associated blkcg css of current kthread + * + * Current thread must be a kthread. + */ +struct cgroup_subsys_state *kthread_blkcg(void) +{ + struct kthread *kthread; + + if (current->flags & PF_KTHREAD) { + kthread = to_kthread(current); + if (kthread) + return kthread->blkcg_css; + } + return NULL; +} +EXPORT_SYMBOL(kthread_blkcg); +#endif -- cgit v1.2.3 From af551fb3be26a22b7a6b345b3b7e7e6acfc41758 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:05 -0700 Subject: blkcg: delete unused APIs Nobody uses the APIs right now. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/bio.c | 31 ------------------------------- include/linux/bio.h | 2 -- include/linux/blk-cgroup.h | 12 ------------ 3 files changed, 45 deletions(-) diff --git a/block/bio.c b/block/bio.c index b38e962fa83e7..8338304ea2563 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2032,37 +2032,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) } EXPORT_SYMBOL_GPL(bio_associate_blkcg); -/** - * bio_associate_current - associate a bio with %current - * @bio: target bio - * - * Associate @bio with %current if it hasn't been associated yet. Block - * layer will treat @bio as if it were issued by %current no matter which - * task actually issues it. - * - * This function takes an extra reference of @task's io_context and blkcg - * which will be put when @bio is released. The caller must own @bio, - * ensure %current->io_context exists, and is responsible for synchronizing - * calls to this function. 
- */ -int bio_associate_current(struct bio *bio) -{ - struct io_context *ioc; - - if (bio->bi_css) - return -EBUSY; - - ioc = current->io_context; - if (!ioc) - return -ENOENT; - - get_io_context_active(ioc); - bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, io_cgrp_id); - return 0; -} -EXPORT_SYMBOL_GPL(bio_associate_current); - /** * bio_disassociate_task - undo bio_associate_current() * @bio: target bio diff --git a/include/linux/bio.h b/include/linux/bio.h index 275c91c995163..9c75f58f6a501 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -522,13 +522,11 @@ do { \ #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); -int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } -static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } static inline void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9d92153dd856f..0cfa8d2ef4d63 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -235,12 +235,6 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return task_blkcg(current); } -static inline struct cgroup_subsys_state * -task_get_blkcg_css(struct task_struct *task) -{ - return task_get_css(task, io_cgrp_id); -} - /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -735,12 +729,6 @@ struct blkcg_policy { #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) -static inline struct cgroup_subsys_state * -task_get_blkcg_css(struct task_struct *task) -{ - return NULL; -} - #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -- cgit v1.2.3 From 902ec5b6de0678ac2effba0faaafdd1c3e7fcf2e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:06 -0700 Subject: block: make blkcg aware of kthread stored original cgroup info bio_blkcg is the only API to get cgroup info for a bio right now. If bio_blkcg finds current task is a kthread and has original blkcg associated, it will use the css instead of associating the bio to current task. This makes it possible that kthread dispatches bios on behalf of other threads. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 0cfa8d2ef4d63..f57e54d645297 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -19,6 +19,7 @@ #include #include #include +#include /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) @@ -223,16 +224,16 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) return css ? 
container_of(css, struct blkcg, css) : NULL; } -static inline struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return css_to_blkcg(task_css(tsk, io_cgrp_id)); -} - static inline struct blkcg *bio_blkcg(struct bio *bio) { + struct cgroup_subsys_state *css; + if (bio && bio->bi_css) return css_to_blkcg(bio->bi_css); - return task_blkcg(current); + css = kthread_blkcg(); + if (css) + return css_to_blkcg(css); + return css_to_blkcg(task_css(current, io_cgrp_id)); } /** -- cgit v1.2.3 From d4478e92d6186ce37947a36994de407c27446266 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 25 Sep 2017 13:07:22 -0600 Subject: block/loop: make loop cgroup aware loop block device handles IO in a separate thread. The actual IO dispatched isn't cloned from the IO loop device received, so the dispatched IO loses the cgroup context. I'm ignoring buffer IO case now, which is quite complicated. Making the loop thread aware cgroup context doesn't really help. The loop device only writes to a single file. In current writeback cgroup implementation, the file can only belong to one cgroup. For direct IO case, we could workaround the issue in theory. For example, say we assign cgroup1 5M/s BW for loop device and cgroup2 10M/s. We can create a special cgroup for loop thread and assign at least 15M/s for the underlayer disk. In this way, we correctly throttle the two cgroups. But this is tricky to setup. This patch tries to address the issue. We record bio's css in loop command. When loop thread is handling the command, we then use the API provided in patch 1 to set the css for current task. The bio layer will use the css for new IO (from patch 3). Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 13 +++++++++++++ drivers/block/loop.h | 1 + 2 files changed, 14 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 85de673346955..fd4eff5f5b76a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -476,6 +476,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); + if (cmd->css) + css_put(cmd->css); cmd->ret = ret; lo_rw_aio_do_completion(cmd); } @@ -535,6 +537,8 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_filp = file; cmd->iocb.ki_complete = lo_rw_aio_complete; cmd->iocb.ki_flags = IOCB_DIRECT; + if (cmd->css) + kthread_associate_blkcg(cmd->css); if (rw == WRITE) ret = call_write_iter(file, &cmd->iocb, &iter); @@ -542,6 +546,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, ret = call_read_iter(file, &cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); + kthread_associate_blkcg(NULL); if (ret != -EIOCBQUEUED) cmd->iocb.ki_complete(&cmd->iocb, ret, 0); @@ -1686,6 +1691,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, break; } + /* always use the first bio's css */ +#ifdef CONFIG_CGROUPS + if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) { + cmd->css = cmd->rq->bio->bi_css; + css_get(cmd->css); + } else +#endif + cmd->css = NULL; kthread_queue_work(&lo->worker, &cmd->work); return BLK_STS_OK; diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 1f39567029938..0f45416e4fcfe 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -72,6 +72,7 @@ struct loop_cmd { long ret; struct kiocb iocb; struct bio_vec *bvec; + struct cgroup_subsys_state *css; }; /* Support for loadable transfer modules */ -- cgit v1.2.3 From 
9979d545c93653770984806e9bfcec1df53a9d3d Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Mon, 25 Sep 2017 19:10:07 +0200 Subject: block: cryptoloop - Fix build warning This patch fix the following build warning: drivers/block/cryptoloop.c:46:8: warning: variable 'cipher' set but not used [-Wunused-but-set-variable] Signed-off-by: Corentin Labbe Signed-off-by: Jens Axboe --- drivers/block/cryptoloop.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c index 74e03aa537ad7..7033a4beda669 100644 --- a/drivers/block/cryptoloop.c +++ b/drivers/block/cryptoloop.c @@ -43,7 +43,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info) int cipher_len; int mode_len; char cms[LO_NAME_SIZE]; /* cipher-mode string */ - char *cipher; char *mode; char *cmsp = cms; /* c-m string pointer */ struct crypto_skcipher *tfm; @@ -56,7 +55,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info) strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE); cms[LO_NAME_SIZE - 1] = 0; - cipher = cmsp; cipher_len = strcspn(cmsp, "-"); mode = cmsp + cipher_len; -- cgit v1.2.3 From 0b508bc926bdced678febee2a2b8cdba0a19e481 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Sep 2017 11:02:12 -0700 Subject: block: fix a build error The code is only for blkcg not for all cgroups Fixes: d4478e92d618 ("block/loop: make loop cgroup aware") Reported-by: kbuild test robot Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- include/linux/kthread.h | 2 +- kernel/kthread.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fd4eff5f5b76a..bc8e61506968a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1692,7 +1692,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, } /* always use the first bio's css */ -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) { cmd->css = cmd->rq->bio->bi_css; css_get(cmd->css); diff --git a/include/linux/kthread.h b/include/linux/kthread.h index bd4369c83dfba..fb201842c635d 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -199,7 +199,7 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); #else diff --git a/kernel/kthread.c b/kernel/kthread.c index b011ea08967fc..f87cd8b4eb2a1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -46,7 +46,7 @@ struct kthread { void *data; struct completion parked; struct completion exited; -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif }; @@ -83,7 +83,7 @@ void free_kthread_struct(struct task_struct *k) * or if kmalloc() in kthread() failed. 
*/ kthread = to_kthread(k); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread && kthread->blkcg_css); #endif kfree(kthread); @@ -224,7 +224,7 @@ static int kthread(void *_create) self->data = data; init_completion(&self->exited); init_completion(&self->parked); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP self->blkcg_css = NULL; #endif current->vfork_done = &self->exited; @@ -1166,7 +1166,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info -- cgit v1.2.3 From b3cffc3877a6c7edf17fd9c476f3e8d930bccec1 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 30 Sep 2017 09:49:21 +0800 Subject: null_blk: add "no_sched" module parameter add an option that disable io scheduler for null block device. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 8042c26ea9e6e..bf2c8ca3242aa 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -154,6 +154,10 @@ enum { NULL_Q_MQ = 2, }; +static int g_no_sched; +module_param_named(no_sched, g_no_sched, int, S_IRUGO); +MODULE_PARM_DESC(no_sched, "No io scheduler"); + static int g_submit_queues = 1; module_param_named(submit_queues, g_submit_queues, int, S_IRUGO); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); @@ -1754,6 +1758,8 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) set->numa_node = nullb ? nullb->dev->home_node : g_home_node; set->cmd_size = sizeof(struct nullb_cmd); set->flags = BLK_MQ_F_SHOULD_MERGE; + if (g_no_sched) + set->flags |= BLK_MQ_F_NO_SCHED; set->driver_data = NULL; if ((nullb && nullb->dev->blocking) || g_blocking) -- cgit v1.2.3 From 547248736ae54b73e713ee259a81ab3a637cb2f7 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 30 Sep 2017 15:01:48 +0800 Subject: blk-mq: remove unused function hctx_allow_merges since 9bddeb2a5b981 "blk-mq: make per-sw-queue bio merge as default .bio_merge" there is no caller for this function. Reviewed-by: Ming Lei Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 98a18609755e9..59687ed6561b9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1504,12 +1504,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, true); } -static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) -{ - return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !blk_queue_nomerges(hctx->queue); -} - static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) -- cgit v1.2.3 From 5385fa47d883dc7d51de622835427bc2558ed43c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 1 Oct 2017 01:26:21 -0600 Subject: blk-mq-tag: kill unused tag enums We don't have any notion of a tagging cache anymore, and haven't for a long time. Kill off the unused enums. 
Signed-off-by: Jens Axboe --- block/blk-mq-tag.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 5cb51e53cc035..5932a7ac7fc4e 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -43,14 +43,9 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, return sbq_wait_ptr(bt, &hctx->wait_index); } -enum { - BLK_MQ_TAG_CACHE_MIN = 1, - BLK_MQ_TAG_CACHE_MAX = 64, -}; - enum { BLK_MQ_TAG_FAIL = -1U, - BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, + BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, }; -- cgit v1.2.3 From 7beb2f845b715cb98584cf630e9a9d5b05501166 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 02:08:24 -0600 Subject: blk-mq: wire up completion notifier for laptop mode For some reason, the laptop mode IO completion notified was never wired up for blk-mq. Ensure that we trigger the callback appropriately, to arm the laptop mode flush timer. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 59687ed6561b9..7f01d69879d66 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -476,6 +476,9 @@ void blk_mq_free_request(struct request *rq) if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) + laptop_io_completion(q->backing_dev_info); + wbt_done(q->rq_wb, &rq->issue_stat); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); -- cgit v1.2.3 From 640ab98fb3629c0f8417b9b2532eca596495f3bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 05:40:16 -0600 Subject: buffer: have alloc_page_buffers() use __GFP_NOFAIL Instead of adding weird retry logic in that function, utilize __GFP_NOFAIL to ensure that the vm takes care of handling any potential retries appropriately. This means we don't have to call free_more_memory() from here. Reviewed-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/md/bitmap.c | 2 +- fs/buffer.c | 33 ++++++++++----------------------- fs/ntfs/aops.c | 2 +- fs/ntfs/mft.c | 2 +- include/linux/buffer_head.h | 2 +- 5 files changed, 14 insertions(+), 27 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index d2121637b4abc..4d8ed74efadf8 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -368,7 +368,7 @@ static int read_page(struct file *file, unsigned long index, pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); - bh = alloc_page_buffers(page, 1<i_blkbits, 0); + bh = alloc_page_buffers(page, 1<i_blkbits, false); if (!bh) { ret = -ENOMEM; goto out; diff --git a/fs/buffer.c b/fs/buffer.c index 170df856bdb99..1234ae343aef0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -861,16 +861,19 @@ int remove_inode_buffers(struct inode *inode) * which may not fail from ordinary buffer allocations. */ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry) + bool retry) { struct buffer_head *bh, *head; + gfp_t gfp = GFP_NOFS; long offset; -try_again: + if (retry) + gfp |= __GFP_NOFAIL; + head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { - bh = alloc_buffer_head(GFP_NOFS); + bh = alloc_buffer_head(gfp); if (!bh) goto no_grow; @@ -896,23 +899,7 @@ no_grow: } while (head); } - /* - * Return failure for non-async IO requests. 
Async IO requests - * are not allowed to fail, so we have to wait until buffer heads - * become available. But we don't want tasks sleeping with - * partially complete buffers, so all were released above. - */ - if (!retry) - return NULL; - - /* We're _really_ low on memory. Now we just - * wait for old buffer heads to become free due to - * finishing IO. Since this is an async request and - * the reserve list is empty, we're sure there are - * async buffer heads in use. - */ - free_more_memory(); - goto try_again; + return NULL; } EXPORT_SYMBOL_GPL(alloc_page_buffers); @@ -1021,7 +1008,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, /* * Allocate some buffers for this page */ - bh = alloc_page_buffers(page, size, 0); + bh = alloc_page_buffers(page, size, false); if (!bh) goto failed; @@ -1575,7 +1562,7 @@ void create_empty_buffers(struct page *page, { struct buffer_head *bh, *head, *tail; - head = alloc_page_buffers(page, blocksize, 1); + head = alloc_page_buffers(page, blocksize, true); bh = head; do { bh->b_state |= b_state; @@ -2638,7 +2625,7 @@ int nobh_write_begin(struct address_space *mapping, * Be careful: the buffer linked list is a NULL terminated one, rather * than the circular one we're used to. */ - head = alloc_page_buffers(page, blocksize, 0); + head = alloc_page_buffers(page, blocksize, false); if (!head) { ret = -ENOMEM; goto out_release; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index cc91856b5e2d9..3a2e509c77c57 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1739,7 +1739,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { spin_lock(&mapping->private_lock); if (unlikely(!page_has_buffers(page))) { spin_unlock(&mapping->private_lock); - bh = head = alloc_page_buffers(page, bh_size, 1); + bh = head = alloc_page_buffers(page, bh_size, true); spin_lock(&mapping->private_lock); if (likely(!page_has_buffers(page))) { struct buffer_head *tail; diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index b6f402194f02c..ee8392aee9f65 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -507,7 +507,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, if (unlikely(!page_has_buffers(page))) { struct buffer_head *tail; - bh = head = alloc_page_buffers(page, blocksize, 1); + bh = head = alloc_page_buffers(page, blocksize, true); do { set_buffer_uptodate(bh); tail = bh; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c8dae555eccf9..ae2d25f01b98e 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -156,7 +156,7 @@ void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); int try_to_free_buffers(struct page *); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry); + bool retry); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); -- cgit v1.2.3 From 94dc24c0c59a224a093f110060d01c2c620f275a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 05:45:36 -0600 Subject: buffer: grow_dev_page() should use __GFP_NOFAIL for all cases We currently use it for find_or_create_page(), which means that it cannot fail. Ensure we also pass in 'retry == true' to alloc_page_buffers(), which also ensure that it cannot fail. After this, there are no failure cases in grow_dev_page() that occur because of a failed memory allocation. 
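[Editor's note, not part of the series] Background on the flag used in this and the preceding buffer patch: __GFP_NOFAIL tells the allocator to keep retrying internally until the allocation succeeds, so a caller passing it never sees a NULL return and needs no retry loop of its own. A minimal sketch of the pattern follows; the helper name is made up for illustration only.

	#include <linux/buffer_head.h>
	#include <linux/gfp.h>

	/*
	 * Illustration only: with __GFP_NOFAIL the buffer-head allocation
	 * cannot fail, which is why the caller-side retry and
	 * free_more_memory() fallbacks can be dropped later in the series.
	 */
	static struct buffer_head *alloc_bh_nofail(void)
	{
		return alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
	}
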
Reviewed-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/buffer.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 1234ae343aef0..3b60cd8456db4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -988,8 +988,6 @@ grow_dev_page(struct block_device *bdev, sector_t block, gfp_mask |= __GFP_NOFAIL; page = find_or_create_page(inode->i_mapping, index, gfp_mask); - if (!page) - return ret; BUG_ON(!PageLocked(page)); @@ -1008,9 +1006,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, /* * Allocate some buffers for this page */ - bh = alloc_page_buffers(page, size, false); - if (!bh) - goto failed; + bh = alloc_page_buffers(page, size, true); /* * Link the page to the buffers and initialise them. Take the -- cgit v1.2.3 From bc48f001de12225b6430a243504aa60b5ae8a91a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 06:08:57 -0600 Subject: buffer: eliminate the need to call free_more_memory() in __getblk_slow() Since the previous commit removed any case where grow_buffers() would return failure due to memory allocations, we can safely remove the case where we have to call free_more_memory() in this function. Since this is also the last user of free_more_memory(), kill it off completely. Reviewed-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/buffer.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 3b60cd8456db4..bff571dc7bc3b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -252,27 +252,6 @@ out: return ret; } -/* - * Kick the writeback threads then try to free up some ZONE_NORMAL memory. - */ -static void free_more_memory(void) -{ - struct zoneref *z; - int nid; - - wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); - yield(); - - for_each_online_node(nid) { - - z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL); - if (z->zone) - try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, - GFP_NOFS, NULL); - } -} - /* * I/O completion handler for block_read_full_page() - pages * which come unlocked at the end of I/O. @@ -1086,8 +1065,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; - if (ret == 0) - free_more_memory(); } } -- cgit v1.2.3 From 9ba4b2dfafaa711b41cc2102b0e9a529f3981218 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Sep 2017 08:58:25 -0600 Subject: fs: kill 'nr_pages' argument from wakeup_flusher_threads() Everybody is passing in 0 now, let's get rid of the argument. Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 9 ++++----- fs/sync.c | 2 +- include/linux/writeback.h | 2 +- mm/vmscan.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 245c430a2e418..bb6148dc6d245 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1947,12 +1947,12 @@ void wb_workfn(struct work_struct *work) } /* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. + * Wakeup the flusher threads to start writeback of all currently dirty pages */ -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) +void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; + long nr_pages; /* * If we are expecting writeback progress we must submit plugged IO. 
@@ -1960,8 +1960,7 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) if (blk_needs_flush_plug(current)) blk_schedule_flush_plug(current); - if (!nr_pages) - nr_pages = get_nr_dirty_pages(); + nr_pages = get_nr_dirty_pages(); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { diff --git a/fs/sync.c b/fs/sync.c index a576aa2e6b090..09f96a18dd930 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -108,7 +108,7 @@ SYSCALL_DEFINE0(sync) { int nowait = 0, wait = 1; - wakeup_flusher_threads(0, WB_REASON_SYNC); + wakeup_flusher_threads(WB_REASON_SYNC); iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d5815794416c9..1f9c6db5e29a0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,7 +189,7 @@ bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); +void wakeup_flusher_threads(enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 13d711dd87766..42a7fdd52d877 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1867,7 +1867,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * also allow kswapd to start writing pages during reclaim. */ if (stat.nr_unqueued_dirty == nr_taken) { - wakeup_flusher_threads(0, WB_REASON_VMSCAN); + wakeup_flusher_threads(WB_REASON_VMSCAN); set_bit(PGDAT_DIRTY, &pgdat->flags); } -- cgit v1.2.3 From d31cd9d326f216fca9a602dddac254f668f5dcd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 13:28:18 -0600 Subject: writeback: switch wakeup_flusher_threads() to cyclic writeback We're writing back the full range of dirty pages on the devices, there's no point in making this special and not do normal range cyclic writeback. Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index bb6148dc6d245..65e6992d8719a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1971,7 +1971,7 @@ void wakeup_flusher_threads(enum wb_reason reason) list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - false, reason); + true, reason); } rcu_read_unlock(); } -- cgit v1.2.3 From 47410d88f665486bf91f02242ab5d5692b8887ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:25:03 -0600 Subject: writeback: remove 'range_cyclic' argument for wb_start_writeback() All the callers pass in 'true' for range_cyclic, so kill the argument. 
Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 +++--- include/linux/backing-dev.h | 2 +- mm/page-writeback.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 65e6992d8719a..fe555bce886c1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -934,7 +934,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, #endif /* CONFIG_CGROUP_WRITEBACK */ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - bool range_cyclic, enum wb_reason reason) + enum wb_reason reason) { struct wb_writeback_work *work; @@ -955,7 +955,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; - work->range_cyclic = range_cyclic; + work->range_cyclic = 1; work->reason = reason; work->auto_free = 1; @@ -1971,7 +1971,7 @@ void wakeup_flusher_threads(enum wb_reason reason) list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - true, reason); + reason); } rcu_read_unlock(); } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 854e1bdd0b2a4..0f63493de9e7d 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -39,7 +39,7 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) } void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - bool range_cyclic, enum wb_reason reason); + enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b9c5cbe8eba0..dede533551232 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1994,8 +1994,8 @@ void laptop_mode_timer_fn(unsigned long data) rcu_read_lock(); list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node) if (wb_has_dirty_io(wb)) - wb_start_writeback(wb, nr_pages, true, - WB_REASON_LAPTOP_TIMER); + wb_start_writeback(wb, nr_pages, + WB_REASON_LAPTOP_TIMER); rcu_read_unlock(); } -- cgit v1.2.3 From 595043e5f9ef1d8263bd9fda215cade489227491 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:26:59 -0600 Subject: writeback: provide a wakeup_flusher_threads_bdi() Similar to wakeup_flusher_threads(), except that we only wake up the flusher threads on the specified backing device. No functional changes in this patch. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 39 +++++++++++++++++++++++++++++---------- include/linux/writeback.h | 2 ++ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fe555bce886c1..9f39829459dc0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1946,6 +1946,33 @@ void wb_workfn(struct work_struct *work) current->flags &= ~PF_SWAPWRITE; } +/* + * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero, + * write back the whole world. 
+ */ +static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + long nr_pages, enum wb_reason reason) +{ + struct bdi_writeback *wb; + + if (!bdi_has_dirty_io(bdi)) + return; + + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) + wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), + reason); +} + +void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason) +{ + long nr_pages = get_nr_dirty_pages(); + + rcu_read_lock(); + __wakeup_flusher_threads_bdi(bdi, nr_pages, reason); + rcu_read_unlock(); +} + /* * Wakeup the flusher threads to start writeback of all currently dirty pages */ @@ -1963,16 +1990,8 @@ void wakeup_flusher_threads(enum wb_reason reason) nr_pages = get_nr_dirty_pages(); rcu_read_lock(); - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { - struct bdi_writeback *wb; - - if (!bdi_has_dirty_io(bdi)) - continue; - - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) - wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - reason); - } + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) + __wakeup_flusher_threads_bdi(bdi, nr_pages, reason); rcu_read_unlock(); } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1f9c6db5e29a0..9c0091678af48 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -190,6 +190,8 @@ bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); +void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ -- cgit v1.2.3 From 0ab29fd0accf6e5b6ef5dbe2e0335da6687ce60c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:28:02 -0600 Subject: writeback: pass in '0' for nr_pages writeback in laptop mode Laptop mode really wants to writeback the number of dirty pages and inodes. Instead of calculating this in the caller, just pass in 0 and let wakeup_flusher_threads() handle it. Use the new wakeup_flusher_threads_bdi() instead of rolling our own. 
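[Editor's note, not part of the series] Taken together, the writeback patches here reduce the flusher wakeup interface to two calls that each take only a wb_reason and always write back all currently dirty pages, either system-wide or for a single backing device. A sketch of the resulting call forms; the surrounding caller context (including q, a struct request_queue) is assumed for illustration.

	#include <linux/writeback.h>
	#include <linux/backing-dev.h>

	/* wake the flusher threads on every bdi, e.g. from sync(2) or vmscan */
	wakeup_flusher_threads(WB_REASON_VMSCAN);

	/* wake only one device's flushers, e.g. from the laptop-mode timer;
	 * q is the caller's struct request_queue (illustration) */
	wakeup_flusher_threads_bdi(q->backing_dev_info, WB_REASON_LAPTOP_TIMER);
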
Acked-by: Johannes Weiner Tested-by: Chris Mason Signed-off-by: Jens Axboe Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- mm/page-writeback.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dede533551232..8d1fc593bce8f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1980,23 +1980,8 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void laptop_mode_timer_fn(unsigned long data) { struct request_queue *q = (struct request_queue *)data; - int nr_pages = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); - struct bdi_writeback *wb; - /* - * We want to write everything out, not just down to the dirty - * threshold - */ - if (!bdi_has_dirty_io(q->backing_dev_info)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node) - if (wb_has_dirty_io(wb)) - wb_start_writeback(wb, nr_pages, - WB_REASON_LAPTOP_TIMER); - rcu_read_unlock(); + wakeup_flusher_threads_bdi(q->backing_dev_info, WB_REASON_LAPTOP_TIMER); } /* -- cgit v1.2.3 From 9dfb176fae57a1dea68531fd25e867037e4d9bac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:28:55 -0600 Subject: writeback: make wb_start_writeback() static We don't have any callers outside of fs-writeback.c anymore, make it private. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 ++-- include/linux/backing-dev.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9f39829459dc0..f1042061eaad7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -933,8 +933,8 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, #endif /* CONFIG_CGROUP_WRITEBACK */ -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason) +static void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason) { struct wb_writeback_work *work; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0f63493de9e7d..157e950a70dc1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -38,8 +38,6 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); } -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From e8e8a0c6c9bfc0b320671166dd795f413f636773 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:31:22 -0600 Subject: writeback: move nr_pages == 0 logic to one location Now that we have no external callers of wb_start_writeback(), we can shuffle the passing in of 'nr_pages'. Everybody passes in 0 at this point, so just kill the argument and move the dirty count retrieval to that function. 
Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f1042061eaad7..424577152eb57 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -933,8 +933,18 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, #endif /* CONFIG_CGROUP_WRITEBACK */ -static void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason) +/* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ +static unsigned long get_nr_dirty_pages(void) +{ + return global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS) + + get_nr_dirty_inodes(); +} + +static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) { struct wb_writeback_work *work; @@ -954,7 +964,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, } work->sync_mode = WB_SYNC_NONE; - work->nr_pages = nr_pages; + work->nr_pages = wb_split_bdi_pages(wb, get_nr_dirty_pages()); work->range_cyclic = 1; work->reason = reason; work->auto_free = 1; @@ -1814,17 +1824,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) return work; } -/* - * Add in the number of potentially dirty inodes, because each inode - * write can dirty pagecache in the underlying blockdev. - */ -static unsigned long get_nr_dirty_pages(void) -{ - return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + - get_nr_dirty_inodes(); -} - static long wb_check_background_flush(struct bdi_writeback *wb) { if (wb_over_bg_thresh(wb)) { @@ -1951,7 +1950,7 @@ void wb_workfn(struct work_struct *work) * write back the whole world. */ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, - long nr_pages, enum wb_reason reason) + enum wb_reason reason) { struct bdi_writeback *wb; @@ -1959,17 +1958,14 @@ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, return; list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) - wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - reason); + wb_start_writeback(wb, reason); } void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { - long nr_pages = get_nr_dirty_pages(); - rcu_read_lock(); - __wakeup_flusher_threads_bdi(bdi, nr_pages, reason); + __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } @@ -1979,7 +1975,6 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; - long nr_pages; /* * If we are expecting writeback progress we must submit plugged IO. 
@@ -1987,11 +1982,9 @@ void wakeup_flusher_threads(enum wb_reason reason) if (blk_needs_flush_plug(current)) blk_schedule_flush_plug(current); - nr_pages = get_nr_dirty_pages(); - rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - __wakeup_flusher_threads_bdi(bdi, nr_pages, reason); + __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } -- cgit v1.2.3 From aac8d41cd438f25bf3110fc6b98f1d16d7dbc169 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:31:55 -0600 Subject: writeback: only allow one inflight and pending full flush When someone calls wakeup_flusher_threads() or wakeup_flusher_threads_bdi(), they schedule writeback of all dirty pages in the system (or on that bdi). If we are tight on memory, we can get tons of these queued from kswapd/vmscan. This causes (at least) two problems: 1) We consume a ton of memory just allocating writeback work items. We've seen as much as 600 million of these writeback work items pending. That's a lot of memory to pointlessly hold hostage, while the box is under memory pressure. 2) We spend so much time processing these work items, that we introduce a softlockup in writeback processing. This is because each of the writeback work items don't end up doing any work (it's hard when you have millions of identical ones coming in to the flush machinery), so we just sit in a tight loop pulling work items and deleting/freeing them. Fix this by adding a 'start_all' bit to the writeback structure, and set that when someone attempts to flush all dirty pages. The bit is cleared when we start writeback on that work item. If the bit is already set when we attempt to queue !nr_pages writeback, then we simply ignore it. This provides us one full flush in flight, with one pending as well, and makes for more efficient handling of this type of writeback. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 25 +++++++++++++++++++++++++ include/linux/backing-dev-defs.h | 1 + 2 files changed, 26 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 424577152eb57..399619c975679 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,6 +53,7 @@ struct wb_writeback_work { unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ + unsigned int start_all:1; /* nr_pages == 0 (all) writeback */ enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -951,6 +952,20 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) if (!wb_has_dirty_io(wb)) return; + /* + * All callers of this function want to start writeback of all + * dirty pages. Places like vmscan can call this at a very + * high frequency, causing pointless allocations of tons of + * work items and keeping the flusher threads busy retrieving + * that work. Ensure that we only allow one of them pending and + * inflight at the time. It doesn't matter if we race a little + * bit on this, so use the faster separate test/set bit variants. 
+ */ + if (test_bit(WB_start_all, &wb->state)) + return; + + set_bit(WB_start_all, &wb->state); + /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback @@ -958,6 +973,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); if (!work) { + clear_bit(WB_start_all, &wb->state); trace_writeback_nowork(wb); wb_wakeup(wb); return; @@ -968,6 +984,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) work->range_cyclic = 1; work->reason = reason; work->auto_free = 1; + work->start_all = 1; wb_queue_work(wb, work); } @@ -1821,6 +1838,14 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) list_del_init(&work->list); } spin_unlock_bh(&wb->work_lock); + + /* + * Once we start processing a work item that had !nr_pages, + * clear the wb state bit for that so we can allow more. + */ + if (work && work->start_all) + clear_bit(WB_start_all, &wb->state); + return work; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 866c433e7d322..420de5c7c7f9a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -24,6 +24,7 @@ enum wb_state { WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ + WB_start_all, /* nr_pages == 0 (all) work pending */ }; enum wb_congested_state { -- cgit v1.2.3 From 4baa8bb13f41307f3eb62fe91f93a1a798ebef53 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:00 +0200 Subject: block, bfq: fix wrong init of saved start time for weight raising This commit fixes a bug that causes bfq to fail to guarantee a high responsiveness on some drives, if there is heavy random read+write I/O in the background. More precisely, such a failure allowed this bug to be found [1], but the bug may well cause other yet unreported anomalies. BFQ raises the weight of the bfq_queues associated with soft real-time applications, to privilege the I/O, and thus reduce latency, for these applications. This mechanism is named soft-real-time weight raising in BFQ. A soft real-time period may happen to be nested into an interactive weight raising period, i.e., it may happen that, when a bfq_queue switches to a soft real-time weight-raised state, the bfq_queue is already being weight-raised because deemed interactive too. In this case, BFQ saves in a special variable wr_start_at_switch_to_srt, the time instant when the interactive weight-raising period started for the bfq_queue, i.e., the time instant when BFQ started to deem the bfq_queue interactive. This value is then used to check whether the interactive weight-raising period would still be in progress when the soft real-time weight-raising period ends. If so, interactive weight raising is restored for the bfq_queue. This restore is useful, in particular, because it prevents bfq_queues from losing their interactive weight raising prematurely, as a consequence of spurious, short-lived soft real-time weight-raising periods caused by wrong detections as soft real-time. If, instead, a bfq_queue switches to soft-real-time weight raising while it *is not* already in an interactive weight-raising period, then the variable wr_start_at_switch_to_srt has no meaning during the following soft real-time weight-raising period. 
Unfortunately the handling of this case is wrong in BFQ: not only the variable is not flagged somehow as meaningless, but it is also set to the time when the switch to soft real-time weight-raising occurs. This may cause an interactive weight-raising period to be considered mistakenly as still in progress, and thus a spurious interactive weight-raising period to start for the bfq_queue, at the end of the soft-real-time weight-raising period. In particular the spurious interactive weight-raising period will be considered as still in progress, if the soft-real-time weight-raising period does not last very long. The bfq_queue will then be wrongly privileged and, if I/O bound, will unjustly steal bandwidth to truly interactive or soft real-time bfq_queues, harming responsiveness and low latency. This commit fixes this issue by just setting wr_start_at_switch_to_srt to minus infinity (farthest past time instant according to jiffies macros): when the soft-real-time weight-raising period ends, certainly no interactive weight-raising period will be considered as still in progress. [1] Background I/O Type: Random - Background I/O mix: Reads and writes - Application to start: LibreOffice Writer in http://www.phoronix.com/scan.php?page=news_item&px=Linux-4.13-IO-Laptop Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Tested-by: Mirko Montanari Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a4783da90ba88..c25955c25e033 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1202,6 +1202,24 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } +/* + * Return the farthest future time instant according to jiffies + * macros. + */ +static unsigned long bfq_greatest_from_now(void) +{ + return jiffies + MAX_JIFFY_OFFSET; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; +} + static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, struct bfq_queue *bfqq, unsigned int old_wr_coeff, @@ -1216,7 +1234,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { - bfqq->wr_start_at_switch_to_srt = jiffies; + /* + * No interactive weight raising in progress + * here: assign minus infinity to + * wr_start_at_switch_to_srt, to make sure + * that, at the end of the soft-real-time + * weight raising periods that is starting + * now, no interactive weight-raising period + * may be wrongly considered as still in + * progress (and thus actually started by + * mistake). + */ + bfqq->wr_start_at_switch_to_srt = + bfq_smallest_from_now(); bfqq->wr_coeff = bfqd->bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR; bfqq->wr_cur_max_time = @@ -2897,24 +2927,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } -/* - * Return the farthest future time instant according to jiffies - * macros. - */ -static unsigned long bfq_greatest_from_now(void) -{ - return jiffies + MAX_JIFFY_OFFSET; -} - -/* - * Return the farthest past time instant according to jiffies - * macros. 
- */ -static unsigned long bfq_smallest_from_now(void) -{ - return jiffies - MAX_JIFFY_OFFSET; -} - /** * bfq_bfqq_expire - expire a queue. * @bfqd: device owning the queue. -- cgit v1.2.3 From 3e2bdd6dff239afd8386e8758eee69ad61e5a3d6 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:01 +0200 Subject: block, bfq: check and switch back to interactive wr also on queue split As already explained in the message of commit "block, bfq: fix wrong init of saved start time for weight raising", if a soft real-time weight-raising period happens to be nested in a larger interactive weight-raising period, then BFQ restores the interactive weight raising at the end of the soft real-time weight raising. In particular, BFQ checks whether the latter has ended only on request dispatches. Unfortunately, the above scheme fails to restore interactive weight raising in the following corner case: if a bfq_queue, say Q, 1) Is merged with another bfq_queue while it is in a nested soft real-time weight-raising period. The weight-raising state of Q is then saved, and not considered any longer until a split occurs. 2) Is split from the other bfq_queue(s) at a time instant when its soft real-time weight raising is already finished. On the split, while resuming the previous, soft real-time weight-raised state of the bfq_queue Q, BFQ checks whether the current soft real-time weight-raising period is actually over. If so, BFQ switches weight raising off for Q, *without* checking whether the soft real-time period was actually nested in a non-yet-finished interactive weight-raising period. This commit addresses this issue by adding the above missing check in bfq_queue splits, and restoring interactive weight raising if needed. Signed-off-by: Paolo Valente Tested-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 87 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c25955c25e033..33b63bc4a64b6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -724,6 +724,44 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, } } +static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + /* + * Limit duration between 3 and 13 seconds. Tests show that + * higher values than 13 seconds often yield the opposite of + * the desired result, i.e., worsen responsiveness by letting + * non-interactive and non-soft-real-time applications + * preserve weight raising for a too long time interval. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. 
+ */ + if (dur > msecs_to_jiffies(13000)) + dur = msecs_to_jiffies(13000); + else if (dur < msecs_to_jiffies(3000)) + dur = msecs_to_jiffies(3000); + + return dur; +} + +/* switch back from soft real-time to interactive weight raising */ +static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, + struct bfq_data *bfqd) +{ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; +} + static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) @@ -750,10 +788,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time))) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr"); - - bfqq->wr_coeff = 1; + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + !bfq_bfqq_in_large_burst(bfqq) && + time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + switch_back_to_interactive_wr(bfqq, bfqd); + } else { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr"); + } } /* make sure weight will be updated, however we got here */ @@ -1173,35 +1217,6 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, return wr_or_deserves_wr; } -static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -{ - u64 dur; - - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - - /* - * Limit duration between 3 and 13 seconds. Tests show that - * higher values than 13 seconds often yield the opposite of - * the desired result, i.e., worsen responsiveness by letting - * non-interactive and non-soft-real-time applications - * preserve weight raising for a too long time interval. - * - * On the other end, lower values than 3 seconds make it - * difficult for most interactive tasks to complete their jobs - * before weight-raising finishes. - */ - if (dur > msecs_to_jiffies(13000)) - dur = msecs_to_jiffies(13000); - else if (dur < msecs_to_jiffies(3000)) - dur = msecs_to_jiffies(3000); - - return dur; -} - /* * Return the farthest future time instant according to jiffies * macros. @@ -3501,11 +3516,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_wr_duration(bfqd))) bfq_bfqq_end_wr(bfqq); else { - /* switch back to interactive wr */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - bfqq->last_wr_start_finish = - bfqq->wr_start_at_switch_to_srt; + switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } } -- cgit v1.2.3 From 894df937e06a56ed6f054a75a416aff84147c5a2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:02 +0200 Subject: block, bfq: let early-merged queues be weight-raised on split too A just-created bfq_queue, say Q, may happen to be merged with another bfq_queue on the very first invocation of the function __bfq_insert_request. In such a case, even if Q would clearly deserve interactive weight raising (as it has just been created), the function bfq_add_request does not make it to be invoked for Q, and thus to activate weight raising for Q. 
As a consequence, when the state of Q is saved for a possible future restore, after a split of Q from the other bfq_queue(s), such a state happens to be (unjustly) non-weight-raised. Then the bfq_queue will not enjoy any weight raising on the split, even if should still be in an interactive weight-raising period when the split occurs. This commit solves this problem as follows, for a just-created bfq_queue that is being early-merged: it stores directly, in the saved state of the bfq_queue, the weight-raising state that would have been assigned to the bfq_queue if not early-merged. Signed-off-by: Paolo Valente Tested-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 33b63bc4a64b6..115747fe43c88 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2061,10 +2061,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + if (unlikely(bfq_bfqq_just_created(bfqq) && + !bfq_bfqq_in_large_burst(bfqq))) { + /* + * bfqq being merged right after being created: bfqq + * would have deserved interactive weight raising, but + * did not make it to be set in a weight-raised state, + * because of this early merge. Store directly the + * weight-raising state that would have been assigned + * to bfqq, so that to avoid that bfqq unjustly fails + * to enjoy weight raising if split soon. + */ + bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); + bic->saved_last_wr_start_finish = jiffies; + } else { + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + } } static void @@ -4150,7 +4167,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) new_bfqq->allocated++; bfqq->allocated--; new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); /* * If the bic associated with the process * issuing this request still points to bfqq @@ -4162,6 +4178,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); + + bfq_clear_bfqq_just_created(bfqq); /* * rq is about to be enqueued into new_bfqq, * release rq reference on bfqq -- cgit v1.2.3 From 7cb04004fa371a626c1a5ebe6d977f70285759ed Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:03 +0200 Subject: block, bfq: decrease burst size when queues in burst exit If many queues belonging to the same group happen to be created shortly after each other, then the concurrent processes associated with these queues have typically a common goal, and they get it done as soon as possible if not hampered by device idling. Examples are processes spawned by git grep, or by systemd during boot. 
As for device idling, this mechanism is currently necessary for weight raising to succeed in its goal: privileging I/O. In view of these facts, BFQ does not provide the above queues with either weight raising or device idling. On the other hand, a burst of queue creations may be caused also by the start-up of a complex application. In this case, these queues need usually to be served one after the other, and as quickly as possible, to maximise responsiveness. Therefore, in this case the best strategy is to weight-raise all the queues created during the burst, i.e., the exact opposite of the strategy for the above case. To distinguish between the two cases, BFQ uses an empirical burst-size threshold, found through extensive tests and monitoring of daily usage. Only large bursts, i.e., burst with a size above this threshold, are considered as generated by a high number of parallel processes. In this respect, upstart-based boot proved to be rather hard to detect as generating a large burst of queue creations, because with upstart most of the queues created in a burst exit *before* the next queues in the same burst are created. To address this issue, I changed the burst-detection mechanism so as to not decrease the size of the current burst even if one of the queues in the burst is eliminated. Unfortunately, this missing decrease causes false positives on very fast systems: on the start-up of a complex application, such as libreoffice writer, so many queues are created, served and exited shortly after each other, that a large burst of queue creations is wrongly detected as occurring. These false positives just disappear if the size of a burst is decreased when one of the queues in the burst exits. This commit restores the missing burst-size decrease, relying of the fact that upstart is apparently unlikely to be used on systems running this and future versions of the kernel. Signed-off-by: Paolo Valente Signed-off-by: Mauro Andreolini Signed-off-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 115747fe43c88..70f9177c4f5b2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3725,16 +3725,10 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; - if (bfq_bfqq_sync(bfqq)) - /* - * The fact that this queue is being destroyed does not - * invalidate the fact that this queue may have been - * activated during the current burst. As a consequence, - * although the queue does not exist anymore, and hence - * needs to be removed from the burst list if there, - * the burst size has not to be decremented. - */ + if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); + bfqq->bfqd->burst_size--; + } kmem_cache_free(bfq_pool, bfqq); #ifdef CONFIG_BFQ_GROUP_IOSCHED -- cgit v1.2.3 From 9c9883744dda1cc38339a448dd8435140537027e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Oct 2017 10:47:00 +0200 Subject: block: move __elv_next_request to blk-core.c No need to have this helper inline in a header. Also drop the __ prefix. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++++++++++++++++++++++++-- block/blk.h | 39 --------------------------------------- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 048be4aa60244..14f7674fa0b14 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2517,6 +2517,45 @@ void blk_account_io_start(struct request *rq, bool new_io) part_stat_unlock(); } +static struct request *elv_next_request(struct request_queue *q) +{ + struct request *rq; + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); + + WARN_ON_ONCE(q->mq_ops); + + while (1) { + if (!list_empty(&q->queue_head)) { + rq = list_entry_rq(q->queue_head.next); + return rq; + } + + /* + * Flush request is running and flush request isn't queueable + * in the drive, we can hold the queue till flush request is + * finished. Even we don't do this, driver can't dispatch next + * requests and will requeue them. And this can improve + * throughput too. For example, we have request flush1, write1, + * flush 2. flush1 is dispatched, then queue is hold, write1 + * isn't inserted to queue. After flush1 is finished, flush2 + * will be dispatched. Since disk cache is already clean, + * flush2 will be finished very soon, so looks like flush2 is + * folded to flush1. + * Since the queue is hold, a flag is set to indicate the queue + * should be restarted later. Please see flush_end_io() for + * details. + */ + if (fq->flush_pending_idx != fq->flush_running_idx && + !queue_flush_queueable(q)) { + fq->flush_queue_delayed = 1; + return NULL; + } + if (unlikely(blk_queue_bypass(q)) || + !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) + return NULL; + } +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -2538,8 +2577,7 @@ struct request *blk_peek_request(struct request_queue *q) lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); - while ((rq = __elv_next_request(q)) != NULL) { - + while ((rq = elv_next_request(q)) != NULL) { rq = blk_pm_peek_request(q, rq); if (!rq) break; diff --git a/block/blk.h b/block/blk.h index fcb9775b997d2..fda5a4632aba3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -148,45 +148,6 @@ static inline void blk_clear_rq_complete(struct request *rq) void blk_insert_flush(struct request *rq); -static inline struct request *__elv_next_request(struct request_queue *q) -{ - struct request *rq; - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - - WARN_ON_ONCE(q->mq_ops); - - while (1) { - if (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - return rq; - } - - /* - * Flush request is running and flush request isn't queueable - * in the drive, we can hold the queue till flush request is - * finished. Even we don't do this, driver can't dispatch next - * requests and will requeue them. And this can improve - * throughput too. For example, we have request flush1, write1, - * flush 2. flush1 is dispatched, then queue is hold, write1 - * isn't inserted to queue. After flush1 is finished, flush2 - * will be dispatched. Since disk cache is already clean, - * flush2 will be finished very soon, so looks like flush2 is - * folded to flush1. - * Since the queue is hold, a flag is set to indicate the queue - * should be restarted later. Please see flush_end_io() for - * details. 
- */ - if (fq->flush_pending_idx != fq->flush_running_idx && - !queue_flush_queueable(q)) { - fq->flush_queue_delayed = 1; - return NULL; - } - if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) - return NULL; - } -} - static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; -- cgit v1.2.3 From 8ae4e4477d8f5cc7736816a0492f82934ca32ab7 Mon Sep 17 00:00:00 2001 From: Marc Olson Date: Wed, 6 Sep 2017 17:23:56 -0700 Subject: nvme: update timeout module parameter type The underlying blk_mq_tag_set, and request timeout parameters support an unsigned int. Extend the size of the nvme module parameters for io and admin commands to match. Signed-off-by: Marc Olson Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 ++++---- drivers/nvme/host/nvme.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bb2aad0786373..26c8913435b25 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -34,13 +34,13 @@ #define NVME_MINORS (1U << MINORBITS) -unsigned char admin_timeout = 60; -module_param(admin_timeout, byte, 0644); +unsigned int admin_timeout = 60; +module_param(admin_timeout, uint, 0644); MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); EXPORT_SYMBOL_GPL(admin_timeout); -unsigned char nvme_io_timeout = 30; -module_param_named(io_timeout, nvme_io_timeout, byte, 0644); +unsigned int nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, uint, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); EXPORT_SYMBOL_GPL(nvme_io_timeout); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index d3f3c44475157..df787f39f4c1e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -21,10 +21,10 @@ #include #include -extern unsigned char nvme_io_timeout; +extern unsigned int nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) -extern unsigned char admin_timeout; +extern unsigned int admin_timeout; #define ADMIN_TIMEOUT (admin_timeout * HZ) #define NVME_DEFAULT_KATO 5 -- cgit v1.2.3 From 786456325b9f05843ec722269a3ccd77c31b9c6d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Sep 2017 18:04:43 -0700 Subject: nvme: use menu Kconfig interface Add a menu interface for NVME host and target support so that it is presented to users more like other Kconfig symbols. This makes the Device Driver menu less cluttered (easier to read) and keeps all of these symbols grouped together. Signed-off-by: Randy Dunlap Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig index b7c78a5b1f7a7..04008e0bbe817 100644 --- a/drivers/nvme/Kconfig +++ b/drivers/nvme/Kconfig @@ -1,2 +1,6 @@ +menu "NVME Support" + source "drivers/nvme/host/Kconfig" source "drivers/nvme/target/Kconfig" + +endmenu -- cgit v1.2.3 From 3375e29c280894f8d5e65c6920630a42db069599 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 11 Sep 2017 16:14:50 -0700 Subject: nvmet: bump NVMET_NR_QUEUES to 128 Raise the max number of IO queues to 128. There are several hosts with more than 64 cpus/threads. 
Signed-off-by: James Smart Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/nvmet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 7b8e20adf7602..e342f02845c19 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -314,7 +314,7 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, u32 nvmet_get_log_page_len(struct nvme_command *cmd); #define NVMET_QUEUE_SIZE 1024 -#define NVMET_NR_QUEUES 64 +#define NVMET_NR_QUEUES 128 #define NVMET_MAX_CMD NVMET_QUEUE_SIZE #define NVMET_KAS 10 #define NVMET_DISC_KATO 120 -- cgit v1.2.3 From d1f1071f8151355ec149117238e9c5aa6404ca9f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 24 Sep 2017 16:15:55 +0300 Subject: nvme-fabrics: request transport module Help userspace to make sure transport module is loaded. Signed-off-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 555c976cc2ee7..8bca36a469245 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -841,6 +841,9 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) if (ret) goto out_free_opts; + + request_module("nvme-%s", opts->transport); + /* * Check the generic options first as we need a valid transport for * the lookup below. Then clear the generic flags so that transport -- cgit v1.2.3 From eaefd5abf6b095bfc55eb745bdf7c42cf66790eb Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 10:38:42 -0700 Subject: nvme-fc: add uevent for auto-connect To support auto-connecting to FC-NVME devices upon their dynamic appearance, add a uevent that can kick off connection scripts. uevent is posted against the fc_udev device. patch set tested with the following rule to kick an nvme-cli connect-all for the FC initiator and FC target ports. This is just an example for testing and not intended for real life use. ACTION=="change", SUBSYSTEM=="fc", ENV{FC_EVENT}=="nvmediscovery", \ ENV{NVMEFC_HOST_TRADDR}=="*", ENV{NVMEFC_TRADDR}=="*", \ RUN+="/bin/sh -c '/usr/local/sbin/nvme connect-all --transport=fc --host-traddr=$env{NVMEFC_HOST_TRADDR} --traddr=$env{NVMEFC_TRADDR} >> /tmp/nvme_fc.log'" I will post proposed udev/systemd scripts for possible kernel support. 
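For illustration only (not part of this patch): a handler launched by such a RUN+= rule sees the advertised variables in its environment, so the same connect-all invocation could equally be done from a small program. A minimal user-space sketch, assuming nvme-cli is installed, might look like this:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        /* variables exported with the udev event, per the example rule above */
        const char *host = getenv("NVMEFC_HOST_TRADDR");
        const char *tgt = getenv("NVMEFC_TRADDR");
        char cmd[256];

        if (!host || !tgt)
                return 1;
        snprintf(cmd, sizeof(cmd),
                 "nvme connect-all --transport=fc --host-traddr=%s --traddr=%s",
                 host, tgt);
        return system(cmd);
}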
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 49 ++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme-fc-driver.h | 2 ++ 2 files changed, 51 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index af075e9989449..f546eecb1f82b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -452,6 +452,36 @@ nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr) } EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport); +/* + * TRADDR strings, per FC-NVME are fixed format: + * "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters + * udev event will only differ by prefix of what field is + * being specified: + * "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters + * 19 + 43 + null_fudge = 64 characters + */ +#define FCNVME_TRADDR_LENGTH 64 + +static void +nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport, + struct nvme_fc_rport *rport) +{ + char hostaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_HOST_TRADDR=...*/ + char tgtaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_TRADDR=...*/ + char *envp[4] = { "FC_EVENT=nvmediscovery", hostaddr, tgtaddr, NULL }; + + if (!(rport->remoteport.port_role & FC_PORT_ROLE_NVME_DISCOVERY)) + return; + + snprintf(hostaddr, sizeof(hostaddr), + "NVMEFC_HOST_TRADDR=nn-0x%016llx:pn-0x%016llx", + lport->localport.node_name, lport->localport.port_name); + snprintf(tgtaddr, sizeof(tgtaddr), + "NVMEFC_TRADDR=nn-0x%016llx:pn-0x%016llx", + rport->remoteport.node_name, rport->remoteport.port_name); + kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp); +} + /** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME @@ -516,6 +546,8 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, list_add_tail(&newrec->endp_list, &lport->endp_list); spin_unlock_irqrestore(&nvme_fc_lock, flags); + nvme_fc_signal_discovery_scan(lport, newrec); + *portptr = &newrec->remoteport; return 0; @@ -634,6 +666,23 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) } EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); +/** + * nvme_fc_rescan_remoteport - transport entry point called by an + * LLDD to request a nvme device rescan. + * @remoteport: pointer to the (registered) remote port that is to be + * rescanned. + * + * Returns: N/A + */ +void +nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(remoteport); + + nvme_fc_signal_discovery_scan(rport->lport, rport); +} +EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); + /* *********************** FC-NVME DMA Handling **************************** */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index a726f96010d59..4ea03b9a5c8cb 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -446,6 +446,8 @@ int nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); +void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); + /* -- cgit v1.2.3 From 5f5685569ae8fccb0344373d823f2e4c59bb3d8e Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 10:38:41 -0700 Subject: nvme-fc: create fc class and transport device Added a new fc class and a device node for udev events under it. I expect the fc class will eventually be the location where the FC SCSI and FC NVME merge in the future. Therefore names are kept somewhat generic. 
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f546eecb1f82b..2dc9932e48182 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -213,6 +213,13 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt); +/* + * These items are short-term. They will eventually be moved into + * a generic FC class. See comments in module init. + */ +static struct class *fc_class; +static struct device *fc_udev_device; + /* *********************** FC-NVME Port Management ************************ */ @@ -3046,7 +3053,50 @@ static struct nvmf_transport_ops nvme_fc_transport = { static int __init nvme_fc_init_module(void) { - return nvmf_register_transport(&nvme_fc_transport); + int ret; + + /* + * NOTE: + * It is expected that in the future the kernel will combine + * the FC-isms that are currently under scsi and now being + * added to by NVME into a new standalone FC class. The SCSI + * and NVME protocols and their devices would be under this + * new FC class. + * + * As we need something to post FC-specific udev events to, + * specifically for nvme probe events, start by creating the + * new device class. When the new standalone FC class is + * put in place, this code will move to a more generic + * location for the class. + */ + fc_class = class_create(THIS_MODULE, "fc"); + if (IS_ERR(fc_class)) { + pr_err("couldn't register class fc\n"); + return PTR_ERR(fc_class); + } + + /* + * Create a device for the FC-centric udev events + */ + fc_udev_device = device_create(fc_class, NULL, MKDEV(0, 0), NULL, + "fc_udev_device"); + if (IS_ERR(fc_udev_device)) { + pr_err("couldn't create fc_udev device!\n"); + ret = PTR_ERR(fc_udev_device); + goto out_destroy_class; + } + + ret = nvmf_register_transport(&nvme_fc_transport); + if (ret) + goto out_destroy_device; + + return 0; + +out_destroy_device: + device_destroy(fc_class, MKDEV(0, 0)); +out_destroy_class: + class_destroy(fc_class); + return ret; } static void __exit nvme_fc_exit_module(void) @@ -3059,6 +3109,9 @@ static void __exit nvme_fc_exit_module(void) ida_destroy(&nvme_fc_local_port_cnt); ida_destroy(&nvme_fc_ctrl_cnt); + + device_destroy(fc_class, MKDEV(0, 0)); + class_destroy(fc_class); } module_init(nvme_fc_init_module); -- cgit v1.2.3 From a7af0af32171c17d881e3e58b0925c4a44fb5a42 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Sep 2017 10:00:22 +0200 Subject: blk-mq: attempt to fix atomic flag memory ordering Attempt to untangle the ordering in blk-mq. The patch introducing the single smp_mb__before_atomic() is obviously broken in that it doesn't clearly specify a pairing barrier and an obtained guarantee. The comment is further misleading in that it hints that the deadline store and the COMPLETE store also need to be ordered, but AFAICT there is no such dependency. However what does appear to be important is the clear happening _after_ the store, and that worked by pure accident. This clarifies blk_mq_start_request() -- we should not get there with STARTING set -- this simplifies the code and makes the barrier usage sane (the old code could be read to allow not having _any_ atomic after the barrier, in which case the barrier hasn't got anything to order). We then also introduce the missing pairing barrier for it. 
Also down-grade the barrier to smp_wmb(), this is cheaper for PowerPC/ARM and doesn't cost anything extra on x86. And it documents the STARTING vs COMPLETE ordering. Although I've not been entirely successful in reverse engineering the blk-mq state machine so there might still be more funnies around timeout vs requeue. If I got anything wrong, feel free to educate me by adding comments to clarify things ;-) Cc: Alan Stern Cc: Will Deacon Cc: Ming Lei Cc: Christoph Hellwig Cc: Andrea Parri Cc: Boqun Feng Cc: Bart Van Assche Cc: "Paul E. McKenney" Fixes: 538b75341835 ("blk-mq: request deadline must be visible before marking rq as started") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- block/blk-mq.c | 52 ++++++++++++++++++++++++++++++++++++++++------------ block/blk-timeout.c | 2 +- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7f01d69879d66..a7b46b754a7c8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -596,22 +596,32 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); - /* - * Ensure that ->deadline is visible before set the started - * flag and clear the completed flag. - */ - smp_mb__before_atomic(); + WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); /* * Mark us as started and clear complete. Complete might have been * set if requeue raced with timeout, which then marked it as * complete. So be sure to clear complete again when we start * the request, otherwise we'll ignore the completion event. + * + * Ensure that ->deadline is visible before we set STARTED, such that + * blk_mq_check_expired() is guaranteed to observe our ->deadline when + * it observes STARTED. */ - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + smp_wmb(); + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + /* + * Coherence order guarantees these consecutive stores to a + * single variable propagate in the specified order. Thus the + * clear_bit() is ordered _after_ the set bit. See + * blk_mq_check_expired(). + * + * (the bits must be part of the same byte for this to be + * true). + */ clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + } if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -781,10 +791,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; + unsigned long deadline; if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; + /* + * Ensures that if we see STARTED we must also see our + * up-to-date deadline, see blk_mq_start_request(). + */ + smp_rmb(); + + deadline = READ_ONCE(rq->deadline); + /* * The rq being checked may have been freed and reallocated * out already here, we avoid this race by checking rq->deadline @@ -798,11 +817,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * and clearing the flag in blk_mq_start_request(), so * this rq won't be timed out too. */ - if (time_after_eq(jiffies, rq->deadline)) { - if (!blk_mark_rq_complete(rq)) + if (time_after_eq(jiffies, deadline)) { + if (!blk_mark_rq_complete(rq)) { + /* + * Again coherence order ensures that consecutive reads + * from the same variable must be in that order. This + * ensures that if we see COMPLETE clear, we must then + * see STARTED set and we'll ignore this timeout. 
+ * + * (There's also the MB implied by the test_and_clear()) + */ blk_mq_rq_timed_out(rq, reserved); - } else if (!data->next_set || time_after(data->next, rq->deadline)) { - data->next = rq->deadline; + } + } else if (!data->next_set || time_after(data->next, deadline)) { + data->next = deadline; data->next_set = 1; } } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 17ec83bb09002..e3e9c9771d361 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -211,7 +211,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - req->deadline = jiffies + req->timeout; + WRITE_ONCE(req->deadline, jiffies + req->timeout); /* * Only the non-mq case needs to add the request to a protected list. -- cgit v1.2.3 From fc13457f74dcf054b0d17efb7b94b46fdf17f412 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Oct 2017 11:22:24 -0600 Subject: blk-mq: document the need to have STARTED and COMPLETED share a byte For memory ordering guarantees on stores, we need to ensure that these two bits share the same byte of storage in the unsigned long. Add a comment as to why, and a BUILD_BUG_ON() to ensure that we don't violate this requirement. Suggested-by: Boqun Feng Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++++ block/blk.h | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index a7b46b754a7c8..076cbab9c3e0f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2923,6 +2923,12 @@ EXPORT_SYMBOL_GPL(blk_mq_poll); static int __init blk_mq_init(void) { + /* + * See comment in block/blk.h rq_atomic_flags enum + */ + BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != + (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; diff --git a/block/blk.h b/block/blk.h index fda5a4632aba3..6ac43dfd68a7d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -122,8 +122,15 @@ void blk_account_io_done(struct request *req); * Internal atomic flags for request handling */ enum rq_atomic_flags { + /* + * Keep these two bits first - not because we depend on the + * value of them, but we do depend on them being in the same + * byte of storage to ensure ordering on writes. Keeping them + * first will achieve that nicely. + */ REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, + REQ_ATOM_POLL_SLEPT, }; -- cgit v1.2.3 From 85009b4f5f0399669a44f07cb9a5622c0e71d419 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 02:09:06 -0600 Subject: writeback: eliminate work item allocation in bd_start_writeback() Handle start-all writeback like we do periodic or kupdate style writeback - by marking the bdi_writeback as needing a full flush, and simply waking the thread. This eliminates the need to allocate and queue a specific work item just for this purpose. After this change, we truly only ever have one of them running at any point in time. We mark the need to start all flushes, and the writeback thread will clear it once it has processed the request. 
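The coalescing described above boils down to a single "start all" bit: the trigger path sets it (and only the first setter wakes the worker), and the worker clears it after acting on it. A minimal user-space sketch of that pattern follows; the names are illustrative only and are not the kernel symbols:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool start_all;           /* analogue of WB_start_all in wb->state */

/* trigger side: only the first caller actually wakes the worker */
static bool request_start_all(void)
{
        if (atomic_load(&start_all) ||
            atomic_exchange(&start_all, true))
                return false;           /* a full flush is already pending */
        printf("wake worker\n");        /* stands in for wb_wakeup(wb) */
        return true;
}

/* worker side: do the work, then clear the flag so new requests are seen */
static void worker_check_start_all(void)
{
        if (!atomic_load(&start_all))
                return;
        printf("write back everything\n");
        atomic_store(&start_all, false);
}

int main(void)
{
        request_start_all();            /* first request wins */
        request_start_all();            /* coalesced, no extra work item */
        worker_check_start_all();
        return 0;
}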
Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 71 +++++++++++++++++++--------------------- include/linux/backing-dev-defs.h | 23 +++++++++++++ include/linux/writeback.h | 22 ------------- include/trace/events/writeback.h | 1 - 4 files changed, 57 insertions(+), 60 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 399619c975679..9e24d604c59c6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,7 +53,6 @@ struct wb_writeback_work { unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ - unsigned int start_all:1; /* nr_pages == 0 (all) writeback */ enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -947,8 +946,6 @@ static unsigned long get_nr_dirty_pages(void) static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) { - struct wb_writeback_work *work; - if (!wb_has_dirty_io(wb)) return; @@ -958,35 +955,14 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) * high frequency, causing pointless allocations of tons of * work items and keeping the flusher threads busy retrieving * that work. Ensure that we only allow one of them pending and - * inflight at the time. It doesn't matter if we race a little - * bit on this, so use the faster separate test/set bit variants. + * inflight at the time. */ - if (test_bit(WB_start_all, &wb->state)) + if (test_bit(WB_start_all, &wb->state) || + test_and_set_bit(WB_start_all, &wb->state)) return; - set_bit(WB_start_all, &wb->state); - - /* - * This is WB_SYNC_NONE writeback, so if allocation fails just - * wakeup the thread for old dirty data writeback - */ - work = kzalloc(sizeof(*work), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (!work) { - clear_bit(WB_start_all, &wb->state); - trace_writeback_nowork(wb); - wb_wakeup(wb); - return; - } - - work->sync_mode = WB_SYNC_NONE; - work->nr_pages = wb_split_bdi_pages(wb, get_nr_dirty_pages()); - work->range_cyclic = 1; - work->reason = reason; - work->auto_free = 1; - work->start_all = 1; - - wb_queue_work(wb, work); + wb->start_all_reason = reason; + wb_wakeup(wb); } /** @@ -1838,14 +1814,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) list_del_init(&work->list); } spin_unlock_bh(&wb->work_lock); - - /* - * Once we start processing a work item that had !nr_pages, - * clear the wb state bit for that so we can allow more. 
- */ - if (work && work->start_all) - clear_bit(WB_start_all, &wb->state); - return work; } @@ -1901,6 +1869,30 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; } +static long wb_check_start_all(struct bdi_writeback *wb) +{ + long nr_pages; + + if (!test_bit(WB_start_all, &wb->state)) + return 0; + + nr_pages = get_nr_dirty_pages(); + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = wb_split_bdi_pages(wb, nr_pages), + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + .reason = wb->start_all_reason, + }; + + nr_pages = wb_writeback(wb, &work); + } + + clear_bit(WB_start_all, &wb->state); + return nr_pages; +} + + /* * Retrieve work items and do the writeback they describe */ @@ -1916,6 +1908,11 @@ static long wb_do_writeback(struct bdi_writeback *wb) finish_writeback_work(wb, work); } + /* + * Check for a flush-everything request + */ + wrote += wb_check_start_all(wb); + /* * Check for periodic writeback, kupdated() style */ diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 420de5c7c7f9a..b7c7be6f5986a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -44,6 +44,28 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +/* + * why some writeback work was initiated + */ +enum wb_reason { + WB_REASON_BACKGROUND, + WB_REASON_VMSCAN, + WB_REASON_SYNC, + WB_REASON_PERIODIC, + WB_REASON_LAPTOP_TIMER, + WB_REASON_FREE_MORE_MEM, + WB_REASON_FS_FREE_SPACE, + /* + * There is no bdi forker thread any more and works are done + * by emergency worker, however, this is TPs userland visible + * and we'll be exposing exactly the same information, + * so it has a mismatch name. + */ + WB_REASON_FORKER_THREAD, + + WB_REASON_MAX, +}; + /* * For cgroup writeback, multiple wb's may map to the same blkcg. Those * wb's can operate mostly independently but should share the congested @@ -116,6 +138,7 @@ struct bdi_writeback { struct fprop_local_percpu completions; int dirty_exceeded; + enum wb_reason start_all_reason; spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9c0091678af48..dd1d2c23f7431 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -41,28 +41,6 @@ enum writeback_sync_modes { WB_SYNC_ALL, /* Wait on every mapping */ }; -/* - * why some writeback work was initiated - */ -enum wb_reason { - WB_REASON_BACKGROUND, - WB_REASON_VMSCAN, - WB_REASON_SYNC, - WB_REASON_PERIODIC, - WB_REASON_LAPTOP_TIMER, - WB_REASON_FREE_MORE_MEM, - WB_REASON_FS_FREE_SPACE, - /* - * There is no bdi forker thread any more and works are done - * by emergency worker, however, this is TPs userland visible - * and we'll be exposing exactly the same information, - * so it has a mismatch name. - */ - WB_REASON_FORKER_THREAD, - - WB_REASON_MAX, -}; - /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. 
They are always initialised diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 9b57f014d79d0..19a0ea08e0989 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -286,7 +286,6 @@ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) -DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, -- cgit v1.2.3 From b35bd0d9f8a8ea17aae40893e18274d191a2d2c5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 23:39:05 -0600 Subject: sysctl: remove /proc/sys/vm/nr_pdflush_threads This tunable has been obsolete since 2.6.32, and writes to the file have been failing and complaining in dmesg since then: nr_pdflush_threads exported in /proc is scheduled for removal That was 8 years ago. Remove the file ABI obsolete notice, and the sysfs file. Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads | 5 ----- kernel/sysctl.c | 5 ----- 2 files changed, 10 deletions(-) delete mode 100644 Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads diff --git a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads deleted file mode 100644 index b0b0eeb20fe34..0000000000000 --- a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads +++ /dev/null @@ -1,5 +0,0 @@ -What: /proc/sys/vm/nr_pdflush_threads -Date: June 2012 -Contact: Wanpeng Li -Description: Since pdflush is replaced by per-BDI flusher, the interface of old pdflush - exported in /proc/sys/vm/ should be removed. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157f..a5dd8d82c2538 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1344,11 +1344,6 @@ static struct ctl_table vm_table[] = { .proc_handler = dirtytime_interval_handler, .extra1 = &zero, }, - { - .procname = "nr_pdflush_threads", - .mode = 0444 /* read-only */, - .proc_handler = pdflush_proc_obsolete, - }, { .procname = "swappiness", .data = &vm_swappiness, -- cgit v1.2.3 From 469d0ef06debe29ec60350b9976a520c4ad5a1e8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 26 Sep 2017 21:50:45 -0700 Subject: nvme-fc: move remote port get/put/free location move nvme_fc_rport_get/put and rport free to higher in the file to avoid adding prototypes to resolve references in upcoming code additions Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 78 +++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2dc9932e48182..b8e0822127c1a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -489,6 +489,45 @@ nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport, kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp); } +static void +nvme_fc_free_rport(struct kref *ref) +{ + struct nvme_fc_rport *rport = + container_of(ref, struct nvme_fc_rport, ref); + struct nvme_fc_lport *lport = + localport_to_lport(rport->remoteport.localport); + unsigned long flags; + + WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED); + WARN_ON(!list_empty(&rport->ctrl_list)); + + /* remove from lport list */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_del(&rport->endp_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + /* let the LLDD know we've finished tearing it down */ + 
lport->ops->remoteport_delete(&rport->remoteport); + + ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); + + kfree(rport); + + nvme_fc_lport_put(lport); +} + +static void +nvme_fc_rport_put(struct nvme_fc_rport *rport) +{ + kref_put(&rport->ref, nvme_fc_free_rport); +} + +static int +nvme_fc_rport_get(struct nvme_fc_rport *rport) +{ + return kref_get_unless_zero(&rport->ref); +} + /** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME @@ -568,45 +607,6 @@ out_reghost_failed: } EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport); -static void -nvme_fc_free_rport(struct kref *ref) -{ - struct nvme_fc_rport *rport = - container_of(ref, struct nvme_fc_rport, ref); - struct nvme_fc_lport *lport = - localport_to_lport(rport->remoteport.localport); - unsigned long flags; - - WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED); - WARN_ON(!list_empty(&rport->ctrl_list)); - - /* remove from lport list */ - spin_lock_irqsave(&nvme_fc_lock, flags); - list_del(&rport->endp_list); - spin_unlock_irqrestore(&nvme_fc_lock, flags); - - /* let the LLDD know we've finished tearing it down */ - lport->ops->remoteport_delete(&rport->remoteport); - - ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); - - kfree(rport); - - nvme_fc_lport_put(lport); -} - -static void -nvme_fc_rport_put(struct nvme_fc_rport *rport) -{ - kref_put(&rport->ref, nvme_fc_free_rport); -} - -static int -nvme_fc_rport_get(struct nvme_fc_rport *rport) -{ - return kref_get_unless_zero(&rport->ref); -} - static int nvme_fc_abort_lsops(struct nvme_fc_rport *rport) { -- cgit v1.2.3 From 5fdee2127faa77c9c91862ad5e001dfab7013e92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 21:22:52 +0200 Subject: block: remove QUEUE_FLAG_STACKABLE We already have a queue_is_rq_based helper to check if a request_queue is request based, so we can remove the flag for it. 
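For reference, the existing helper mentioned above is roughly the following in kernels of this vintage (shown only for context, it is not touched by this patch):

/* a request_queue is request based if it has a legacy request_fn or is blk-mq */
static inline bool queue_is_rq_based(struct request_queue *q)
{
        return q->request_fn || q->mq_ops;
}

That is, it answers the same question QUEUE_FLAG_STACKABLE was being used for, directly from the queue's dispatch hooks.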
Acked-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/elevator.c | 2 +- drivers/md/dm-rq.c | 2 +- drivers/md/dm-table.c | 15 +-------------- drivers/md/dm.c | 11 ----------- include/linux/blkdev.h | 5 ----- 6 files changed, 3 insertions(+), 33 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 980e730956433..7f4a1ba532afc 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -54,7 +54,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), - QUEUE_FLAG_NAME(STACKABLE), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(DISCARD), diff --git a/block/elevator.c b/block/elevator.c index 153926a909011..7ae50eb2732bb 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1118,7 +1118,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!blk_queue_stackable(q)) + if (!queue_is_rq_based(q)) return sprintf(name, "none\n"); if (!q->elevator) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index eadfcfd106fff..9d32f25489c27 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return blk_queue_stackable(md->queue); + return queue_is_rq_based(md->queue); } static void dm_old_start_queue(struct request_queue *q) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ef7b8f201f73a..75281828f2cbc 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1000,7 +1000,7 @@ verify_rq_based: list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - if (!blk_queue_stackable(q)) { + if (!queue_is_rq_based(q)) { DMERR("table load rejected: including" " non-request-stackable devices"); return -EINVAL; @@ -1847,19 +1847,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, */ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); - - /* - * QUEUE_FLAG_STACKABLE must be set after all queue settings are - * visible to other CPUs because, once the flag is set, incoming bios - * are processed by request-based dm, which refers to the queue - * settings. - * Until the flag set, bios are passed to bio-based dm and queued to - * md->deferred where queue settings are not needed yet. - * Those bios are passed to request-based dm at the resume time. - */ - smp_mb(); - if (dm_table_request_based(t)) - queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6e54145969c5c..8d07ad61221c6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1612,17 +1612,6 @@ static void dm_wq_work(struct work_struct *work); void dm_init_md_queue(struct mapped_device *md) { - /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device may not have been decided yet. - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. 
- */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - /* * Initialize data that will only be used by a non-blk-mq DM queue * - must do so here (in alloc_dev callchain) before queue is used diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 02fa42d24b52f..9fb71fc7d0e85 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -609,7 +609,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ @@ -633,12 +632,10 @@ struct request_queue { #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_POLL)) @@ -722,8 +719,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_stackable(q) \ - test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) -- cgit v1.2.3 From 775d3a35dc3e13de55ec0e061c59e36faa7dd7f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Oct 2017 08:15:15 -0600 Subject: backing-dev: kill unused pdflush_proc_obsolete() After commit b35bd0d9f8a8, pdflush_proc_obsolete() is no longer used. Kill the function and declaration. 
Reported-by: Rakesh Pandit Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 -- mm/backing-dev.c | 20 -------------------- 2 files changed, 22 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 157e950a70dc1..872afa41abc2d 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -172,8 +172,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); -int pdflush_proc_obsolete(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e19606bb41a0b..74b52dfd5852d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1072,23 +1072,3 @@ out: return ret; } EXPORT_SYMBOL(wait_iff_congested); - -int pdflush_proc_obsolete(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - char kbuf[] = "0\n"; - - if (*ppos || *lenp < sizeof(kbuf)) { - *lenp = 0; - return 0; - } - - if (copy_to_user(buffer, kbuf, sizeof(kbuf))) - return -EFAULT; - pr_warn_once("%s exported in /proc is scheduled for removal\n", - table->procname); - - *lenp = 2; - *ppos += *lenp; - return 2; -} -- cgit v1.2.3 From 4b14a5c5d57f4fd6929db3427ba4d7c3775b4680 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Thu, 5 Oct 2017 14:09:20 -0400 Subject: block: remove unnecessary NULL checks in bioset_integrity_free() mempool_destroy() already checks for a NULL value being passed in, this eliminates duplicate checks. This was caught by running make coccicheck M=block/ on linus' tree on commit 77ede3a014a32746002f7889211f0cecf4803163 (current head as of this patch). Reviewed-by: Kyle Fortin Acked-by: Martin K. Petersen Signed-off-by: Tim Hansen Signed-off-by: Jens Axboe --- block/bio-integrity.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5df32907ff3bf..23b42e8aa03e7 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -485,11 +485,8 @@ EXPORT_SYMBOL(bioset_integrity_create); void bioset_integrity_free(struct bio_set *bs) { - if (bs->bio_integrity_pool) - mempool_destroy(bs->bio_integrity_pool); - - if (bs->bvec_integrity_pool) - mempool_destroy(bs->bvec_integrity_pool); + mempool_destroy(bs->bio_integrity_pool); + mempool_destroy(bs->bvec_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); -- cgit v1.2.3 From 4078def82f352cf5007691635da290a109511bc5 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Fri, 6 Oct 2017 14:45:13 -0400 Subject: block/bio: Remove null checks before mempool_destroy in bioset_free This patch removes redundant checks for null values on bio_pool and bvec_pool. Found using make coccicheck M=block/ on linux-net tree on the next-20170929 tag. 
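The guards are redundant because mempool_destroy(), like kfree(), is defined to be a no-op when passed NULL. A small stand-alone analogue of that convention (plain C, not kernel code) shows why callers can drop the check:

#include <stdlib.h>

struct pool { void *slab; };

/* destroy helpers that tolerate NULL let callers drop the guard entirely */
static void pool_destroy(struct pool *p)
{
        if (!p)                 /* mirrors the NULL check inside the destroy helper */
                return;
        free(p->slab);
        free(p);
}

int main(void)
{
        struct pool *maybe_null = NULL;

        /* before: if (maybe_null) pool_destroy(maybe_null); */
        pool_destroy(maybe_null);       /* after: safe with or without the check */
        return 0;
}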
Signed-off-by: Tim Hansen Signed-off-by: Jens Axboe --- block/bio.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index 8338304ea2563..bf0dbe8f78f8e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1928,11 +1928,8 @@ void bioset_free(struct bio_set *bs) if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); - if (bs->bio_pool) - mempool_destroy(bs->bio_pool); - - if (bs->bvec_pool) - mempool_destroy(bs->bvec_pool); + mempool_destroy(bs->bio_pool); + mempool_destroy(bs->bvec_pool); bioset_integrity_free(bs); bio_put_slab(bs); -- cgit v1.2.3 From 94af584692091347baea4d810b9fc6e0f5483d42 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 10 Oct 2017 05:44:13 +0800 Subject: writeback: schedule periodic writeback with sysctl After disabling periodic writeback by writing 0 to dirty_writeback_centisecs, the handler wb_workfn() will not be entered again until the dirty background limit is reached, a sync syscall is executed, not enough free memory is available, or vmscan is triggered. So periodic writeback cannot be re-enabled by writing a non-zero value to dirty_writeback_centisecs. As it can be disabled via sysctl, it should be possible to enable it via sysctl as well. Reviewed-by: Jan Kara Signed-off-by: Yafang Shao Signed-off-by: Jens Axboe --- mm/page-writeback.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8d1fc593bce8f..622a18c114caa 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1972,8 +1972,14 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); - return 0; + unsigned int old_interval = dirty_writeback_interval; + int ret; + + ret = proc_dointvec(table, write, buffer, length, ppos); + if (!ret && !old_interval && dirty_writeback_interval) + wakeup_flusher_threads(WB_REASON_PERIODIC); + + return ret; } #ifdef CONFIG_BLOCK -- cgit v1.2.3 From b5dc5d4d1f4ff9032eb6c21a3c571a1317dc9289 Mon Sep 17 00:00:00 2001 From: Luca Miccio Date: Mon, 9 Oct 2017 16:27:21 +0200 Subject: block,bfq: Disable writeback throttling Similarly to CFQ, BFQ has its own write-throttling heuristics, and it is better not to combine them with further write-throttling heuristics of a different nature. So this commit disables write-back throttling for a device if BFQ is used as the I/O scheduler for that device.
Signed-off-by: Luca Miccio Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 ++- block/blk-wbt.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 70f9177c4f5b2..261f98695910f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -108,6 +108,7 @@ #include "blk-mq-tag.h" #include "blk-mq-sched.h" #include "bfq-iosched.h" +#include "blk-wbt.h" #define BFQ_BFQQ_FNS(name) \ void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ @@ -4810,7 +4811,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - + wbt_disable_default(q); return 0; out_free: diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6a9a0f03a67bd..e59d59c11ebbb 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -654,7 +654,7 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) } /* - * Disable wbt, if enabled by default. Only called from CFQ. + * Disable wbt, if enabled by default. */ void wbt_disable_default(struct request_queue *q) { -- cgit v1.2.3 From 99fead8d38e5302b1be9403d4de815ce9174a3df Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 9 Oct 2017 13:11:23 +0200 Subject: block, bfq: fix unbalanced decrements of burst size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit "block, bfq: decrease burst size when queues in burst exit" introduced the decrement of burst_size on the removal of a bfq_queue from the burst list. Unfortunately, this decrement can happen to be performed even when burst size is already equal to 0, because of unbalanced decrements. A description follows of the cause of these unbalanced decrements, namely a wrong assumption, and of the way how this wrong assumption leads to unbalanced decrements. The wrong assumption is that a bfq_queue can exit only if the process associated with the bfq_queue has exited. This is false, because a bfq_queue, say Q, may exit also as a consequence of a merge with another bfq_queue. In this case, Q exits because the I/O of its associated process has been redirected to another bfq_queue. The decrement unbalance occurs because Q may then be re-created after a split, and added back to the current burst list, *without* incrementing burst_size. burst_size is not incremented because Q is not a new bfq_queue added to the burst list, but a bfq_queue only temporarily removed from the list, and, before the commit "bfq-sq, bfq-mq: decrease burst size when queues in burst exit", burst_size was not decremented when Q was removed. This commit addresses this issue by just checking whether the exiting bfq_queue is a merged bfq_queue, and, in that case, not decrementing burst_size. Unfortunately, this still leaves room for unbalanced decrements, in the following rarer case: on a split, the bfq_queue happens to be inserted into a different burst list than that it was removed from when merged. If this happens, the number of elements in the new burst list becomes higher than burst_size (by one). When the bfq_queue then exits, it is of course not in a merged state any longer, thus burst_size is decremented, which results in an unbalanced decrement. To handle this sporadic, unlucky case in a simple way, this commit also checks that burst_size is larger than 0 before decrementing it. 
Finally, this commit removes an useless, extra check: the check that the bfq_queue is sync, performed before checking whether the bfq_queue is in the burst list. This extra check is redundant, because only sync bfq_queues can be inserted into the burst list. Fixes: 7cb04004fa37 ("block, bfq: decrease burst size when queues in burst exit") Reported-by: Philip Müller Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Tested-by: Philip Müller Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 261f98695910f..889a8549d97f3 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3726,9 +3726,36 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; - if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { + if (!hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); - bfqq->bfqd->burst_size--; + /* + * Decrement also burst size after the removal, if the + * process associated with bfqq is exiting, and thus + * does not contribute to the burst any longer. This + * decrement helps filter out false positives of large + * bursts, when some short-lived process (often due to + * the execution of commands by some service) happens + * to start and exit while a complex application is + * starting, and thus spawning several processes that + * do I/O (and that *must not* be treated as a large + * burst, see comments on bfq_handle_burst). + * + * In particular, the decrement is performed only if: + * 1) bfqq is not a merged queue, because, if it is, + * then this free of bfqq is not triggered by the exit + * of the process bfqq is associated with, but exactly + * by the fact that bfqq has just been merged. + * 2) burst_size is greater than 0, to handle + * unbalanced decrements. Unbalanced decrements may + * happen in te following case: bfqq is inserted into + * the current burst list--without incrementing + * bust_size--because of a split, but the current + * burst list is not the burst list bfqq belonged to + * (see comments on the case of a split in + * bfq_set_request). + */ + if (bfqq->bic && bfqq->bfqd->burst_size > 0) + bfqq->bfqd->burst_size--; } kmem_cache_free(bfq_pool, bfqq); @@ -4460,6 +4487,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, else { bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) + /* + * If bfqq was in the current + * burst list before being + * merged, then we have to add + * it back. And we do not need + * to increase burst_size, as + * we did not decrement + * burst_size when we removed + * bfqq from the burst list as + * a consequence of a merge + * (see comments in + * bfq_put_queue). In this + * respect, it would be rather + * costly to know whether the + * current burst list is still + * the same burst list from + * which bfqq was removed on + * the merge. To avoid this + * cost, if bfqq was in a + * burst list, then we add + * bfqq to the current burst + * list without any further + * check. This can cause + * inappropriate insertions, + * but rarely enough to not + * harm the detection of large + * bursts significantly. 
+ */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } -- cgit v1.2.3 From 8264c3214f28b52b399d9e03bfa7feec275a0d71 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Mon, 9 Oct 2017 13:34:41 +0300 Subject: writeback: merge try_to_writeback_inodes_sb_nr() into caller Since commit 925a6efb8ff0c ("Btrfs: stop using try_to_writeback_inodes_sb_nr to flush delalloc") this function hasn't been used outside so stop exporting it. In addition we merge it into try_to_writeback_inodes_sb() which is the only caller. Also change return type of try_to_writeback_inodes_sb to void as the only user ext4 doesn't care. Reviewed-by: Jan Kara Signed-off-by: Rakesh Pandit Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 30 ++++++------------------------ include/linux/writeback.h | 4 +--- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9e24d604c59c6..08f5debd07d10 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2376,37 +2376,19 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) EXPORT_SYMBOL(writeback_inodes_sb); /** - * try_to_writeback_inodes_sb_nr - try to start writeback if none underway + * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock - * @nr: the number of pages to write - * @reason: the reason of writeback + * @reason: reason why some writeback work was initiated * - * Invoke writeback_inodes_sb_nr if no writeback is currently underway. - * Returns 1 if writeback was started, 0 if not. + * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. */ -bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, - enum wb_reason reason) +void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { if (!down_read_trylock(&sb->s_umount)) - return false; + return; - __writeback_inodes_sb_nr(sb, nr, reason, true); + __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); up_read(&sb->s_umount); - return true; -} -EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); - -/** - * try_to_writeback_inodes_sb - try to start writeback if none underway - * @sb: the superblock - * @reason: reason why some writeback work was initiated - * - * Implement by try_to_writeback_inodes_sb_nr() - * Returns 1 if writeback was started, 0 if not. 
- */ -bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) -{ - return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(try_to_writeback_inodes_sb); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index dd1d2c23f7431..e15ec14085ade 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -163,9 +163,7 @@ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); -bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); -bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, - enum wb_reason reason); +void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, -- cgit v1.2.3 From 58a9edce0aa912640abe47d3fc039e6230ef848b Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Tue, 10 Oct 2017 22:53:46 +0800 Subject: blkcg: check pol->cpd_free_fn before free cpd check pol->cpd_free_fn() instead of pol->cpd_alloc_fn() when free cpd. Reviewed-by: Johannes Thumshirn Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d3f56baee9366..e7ec676043b1c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1452,7 +1452,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) return 0; err_free_cpds: - if (pol->cpd_alloc_fn) { + if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); @@ -1492,7 +1492,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_alloc_fn) { + if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); -- cgit v1.2.3 From 53cfdc10a95d03fbc82970d682a32696d19ef886 Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Tue, 10 Oct 2017 11:13:32 +0800 Subject: blk-throttle: fix null pointer dereference while throttling writeback IOs A null pointer dereference can occur when blkcg is removed manually with writeback IOs inflight. This is caused by the following case: Writeback kworker submit the bio and set bio->bi_cg_private to tg in blk_throtl_assoc_bio. Then we remove the block cgroup manually, the blkg and tg would be freed if there is no request inflight. When the submitted bio come back, blk_throtl_bio_endio() fetch the tg which was already freed. Fix this by increasing the refcount of blkg in funcion blk_throtl_assoc_bio() so that the blkg will not be freed until the bio_endio called. 
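The fix follows the standard rule for asynchronous completions: any object the completion handler will dereference must hold a reference for the entire lifetime of the in-flight bio, taken at submission and dropped on every exit from the endio path. A minimal sketch of that pairing follows; struct tracked_obj and its helpers are hypothetical stand-ins for the blkg/throtl_grp handling in the patch, not the patch itself.

	/* Hypothetical example type; the blkg plays this role in the patch below. */
	struct tracked_obj {
		struct kref ref;
	};

	static void tracked_release(struct kref *ref)
	{
		kfree(container_of(ref, struct tracked_obj, ref));
	}

	static void tracked_endio(struct bio *bio)
	{
		struct tracked_obj *obj = bio->bi_private;

		/* Safe to touch obj: the reference taken at submission keeps it alive. */
		kref_put(&obj->ref, tracked_release);	/* drop on every completion path */
	}

	static void tracked_submit(struct bio *bio, struct tracked_obj *obj)
	{
		kref_get(&obj->ref);		/* reference owned by the in-flight bio */
		bio->bi_private = obj;
		bio->bi_end_io = tracked_endio;
		submit_bio(bio);
	}
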
Reviewed-by: Shaohua Li Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- block/blk-throttle.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0fea76aa0f3ff..fe49c465ec860 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2112,8 +2112,12 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (bio->bi_css) + if (bio->bi_css) { + if (bio->bi_cg_private) + blkg_put(tg_to_blkg(bio->bi_cg_private)); bio->bi_cg_private = tg; + blkg_get(tg_to_blkg(tg)); + } blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); #endif } @@ -2283,8 +2287,10 @@ void blk_throtl_bio_endio(struct bio *bio) start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; finish_time = __blk_stat_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) + if (!start_time || finish_time <= start_time) { + blkg_put(tg_to_blkg(tg)); return; + } lat = finish_time - start_time; /* this is only for bio based driver */ @@ -2314,6 +2320,8 @@ void blk_throtl_bio_endio(struct bio *bio) tg->bio_cnt /= 2; tg->bad_bio_cnt /= 2; } + + blkg_put(tg_to_blkg(tg)); } #endif -- cgit v1.2.3 From eca8b53a6769e60d6d8240d71202d73b0af81901 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:55:59 -0700 Subject: blk-stat: delete useless code Fix two issues: - the per-cpu stat flush is unnecessary, nobody uses per-cpu stat except sum it to global stat. We can do the calculation there. The flush just wastes cpu time. - some fields are signed int/s64. I don't see the point. Reviewed-by: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-stat.c | 45 +++++++-------------------------------------- include/linux/blk_types.h | 5 ++--- 2 files changed, 9 insertions(+), 41 deletions(-) diff --git a/block/blk-stat.c b/block/blk-stat.c index c52356d90fe38..3a2f3c96f3672 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -11,8 +11,6 @@ #include "blk-mq.h" #include "blk.h" -#define BLK_RQ_STAT_BATCH 64 - struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; @@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; - stat->batch = stat->nr_batch = 0; -} - -static void blk_stat_flush_batch(struct blk_rq_stat *stat) -{ - const s32 nr_batch = READ_ONCE(stat->nr_batch); - const s32 nr_samples = READ_ONCE(stat->nr_samples); - - if (!nr_batch) - return; - if (!nr_samples) - stat->mean = div64_s64(stat->batch, nr_batch); - else { - stat->mean = div64_s64((stat->mean * nr_samples) + - stat->batch, - nr_batch + nr_samples); - } - - stat->nr_samples += nr_batch; - stat->nr_batch = stat->batch = 0; + stat->batch = 0; } +/* src is a per-cpu stat, mean isn't initialized */ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { - blk_stat_flush_batch(src); - if (!src->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); - if (!dst->nr_samples) - dst->mean = src->mean; - else { - dst->mean = div64_s64((src->mean * src->nr_samples) + - (dst->mean * dst->nr_samples), - dst->nr_samples + src->nr_samples); - } + dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, + dst->nr_samples + src->nr_samples); + dst->nr_samples += src->nr_samples; } @@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 
value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); - - if (stat->batch + value < stat->batch || - stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) - blk_stat_flush_batch(stat); - stat->batch += value; - stat->nr_batch++; + stat->nr_samples++; } void blk_stat_add(struct request *rq) @@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq) struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket; - s64 now, value; + u64 now, value; now = __blk_stat_time(ktime_to_ns(ktime_get())); if (now < blk_stat_time(&rq->issue_stat)) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a2d2aa709cef4..3385c89f402e2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -329,11 +329,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) } struct blk_rq_stat { - s64 mean; + u64 mean; u64 min; u64 max; - s32 nr_samples; - s32 nr_batch; + u32 nr_samples; u64 batch; }; -- cgit v1.2.3 From 85acb3ba2f925a0ec6928c1967c3adefa00682f4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:56:00 -0700 Subject: block: set request_list for request Legacy queue sets request's request_list, mq doesn't. This makes mq does the same thing, so we can find cgroup of a request. Note, we really only use blkg field of request_list, it's pointless to allocate mempool for request_list in mq case. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 14f7674fa0b14..e8e149ccc86b7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -718,7 +718,7 @@ static void free_request_size(void *element, void *data) int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { - if (unlikely(rl->rq_pool)) + if (unlikely(rl->rq_pool) || q->mq_ops) return 0; rl->q = q; diff --git a/block/blk-mq.c b/block/blk-mq.c index 076cbab9c3e0f..40cba1b1978f2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -481,6 +481,9 @@ void blk_mq_free_request(struct request *rq) wbt_done(q->rq_wb, &rq->issue_stat); + if (blk_rq_rl(rq)) + blk_put_rl(blk_rq_rl(rq)); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) @@ -1532,6 +1535,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { blk_init_request_from_bio(rq, bio); + blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); + blk_account_io_start(rq, true); } -- cgit v1.2.3 From 7f66721a7d5bf6251f9c49aada9200c61ddcecc5 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Thu, 12 Oct 2017 19:58:10 +0300 Subject: fs/block_dev: remove vfs_msg() interface Replaced by pr_err usage in commit ef51042472f5 ("block, dax: move "select DAX" from BLOCK to FS_DAX") Signed-off-by: Rakesh Pandit Acked-by: Ross Zwisler Signed-off-by: Jens Axboe --- fs/block_dev.c | 12 ------------ include/linux/blkdev.h | 11 ----------- 2 files changed, 23 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 93d088ffc05c6..07ddccd178017 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -54,18 +54,6 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); -void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 
-{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); - va_end(args); -} - static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9fb71fc7d0e85..72637028f3c9f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -917,17 +917,6 @@ static inline void rq_flush_dcache_pages(struct request *rq) } #endif -#ifdef CONFIG_PRINTK -#define vfs_msg(sb, level, fmt, ...) \ - __vfs_msg(sb, level, fmt, ##__VA_ARGS__) -#else -#define vfs_msg(sb, level, fmt, ...) \ -do { \ - no_printk(fmt, ##__VA_ARGS__); \ - __vfs_msg(sb, "", " "); \ -} while (0) -#endif - extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); -- cgit v1.2.3 From 47bc227deedbcf3ac214a2d922c28dfa5e403f09 Mon Sep 17 00:00:00 2001 From: Christos Gkekas Date: Thu, 12 Oct 2017 23:15:54 +0100 Subject: mtip32xx: Clean up unused variables Remove variables that are set but never used. Signed-off-by: Christos Gkekas Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 4a3cfc7940dec..b8af7352a18f5 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -887,12 +887,9 @@ static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) static bool mtip_pause_ncq(struct mtip_port *port, struct host_to_dev_fis *fis) { - struct host_to_dev_fis *reply; unsigned long task_file_data; - reply = port->rxfis + RX_FIS_D2H_REG; task_file_data = readl(port->mmio+PORT_TFDATA); - if ((task_file_data & 1)) return false; @@ -1020,7 +1017,6 @@ static int mtip_exec_internal_command(struct mtip_port *port, .opts = opts }; int rv = 0; - unsigned long start; /* Make sure the buffer is 8 byte aligned. This is asic specific. */ if (buffer & 0x00000007) { @@ -1057,7 +1053,6 @@ static int mtip_exec_internal_command(struct mtip_port *port, /* Copy the command to the command table */ memcpy(int_cmd->command, fis, fis_len*4); - start = jiffies; rq->timeout = timeout; /* insert request and run queue */ @@ -3015,7 +3010,6 @@ static int mtip_hw_init(struct driver_data *dd) { int i; int rv; - unsigned int num_command_slots; unsigned long timeout, timetaken; dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; @@ -3025,7 +3019,6 @@ static int mtip_hw_init(struct driver_data *dd) rv = -EIO; goto out1; } - num_command_slots = dd->slot_groups * 32; hba_setup(dd); -- cgit v1.2.3 From 900148296b78c61aa8c443dc594c0da968c3be53 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:50 +0200 Subject: lightnvm: prevent target type module removal when in use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If target type module e.g. pblk here is unloaded (rmmod) while module is in use (after creating target) system crashes. We fix this by using module API refcnt. 
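Holding a reference on the providing module for as long as a created target exists is what turns the crash into a clean "module is in use" failure from rmmod. A rough sketch of the pattern follows; tgt_create()/tgt_destroy() are illustrative stand-ins for nvm_create_tgt()/__nvm_remove_target(), and the patch itself uses __module_get() on the create path, whereas try_module_get() is the variant to use when the module might already be unloading.

	/* Sketch: pin the target-type module while an instance created by it exists. */
	static int tgt_create(struct nvm_tgt_type *tt)
	{
		if (!try_module_get(tt->owner))
			return -ENODEV;		/* module is being unloaded */
		/* ... allocate and register the target; on failure: module_put(tt->owner) ... */
		return 0;
	}

	static void tgt_destroy(struct nvm_tgt_type *tt)
	{
		/* Tear the target down first, then release the module reference. */
		module_put(tt->owner);
	}
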
Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 4 ++++ drivers/lightnvm/pblk-init.c | 1 + include/linux/lightnvm.h | 1 + 3 files changed, 6 insertions(+) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ddae430b6eae8..60e163be5a89a 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -316,6 +317,8 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) list_add_tail(&t->list, &dev->targets); mutex_unlock(&dev->mlock); + __module_get(tt->owner); + return 0; err_sysfs: if (tt->exit) @@ -351,6 +354,7 @@ static void __nvm_remove_target(struct nvm_target *t) nvm_remove_tgt_dev(t->dev, 1); put_disk(tdisk); + module_put(t->type->owner); list_del(&t->list); kfree(t); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 1b0f61233c216..6df65d14a2c52 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -1044,6 +1044,7 @@ static struct nvm_tgt_type tt_pblk = { .sysfs_init = pblk_sysfs_init, .sysfs_exit = pblk_sysfs_exit, + .owner = THIS_MODULE, }; static int __init pblk_module_init(void) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7dfa56ebbc6d0..7b80ac911d26e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -460,6 +460,7 @@ struct nvm_tgt_type { /* For internal use */ struct list_head list; + struct module *owner; }; extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); -- cgit v1.2.3 From bb6aa6f08268bbce4e0185b18cab9e04505d6695 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:51 +0200 Subject: lightnvm: prevent bd removal if busy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a virtual block device is formatted and mounted after creating with "nvme lnvm create... -t pblk", a removal from "nvm lnvm remove" would result in this: 446416.309757] bdi-block not registered [446416.309773] ------------[ cut here ]------------ [446416.309780] WARNING: CPU: 3 PID: 4319 at fs/fs-writeback.c:2159 __mark_inode_dirty+0x268/0x340 Ideally removal should return -EBUSY as block device is mounted after formatting. This patch tries to address this checking if whole device or any partition of it already mounted or not before removal. Whole device is checked using "bd_super" member of block device. This member is always set once block device has been mounted using a filesystem. Another member "bd_part_count" takes care of checking any if any partitions are under use. "bd_part_count" is only updated under locks when partitions are opened or closed (first open and last release). This at least does take care sending -EBUSY if removal is being attempted while whole block device or any partition is mounted. 
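Stripped of error handling, the busy test takes a reference on the whole-disk block_device, inspects the two fields described above, and always drops the reference again. A condensed sketch (the patch below additionally fails the removal with -ENOMEM when bdget_disk() returns NULL):

	struct block_device *bdev = bdget_disk(t->disk, 0);
	bool busy = bdev && (bdev->bd_super || bdev->bd_part_count);

	if (bdev)
		bdput(bdev);
	if (busy)
		return -EBUSY;	/* mounted filesystem or open partition */
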
Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 60e163be5a89a..c490711cf0f4a 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -373,6 +373,7 @@ static void __nvm_remove_target(struct nvm_target *t) static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) { struct nvm_target *t; + struct block_device *bdev; mutex_lock(&dev->mlock); t = nvm_find_target(dev, remove->tgtname); @@ -380,6 +381,19 @@ static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) mutex_unlock(&dev->mlock); return 1; } + bdev = bdget_disk(t->disk, 0); + if (!bdev) { + pr_err("nvm: removal failed, allocating bd failed\n"); + mutex_unlock(&dev->mlock); + return -ENOMEM; + } + if (bdev->bd_super || bdev->bd_part_count) { + pr_err("nvm: removal failed, block device busy\n"); + bdput(bdev); + mutex_unlock(&dev->mlock); + return -EBUSY; + } + bdput(bdev); __nvm_remove_target(t); mutex_unlock(&dev->mlock); -- cgit v1.2.3 From 88d31ea2676696ad0802a361c8b824f0762fa34c Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:52 +0200 Subject: lightnvm: protect target type list with correct locks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nvm_tgt_types list was protected by wrong lock for NVM_INFO ioctl call and can race with addition or removal of target types. Also unregistering target type was not protected correctly. Fixes: 5cd907853 ("lightnvm: remove nested lock conflict with mm") Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index c490711cf0f4a..ee2b6d7719901 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -589,9 +589,9 @@ void nvm_unregister_tgt_type(struct nvm_tgt_type *tt) if (!tt) return; - down_write(&nvm_lock); + down_write(&nvm_tgtt_lock); list_del(&tt->list); - up_write(&nvm_lock); + up_write(&nvm_tgtt_lock); } EXPORT_SYMBOL(nvm_unregister_tgt_type); @@ -1195,7 +1195,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) info->version[1] = NVM_VERSION_MINOR; info->version[2] = NVM_VERSION_PATCH; - down_write(&nvm_lock); + down_write(&nvm_tgtt_lock); list_for_each_entry(tt, &nvm_tgt_types, list) { struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; @@ -1208,7 +1208,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) } info->tgtsize = tgt_iter; - up_write(&nvm_lock); + up_write(&nvm_tgtt_lock); if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { kfree(info); -- cgit v1.2.3 From a96d50fa0c8d3e399c49a0a90ddbbaabf8a46bb3 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:53 +0200 Subject: lightnvm: remove already calculated nr_chnls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove repeated calculation for number of channels while creating a target device. 
Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ee2b6d7719901..798964f511cd0 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -139,7 +139,6 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, int prev_nr_luns; int i, j; - nr_chnls = nr_luns / dev->geo.luns_per_chnl; nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1; dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); -- cgit v1.2.3 From c9d84b350f9b253872fc24f4dcfea166c884ee15 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:54 +0200 Subject: lightnvm: pblk: fix error path in pblk_lines_alloc_metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use appropriate memory free calls based on allocation type used and also fix number of times free is called if kmalloc fails. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 6df65d14a2c52..05665a7e648ca 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -630,7 +630,10 @@ static int pblk_lines_alloc_metadata(struct pblk *pblk) fail_free_emeta: while (--i >= 0) { - vfree(l_mg->eline_meta[i]->buf); + if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META) + vfree(l_mg->eline_meta[i]->buf); + else + kfree(l_mg->eline_meta[i]->buf); kfree(l_mg->eline_meta[i]); } -- cgit v1.2.3 From 32c662c58a9b9d0c99e713a14ca323a9a91c73a0 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:55 +0200 Subject: lightnvm: include NVM Express driver if OCSSD is selected for build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because NVM needs BLK_DEV_NVME, select it automatically if we mark NVM in config file before building kernel. Also append PCI to depends as select doesn't automatically add dependencies. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index ead61a93cb4eb..2a953efec4e19 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -4,7 +4,8 @@ menuconfig NVM bool "Open-Channel SSD target support" - depends on BLOCK && HAS_DMA + depends on BLOCK && HAS_DMA && PCI + select BLK_DEV_NVME help Say Y here to get to enable Open-channel SSDs. -- cgit v1.2.3 From e57903fd972a398b7140d0bc055714e13a0e58c5 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:56 +0200 Subject: lightnvm: pblk: protect line bitmap while submitting meta io MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems pblk_dealloc_page would race against pblk_alloc_pages for line bitmap for sector allocation.The chances are very low but might as well protect the bitmap properly. 
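The invariant behind the fix is simply that every walk-and-modify of line->map_bitmap has to run under the same line->lock that the allocation path takes; sketched below in condensed form (locking only, the bit manipulation is as in the patch):

	spin_lock(&line->lock);
	addr = find_next_zero_bit(line->map_bitmap, pblk->lm.sec_per_line,
				  line->cur_sec);
	/* ... clear the claimed bits and rewind line->cur_sec ... */
	spin_unlock(&line->lock);
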
Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 81501644fb158..b53bb00a99188 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -486,12 +486,14 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) u64 addr; int i; + spin_lock(&line->lock); addr = find_next_zero_bit(line->map_bitmap, pblk->lm.sec_per_line, line->cur_sec); line->cur_sec = addr - nr_secs; for (i = 0; i < nr_secs; i++, line->cur_sec--) WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); + spin_unlock(&line->lock); } u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -- cgit v1.2.3 From 4e76af53e132bff9e2b94f018457fadabf5ab419 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:57 +0200 Subject: lightnvm: pblk: fix message if L2P MAP is in device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This usually happens if we are developing with qemu and ll2pmode has default value. Improve description. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 05665a7e648ca..8c85779e9635e 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -914,7 +914,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, int ret; if (dev->identity.dom & NVM_RSP_L2P) { - pr_err("pblk: device-side L2P table not supported. (%x)\n", + pr_err("pblk: host-side L2P table not supported. (%x)\n", dev->identity.dom); return ERR_PTR(-EINVAL); } -- cgit v1.2.3 From c5493845b7b303315118fb4ab96654bf7cb897f0 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:58 +0200 Subject: lightnvm: pblk: improve error message if down_timeout fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two pr_err messages are useless as they don't differentiate error code. 
Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index b53bb00a99188..027c42bb1ab99 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1691,16 +1691,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, #endif ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); - if (ret) { - switch (ret) { - case -ETIME: - pr_err("pblk: lun semaphore timed out\n"); - break; - case -EINTR: - pr_err("pblk: lun semaphore timed out\n"); - break; - } - } + if (ret == -ETIME || ret == -EINTR) + pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret); } void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) -- cgit v1.2.3 From c79819bc0877e4cbed8013b1abc9697e8805b21b Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:59 +0200 Subject: lightnvm: pblk: print incompatible line version correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correct it by converting little endian to cpu endian and also define a macro for line version so that maintenance is easy. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 +- drivers/lightnvm/pblk-recovery.c | 4 ++-- drivers/lightnvm/pblk.h | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 027c42bb1ab99..8536d38ef97e3 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -978,7 +978,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); smeta_buf->header.id = cpu_to_le32(line->id); smeta_buf->header.type = cpu_to_le16(line->type); - smeta_buf->header.version = cpu_to_le16(1); + smeta_buf->header.version = SMETA_VERSION; /* Start metadata */ smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index cb556e06673e7..caf1242795757 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -900,9 +900,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) continue; - if (le16_to_cpu(smeta_buf->header.version) != 1) { + if (smeta_buf->header.version != SMETA_VERSION) { pr_err("pblk: found incompatible line version %u\n", - smeta_buf->header.version); + le16_to_cpu(smeta_buf->header.version)); return ERR_PTR(-EINVAL); } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 67e623bd5c2df..9ece409993fec 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -310,6 +310,7 @@ enum { }; #define PBLK_MAGIC 0x70626c6b /*pblk*/ +#define SMETA_VERSION cpu_to_le16(1) struct line_header { __le32 crc; -- cgit v1.2.3 From 32825ebb06fafeff463ed23e9d0dea459ebd30fe Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:00 +0200 Subject: lightnvm: pblk: reuse pblk_gc_should_kick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a trivial change which reuses pblk_gc_should_kick instead of repeating it again in pblk_rl_free_lines_inc. 
Signed-off-by: Rakesh Pandit Made it apply to the common case. Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 -- drivers/lightnvm/pblk-rl.c | 33 +++++++++------------------------ drivers/lightnvm/pblk.h | 1 - 3 files changed, 9 insertions(+), 27 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 8536d38ef97e3..a68c6ae536e59 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1591,8 +1591,6 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) spin_unlock(&line->lock); spin_unlock(&l_mg->gc_lock); - - pblk_gc_should_kick(pblk); } void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 2e6a5361baf0e..9565c3bc4d0bb 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -96,9 +96,11 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) * * Only the total number of free blocks is used to configure the rate limiter. */ -static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) +static void pblk_rl_update_rates(struct pblk_rl *rl) { + struct pblk *pblk = container_of(rl, struct pblk, rl); unsigned long free_blocks = pblk_rl_nr_free_blks(rl); + int max = rl->rb_budget; if (free_blocks >= rl->high) { rl->rb_user_max = max; @@ -124,23 +126,18 @@ static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) rl->rb_state = PBLK_RL_LOW; } - return rl->rb_state; + if (rl->rb_state == (PBLK_RL_MID | PBLK_RL_LOW)) + pblk_gc_should_start(pblk); + else + pblk_gc_should_stop(pblk); } void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) { - struct pblk *pblk = container_of(rl, struct pblk, rl); int blk_in_line = atomic_read(&line->blk_in_line); - int ret; atomic_add(blk_in_line, &rl->free_blocks); - /* Rates will not change that often - no need to lock update */ - ret = pblk_rl_update_rates(rl, rl->rb_budget); - - if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) - pblk_gc_should_start(pblk); - else - pblk_gc_should_stop(pblk); + pblk_rl_update_rates(rl); } void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) @@ -148,19 +145,7 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) int blk_in_line = atomic_read(&line->blk_in_line); atomic_sub(blk_in_line, &rl->free_blocks); -} - -void pblk_gc_should_kick(struct pblk *pblk) -{ - struct pblk_rl *rl = &pblk->rl; - int ret; - - /* Rates will not change that often - no need to lock update */ - ret = pblk_rl_update_rates(rl, rl->rb_budget); - if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) - pblk_gc_should_start(pblk); - else - pblk_gc_should_stop(pblk); + pblk_rl_update_rates(rl); } int pblk_rl_high_thrs(struct pblk_rl *rl) diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 9ece409993fec..3a07c5b61a0cd 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -824,7 +824,6 @@ int pblk_gc_init(struct pblk *pblk); void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); -void pblk_gc_should_kick(struct pblk *pblk); void pblk_gc_kick(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); -- cgit v1.2.3 From a1121176ff757e3c073490a69608ea0b18a00ec1 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:01 +0200 Subject: lightnvm: pblk: initialize debug stat counter 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initialize the stat counter for garbage collected reads. Fixes: a4bd217b43268 ("lightnvm: physical block device (pblk) target") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 8c85779e9635e..83445115a922e 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -947,6 +947,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, atomic_long_set(&pblk->recov_writes, 0); atomic_long_set(&pblk->recov_writes, 0); atomic_long_set(&pblk->recov_gc_writes, 0); + atomic_long_set(&pblk->recov_gc_reads, 0); #endif atomic_long_set(&pblk->read_failed, 0); -- cgit v1.2.3 From 7d327a9ed6c4dca341ebf99012e0a6b80a3050e6 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:02 +0200 Subject: lightnvm: pblk: use right flag for GC allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The data buffer for the GC path allocates virtual memory through vmalloc. When this change was introduced, a flag signaling kmalloc'ed memory was wrongly introduced. Use the right flag when creating a bio from this buffer. Fixes: de54e703a422 ("lightnvm: pblk: use vmalloc for GC data buffer") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d682e89e64935..ee8efb55b3307 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -499,7 +499,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, data_len = (*secs_to_gc) * geo->sec_size; bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len, - PBLK_KMALLOC_META, GFP_KERNEL); + PBLK_VMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); goto err_free_dma; @@ -519,7 +519,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, if (ret) { bio_endio(bio); pr_err("pblk: GC read request failed\n"); - goto err_free_dma; + goto err_free_bio; } if (!wait_for_completion_io_timeout(&wait, @@ -541,10 +541,13 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, atomic_long_sub(*secs_to_gc, &pblk->inflight_reads); #endif + bio_put(bio); out: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return NVM_IO_OK; +err_free_bio: + bio_put(bio); err_free_dma: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return NVM_IO_ERR; -- cgit v1.2.3 From cd8ddbf7a5e206fe6995ab0aee245d597dd6a7f2 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:03 +0200 Subject: lightnvm: pblk: free padded entries in write buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a REQ_FLUSH reaches pblk, the bio cannot be directly completed. Instead, data on the write buffer is flushed and the bio is completed on the completion pah. This might require some sectors to be padded in order to guarantee a successful write. This patch fixes a memory leak on the padded pages. A consequence of this bad free was that internal bios not containing data (only a flush) were not being completed. 
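The leak is an unbalanced pair: pad pages are appended to the bio at submission time but were not released on the write-completion path. A hedged sketch of the intended pairing, using the driver's own helpers (pad pages occupy the bio slots from nr_valid up to nr_valid + nr_padded - 1):

	/* Submission side: append nr_padded blank pages after the nr_valid user pages. */
	pblk_bio_add_pages(pblk, bio, GFP_KERNEL, nr_padded);

	/* Completion side: free exactly those pages, starting at the first padded index. */
	if (c_ctx->nr_padded)
		pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, c_ctx->nr_padded);
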
Fixes: a4bd217b4326 ("lightnvm: physical block device (pblk) target") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 1 - drivers/lightnvm/pblk-write.c | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index a68c6ae536e59..9299a5a75a187 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -190,7 +190,6 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, WARN_ON(off + nr_pages != bio->bi_vcnt); - bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE); for (i = off; i < nr_pages + off; i++) { bv = bio->bi_io_vec[i]; mempool_free(bv.bv_page, pblk->page_pool); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 3ad9e56d24734..d89ac573f8d83 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -33,6 +33,10 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, bio_endio(original_bio); } + if (c_ctx->nr_padded) + pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, + c_ctx->nr_padded); + #ifdef CONFIG_NVM_DEBUG atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes); #endif @@ -521,7 +525,8 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) struct bio *bio = rqd->bio; if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, bio, rqd->nr_ppas, c_ctx->nr_padded); + pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid, + c_ctx->nr_padded); } static int pblk_submit_write(struct pblk *pblk) -- cgit v1.2.3 From e0e12a707f02fcde1b77a2417d9fb0ae1ce3b003 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:04 +0200 Subject: lightnvm: pblk: fix write I/O sync stat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix stat counter to collect the right number of I/Os being synced on the completion path. Fixes: 0880a9aa2d91f ("lightnvm: pblk: delete redundant buffer pointer") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index d89ac573f8d83..d82ca8bd83908 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -38,7 +38,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, c_ctx->nr_padded); #ifdef CONFIG_NVM_DEBUG - atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes); + atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); #endif ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); -- cgit v1.2.3 From da67e68fb9d37fb9072b20cc75d4337a73bc01b4 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:05 +0200 Subject: lightnvm: pblk: avoid deadlock on low LUN config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On low LUN configurations, make sure not to send bios that are bigger than the buffer size. 
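The deadlock scenario is that a bio larger than the entire write buffer can never be absorbed in one go, so the submitter would wait forever for space that cannot become available all at once; the guard therefore caps user I/O at a limit derived from the buffer budget and splits anything bigger. Condensed sketch (rb_max_io is set to half the budget at init time in the patch below):

	if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
		blk_queue_split(q, &bio);	/* split rather than stall on buffer space */
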
Fixes: a4bd217b4326 ("lightnvm: physical block device (pblk) target") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 +- drivers/lightnvm/pblk-rl.c | 6 ++++++ drivers/lightnvm/pblk.h | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 83445115a922e..eee4eeb47d077 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -46,7 +46,7 @@ static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, * user I/Os. Unless stalled, the rate limiter leaves at least 256KB * available for user I/O. */ - if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl))) + if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl)) blk_queue_split(q, &bio); return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 9565c3bc4d0bb..0896439a91b0f 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -163,6 +163,11 @@ int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) return rl->rb_user_max; } +int pblk_rl_max_io(struct pblk_rl *rl) +{ + return rl->rb_max_io; +} + static void pblk_rl_u_timer(unsigned long data) { struct pblk_rl *rl = (struct pblk_rl *)data; @@ -199,6 +204,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) /* To start with, all buffer is available to user I/O writers */ rl->rb_budget = budget; rl->rb_user_max = budget; + rl->rb_max_io = budget >> 1; rl->rb_gc_max = 0; rl->rb_state = PBLK_RL_HIGH; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 3a07c5b61a0cd..b592e5194b0f2 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -267,6 +267,7 @@ struct pblk_rl { int rb_gc_max; /* Max buffer entries available for GC I/O */ int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ int rb_state; /* Rate-limiter current state */ + int rb_max_io; /* Maximum size for an I/O giving the config */ atomic_t rb_user_cnt; /* User I/O buffer counter */ atomic_t rb_gc_cnt; /* GC I/O buffer counter */ @@ -844,6 +845,7 @@ int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); +int pblk_rl_max_io(struct pblk_rl *rl); void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left); -- cgit v1.2.3 From bd432417681a224d9fa4a9d43be7d4edc82135b2 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:06 +0200 Subject: lightnvm: pblk: fix min size for page mempool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk uses an internal page mempool for allocating pages on internal bios. The main two users of this memory pool are partial reads (reads with some sectors in cache and some on media) and padded writes, which need to add dummy pages to an existing bio already containing valid data (and with a large enough bioset allocated). In both cases, the maximum number of pages per bio is defined by the maximum number of physical sectors supported by the underlying device. 
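The general rule is that a mempool only guarantees forward progress if its reserved element count covers the single largest consumer, so the reserve has to track the biggest bio the device will ever accept rather than an arbitrary constant. A minimal sketch of that sizing rule, using the helpers the patch below switches to:

	/* Reserve one page per sector of the largest internal bio the device reports. */
	int max_pages = nvm_max_phys_sects(dev);

	pool = mempool_create_page_pool(max_pages, 0);
	if (!pool)
		return -ENOMEM;
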
This patch fixes a bad mempool allocation, where the min_nr of elements on the pool was fixed (to 16), which is lower than the maximum number of sectors supported by NVMe (as of the time for this patch). Instead, use the maximum number of allowed sectors reported by the device. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 6 +++--- drivers/lightnvm/pblk-init.c | 15 ++++++++------- drivers/lightnvm/pblk-read.c | 2 +- drivers/lightnvm/pblk.h | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 9299a5a75a187..f5fbb9a46784c 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -192,7 +192,7 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, for (i = off; i < nr_pages + off; i++) { bv = bio->bi_io_vec[i]; - mempool_free(bv.bv_page, pblk->page_pool); + mempool_free(bv.bv_page, pblk->page_bio_pool); } } @@ -204,14 +204,14 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, int i, ret; for (i = 0; i < nr_pages; i++) { - page = mempool_alloc(pblk->page_pool, flags); + page = mempool_alloc(pblk->page_bio_pool, flags); if (!page) goto err; ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); if (ret != PBLK_EXPOSED_PAGE_SIZE) { pr_err("pblk: could not add page to bio\n"); - mempool_free(page, pblk->page_pool); + mempool_free(page, pblk->page_bio_pool); goto err; } } diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index eee4eeb47d077..7b1f29c713381 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -132,7 +132,6 @@ static int pblk_rwb_init(struct pblk *pblk) } /* Minimum pages needed within a lun */ -#define PAGE_POOL_SIZE 16 #define ADDR_POOL_SIZE 64 static int pblk_set_ppaf(struct pblk *pblk) @@ -247,14 +246,16 @@ static int pblk_core_init(struct pblk *pblk) if (pblk_init_global_caches(pblk)) return -ENOMEM; - pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0); - if (!pblk->page_pool) + /* internal bios can be at most the sectors signaled by the device. 
*/ + pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev), + 0); + if (!pblk->page_bio_pool) return -ENOMEM; pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE, pblk_blk_ws_cache); if (!pblk->line_ws_pool) - goto free_page_pool; + goto free_page_bio_pool; pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); if (!pblk->rec_pool) @@ -309,8 +310,8 @@ free_rec_pool: mempool_destroy(pblk->rec_pool); free_blk_ws_pool: mempool_destroy(pblk->line_ws_pool); -free_page_pool: - mempool_destroy(pblk->page_pool); +free_page_bio_pool: + mempool_destroy(pblk->page_bio_pool); return -ENOMEM; } @@ -322,7 +323,7 @@ static void pblk_core_free(struct pblk *pblk) if (pblk->bb_wq) destroy_workqueue(pblk->bb_wq); - mempool_destroy(pblk->page_pool); + mempool_destroy(pblk->page_bio_pool); mempool_destroy(pblk->line_ws_pool); mempool_destroy(pblk->rec_pool); mempool_destroy(pblk->g_rq_pool); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index ee8efb55b3307..402c732f0970a 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -238,7 +238,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, kunmap_atomic(src_p); kunmap_atomic(dst_p); - mempool_free(src_bv.bv_page, pblk->page_pool); + mempool_free(src_bv.bv_page, pblk->page_bio_pool); hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1); } while (hole < nr_secs); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index b592e5194b0f2..229f6020ad8ad 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -620,7 +620,7 @@ struct pblk { struct list_head compl_list; - mempool_t *page_pool; + mempool_t *page_bio_pool; mempool_t *line_ws_pool; mempool_t *rec_pool; mempool_t *g_rq_pool; -- cgit v1.2.3 From b84ae4a8b883b96b95fff0e3979ff2c65bbf96b0 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:07 +0200 Subject: lightnvm: pblk: simplify work_queue mempool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In pblk, we have a mempool to allocate a generic structure that we pass along workqueues. This is heavily used in the GC path in order to have enough inflight reads and fully utilize the GC bandwidth. However, the current GC path copies data to the host memory and puts it back into the write buffer. This requires a vmalloc allocation for the data and a memory copy. Thus, guaranteeing the allocation by using a mempool for the structure in itself does not give us much. Until we implement support for vector copy to avoid moving data through the host, just allocate the workqueue structure using kmalloc. This allows us to have a much smaller mempool. 
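The rule of thumb applied here: a mempool earns its permanently reserved memory only on paths that must make forward progress even when allocations fail (typically writeback under memory pressure); a GC read that can simply be retried later does not qualify, so a plain allocation with normal error handling is enough. Hedged sketch of the simplified GC-side allocation:

	struct pblk_line_ws *gc_rq_ws = kmalloc(sizeof(*gc_rq_ws), GFP_KERNEL);

	if (!gc_rq_ws)
		goto fail_free_gc_rq;	/* unwind earlier allocations; GC can revisit the line later */
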
Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 13 +++++++------ drivers/lightnvm/pblk-gc.c | 32 ++++++++++++++++---------------- drivers/lightnvm/pblk-init.c | 32 ++++++++++++++++---------------- drivers/lightnvm/pblk-write.c | 4 ++-- drivers/lightnvm/pblk.h | 11 ++++++----- 5 files changed, 47 insertions(+), 45 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index f5fbb9a46784c..b92532211866d 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -33,7 +33,8 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", line->id, pos); - pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, pblk->bb_wq); + pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, + GFP_ATOMIC, pblk->bb_wq); } static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) @@ -1623,7 +1624,7 @@ void pblk_line_close_ws(struct work_struct *work) struct pblk_line *line = line_ws->line; pblk_line_close(pblk, line); - mempool_free(line_ws, pblk->line_ws_pool); + mempool_free(line_ws, pblk->gen_ws_pool); } void pblk_line_mark_bb(struct work_struct *work) @@ -1648,16 +1649,16 @@ void pblk_line_mark_bb(struct work_struct *work) } kfree(ppa); - mempool_free(line_ws, pblk->line_ws_pool); + mempool_free(line_ws, pblk->gen_ws_pool); } -void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), +void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, + void (*work)(struct work_struct *), gfp_t gfp_mask, struct workqueue_struct *wq) { struct pblk_line_ws *line_ws; - line_ws = mempool_alloc(pblk->line_ws_pool, GFP_ATOMIC); + line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask); if (!line_ws) return; diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 6090d28f7995a..f163829ecca82 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -136,12 +136,12 @@ static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) static void pblk_gc_line_ws(struct work_struct *work) { - struct pblk_line_ws *line_rq_ws = container_of(work, + struct pblk_line_ws *gc_rq_ws = container_of(work, struct pblk_line_ws, ws); - struct pblk *pblk = line_rq_ws->pblk; + struct pblk *pblk = gc_rq_ws->pblk; struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line = line_rq_ws->line; - struct pblk_gc_rq *gc_rq = line_rq_ws->priv; + struct pblk_line *line = gc_rq_ws->line; + struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; up(&gc->gc_sem); @@ -151,7 +151,7 @@ static void pblk_gc_line_ws(struct work_struct *work) gc_rq->nr_secs); } - mempool_free(line_rq_ws, pblk->line_ws_pool); + kfree(gc_rq_ws); } static void pblk_gc_line_prepare_ws(struct work_struct *work) @@ -164,7 +164,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) struct pblk_line_meta *lm = &pblk->lm; struct pblk_gc *gc = &pblk->gc; struct line_emeta *emeta_buf; - struct pblk_line_ws *line_rq_ws; + struct pblk_line_ws *gc_rq_ws; struct pblk_gc_rq *gc_rq; __le64 *lba_list; int sec_left, nr_secs, bit; @@ -223,19 +223,19 @@ next_rq: gc_rq->nr_secs = nr_secs; gc_rq->line = line; - line_rq_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); - if (!line_rq_ws) + gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); + if (!gc_rq_ws) goto fail_free_gc_rq; - line_rq_ws->pblk = pblk; - line_rq_ws->line = line; - line_rq_ws->priv = 
gc_rq; + gc_rq_ws->pblk = pblk; + gc_rq_ws->line = line; + gc_rq_ws->priv = gc_rq; down(&gc->gc_sem); kref_get(&line->ref); - INIT_WORK(&line_rq_ws->ws, pblk_gc_line_ws); - queue_work(gc->gc_line_reader_wq, &line_rq_ws->ws); + INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); + queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws); sec_left -= nr_secs; if (sec_left > 0) @@ -243,7 +243,7 @@ next_rq: out: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); - mempool_free(line_ws, pblk->line_ws_pool); + kfree(line_ws); kref_put(&line->ref, pblk_line_put); atomic_dec(&gc->inflight_gc); @@ -256,7 +256,7 @@ fail_free_emeta: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); pblk_put_line_back(pblk, line); kref_put(&line->ref, pblk_line_put); - mempool_free(line_ws, pblk->line_ws_pool); + kfree(line_ws); atomic_dec(&gc->inflight_gc); pr_err("pblk: Failed to GC line %d\n", line->id); @@ -269,7 +269,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); - line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); + line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); if (!line_ws) return -ENOMEM; diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 7b1f29c713381..3405522535802 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -20,7 +20,7 @@ #include "pblk.h" -static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, +static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, *pblk_w_rq_cache, *pblk_line_meta_cache; static DECLARE_RWSEM(pblk_lock); struct bio_set *pblk_bio_set; @@ -184,9 +184,9 @@ static int pblk_init_global_caches(struct pblk *pblk) char cache_name[PBLK_CACHE_NAME_LEN]; down_write(&pblk_lock); - pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws", + pblk_ws_cache = kmem_cache_create("pblk_blk_ws", sizeof(struct pblk_line_ws), 0, 0, NULL); - if (!pblk_blk_ws_cache) { + if (!pblk_ws_cache) { up_write(&pblk_lock); return -ENOMEM; } @@ -194,7 +194,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_rec_cache = kmem_cache_create("pblk_rec", sizeof(struct pblk_rec_ctx), 0, 0, NULL); if (!pblk_rec_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); up_write(&pblk_lock); return -ENOMEM; } @@ -202,7 +202,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, 0, 0, NULL); if (!pblk_g_rq_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); up_write(&pblk_lock); return -ENOMEM; @@ -211,7 +211,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, 0, 0, NULL); if (!pblk_w_rq_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); up_write(&pblk_lock); @@ -223,7 +223,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_line_meta_cache = kmem_cache_create(cache_name, pblk->lm.sec_bitmap_len, 0, 0, NULL); if (!pblk_line_meta_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); @@ -246,20 +246,20 @@ static int pblk_core_init(struct pblk *pblk) if (pblk_init_global_caches(pblk)) return -ENOMEM; - /* internal bios can be at most the 
sectors signaled by the device. */ + /* Internal bios can be at most the sectors signaled by the device. */ pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev), 0); if (!pblk->page_bio_pool) return -ENOMEM; - pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE, - pblk_blk_ws_cache); - if (!pblk->line_ws_pool) + pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE, + pblk_ws_cache); + if (!pblk->gen_ws_pool) goto free_page_bio_pool; pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); if (!pblk->rec_pool) - goto free_blk_ws_pool; + goto free_gen_ws_pool; pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE, pblk_g_rq_cache); @@ -308,8 +308,8 @@ free_g_rq_pool: mempool_destroy(pblk->g_rq_pool); free_rec_pool: mempool_destroy(pblk->rec_pool); -free_blk_ws_pool: - mempool_destroy(pblk->line_ws_pool); +free_gen_ws_pool: + mempool_destroy(pblk->gen_ws_pool); free_page_bio_pool: mempool_destroy(pblk->page_bio_pool); return -ENOMEM; @@ -324,13 +324,13 @@ static void pblk_core_free(struct pblk *pblk) destroy_workqueue(pblk->bb_wq); mempool_destroy(pblk->page_bio_pool); - mempool_destroy(pblk->line_ws_pool); + mempool_destroy(pblk->gen_ws_pool); mempool_destroy(pblk->rec_pool); mempool_destroy(pblk->g_rq_pool); mempool_destroy(pblk->w_rq_pool); mempool_destroy(pblk->line_meta_pool); - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index d82ca8bd83908..c73b17bca06b0 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -198,8 +198,8 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); if (sync == emeta->nr_entries) - pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws, - pblk->close_wq); + pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, + GFP_ATOMIC, pblk->close_wq); bio_put(rqd->bio); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 229f6020ad8ad..efaa781abb06c 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -40,7 +40,6 @@ #define PBLK_MAX_REQ_ADDRS (64) #define PBLK_MAX_REQ_ADDRS_PW (6) -#define PBLK_WS_POOL_SIZE (128) #define PBLK_META_POOL_SIZE (128) #define PBLK_READ_REQ_POOL_SIZE (1024) @@ -61,6 +60,8 @@ #define ERASE 2 /* READ = 0, WRITE = 1 */ +#define PBLK_GEN_WS_POOL_SIZE (2) + enum { /* IO Types */ PBLK_IOTYPE_USER = 1 << 0, @@ -621,7 +622,7 @@ struct pblk { struct list_head compl_list; mempool_t *page_bio_pool; - mempool_t *line_ws_pool; + mempool_t *gen_ws_pool; mempool_t *rec_pool; mempool_t *g_rq_pool; mempool_t *w_rq_pool; @@ -725,9 +726,9 @@ void pblk_line_close_meta_sync(struct pblk *pblk); void pblk_line_close_ws(struct work_struct *work); void pblk_pipeline_stop(struct pblk *pblk); void pblk_line_mark_bb(struct work_struct *work); -void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), - struct workqueue_struct *wq); +void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, + void (*work)(struct work_struct *), gfp_t gfp_mask, + struct workqueue_struct *wq); u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line); int 
pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, -- cgit v1.2.3 From 0d880398cb6254ab3e110e2a8a659da65a56ffee Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:08 +0200 Subject: lightnvm: pblk: decouple read/erase mempools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since read and erase paths offer different guarantees for inflight I/Os, separate the mempools to set the right min_nr for each on creation. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 8 ++++---- drivers/lightnvm/pblk-init.c | 22 +++++++++++++++------- drivers/lightnvm/pblk.h | 5 +++-- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index b92532211866d..0c22e5ccdfdd9 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -64,7 +64,7 @@ static void pblk_end_io_erase(struct nvm_rq *rqd) struct pblk *pblk = rqd->private; __pblk_end_io_erase(pblk, rqd); - mempool_free(rqd, pblk->g_rq_pool); + mempool_free(rqd, pblk->e_rq_pool); } void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, @@ -161,7 +161,7 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; } else { - pool = pblk->g_rq_pool; + pool = pblk->r_rq_pool; rq_size = pblk_g_rq_size; } @@ -178,7 +178,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) if (rw == WRITE) pool = pblk->w_rq_pool; else - pool = pblk->g_rq_pool; + pool = pblk->r_rq_pool; mempool_free(rqd, pool); } @@ -1479,7 +1479,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) struct nvm_rq *rqd; int err; - rqd = mempool_alloc(pblk->g_rq_pool, GFP_KERNEL); + rqd = mempool_alloc(pblk->e_rq_pool, GFP_KERNEL); memset(rqd, 0, pblk_g_rq_size); pblk_setup_e_rq(pblk, rqd, ppa); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 3405522535802..2f8d3f9ffbafc 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -261,15 +261,20 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->rec_pool) goto free_gen_ws_pool; - pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE, + pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns, pblk_g_rq_cache); - if (!pblk->g_rq_pool) + if (!pblk->r_rq_pool) goto free_rec_pool; - pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns * 2, + pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns, + pblk_g_rq_cache); + if (!pblk->e_rq_pool) + goto free_r_rq_pool; + + pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns, pblk_w_rq_cache); if (!pblk->w_rq_pool) - goto free_g_rq_pool; + goto free_e_rq_pool; pblk->line_meta_pool = mempool_create_slab_pool(PBLK_META_POOL_SIZE, @@ -304,8 +309,10 @@ free_line_meta_pool: mempool_destroy(pblk->line_meta_pool); free_w_rq_pool: mempool_destroy(pblk->w_rq_pool); -free_g_rq_pool: - mempool_destroy(pblk->g_rq_pool); +free_e_rq_pool: + mempool_destroy(pblk->e_rq_pool); +free_r_rq_pool: + mempool_destroy(pblk->r_rq_pool); free_rec_pool: mempool_destroy(pblk->rec_pool); free_gen_ws_pool: @@ -326,7 +333,8 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->page_bio_pool); mempool_destroy(pblk->gen_ws_pool); mempool_destroy(pblk->rec_pool); - mempool_destroy(pblk->g_rq_pool); + mempool_destroy(pblk->r_rq_pool); + mempool_destroy(pblk->e_rq_pool); 
mempool_destroy(pblk->w_rq_pool); mempool_destroy(pblk->line_meta_pool); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index efaa781abb06c..419e1b7328e40 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -41,7 +41,6 @@ #define PBLK_MAX_REQ_ADDRS_PW (6) #define PBLK_META_POOL_SIZE (128) -#define PBLK_READ_REQ_POOL_SIZE (1024) #define PBLK_NR_CLOSE_JOBS (4) @@ -60,6 +59,7 @@ #define ERASE 2 /* READ = 0, WRITE = 1 */ +/* Static pool sizes */ #define PBLK_GEN_WS_POOL_SIZE (2) enum { @@ -624,8 +624,9 @@ struct pblk { mempool_t *page_bio_pool; mempool_t *gen_ws_pool; mempool_t *rec_pool; - mempool_t *g_rq_pool; + mempool_t *r_rq_pool; mempool_t *w_rq_pool; + mempool_t *e_rq_pool; mempool_t *line_meta_pool; struct workqueue_struct *close_wq; -- cgit v1.2.3 From e72ec1d31bcb6dffe325418c6d96f2fcab7c2654 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:09 +0200 Subject: lightnvm: pblk: do not use a mempool for line bitmaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk holds two sector bitmaps: one to keep track of the mapped sectors while the line is active and another one to keep track of the invalid sectors. The latter is kept during the whole live of the line, until it is recycled. Since we cannot guarantee forward progress for the mempool in this case, get rid of the mempool and simply allocate memory through kmalloc. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 26 ++++++++++---------------- drivers/lightnvm/pblk-init.c | 29 ++--------------------------- drivers/lightnvm/pblk-recovery.c | 2 +- drivers/lightnvm/pblk-write.c | 4 +--- drivers/lightnvm/pblk.h | 3 --- 5 files changed, 14 insertions(+), 50 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0c22e5ccdfdd9..215aadb84c6e0 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1095,25 +1095,21 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) struct pblk_line_meta *lm = &pblk->lm; int blk_in_line = atomic_read(&line->blk_in_line); - line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); + line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC); if (!line->map_bitmap) return -ENOMEM; - memset(line->map_bitmap, 0, lm->sec_bitmap_len); - /* invalid_bitmap is special since it is used when line is closed. 
No - * need to zeroized; it will be initialized using bb info form - * map_bitmap - */ - line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); + /* will be initialized using bb info from map_bitmap */ + line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_ATOMIC); if (!line->invalid_bitmap) { - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); return -ENOMEM; } spin_lock(&line->lock); if (line->state != PBLK_LINESTATE_FREE) { - mempool_free(line->invalid_bitmap, pblk->line_meta_pool); - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); + kfree(line->invalid_bitmap); spin_unlock(&line->lock); WARN(1, "pblk: corrupted line %d, state %d\n", line->id, line->state); @@ -1165,7 +1161,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) { - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); line->map_bitmap = NULL; line->smeta = NULL; line->emeta = NULL; @@ -1440,10 +1436,8 @@ retry_setup: void pblk_line_free(struct pblk *pblk, struct pblk_line *line) { - if (line->map_bitmap) - mempool_free(line->map_bitmap, pblk->line_meta_pool); - if (line->invalid_bitmap) - mempool_free(line->invalid_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); + kfree(line->invalid_bitmap); *line->vsc = cpu_to_le32(EMPTY_ENTRY); @@ -1584,7 +1578,7 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) list_add_tail(&line->list, move_list); - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); line->map_bitmap = NULL; line->smeta = NULL; line->emeta = NULL; diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 2f8d3f9ffbafc..4d719782f65b4 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -21,7 +21,7 @@ #include "pblk.h" static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, - *pblk_w_rq_cache, *pblk_line_meta_cache; + *pblk_w_rq_cache; static DECLARE_RWSEM(pblk_lock); struct bio_set *pblk_bio_set; @@ -181,8 +181,6 @@ static int pblk_set_ppaf(struct pblk *pblk) static int pblk_init_global_caches(struct pblk *pblk) { - char cache_name[PBLK_CACHE_NAME_LEN]; - down_write(&pblk_lock); pblk_ws_cache = kmem_cache_create("pblk_blk_ws", sizeof(struct pblk_line_ws), 0, 0, NULL); @@ -217,19 +215,6 @@ static int pblk_init_global_caches(struct pblk *pblk) up_write(&pblk_lock); return -ENOMEM; } - - snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s", - pblk->disk->disk_name); - pblk_line_meta_cache = kmem_cache_create(cache_name, - pblk->lm.sec_bitmap_len, 0, 0, NULL); - if (!pblk_line_meta_cache) { - kmem_cache_destroy(pblk_ws_cache); - kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_g_rq_cache); - kmem_cache_destroy(pblk_w_rq_cache); - up_write(&pblk_lock); - return -ENOMEM; - } up_write(&pblk_lock); return 0; @@ -276,16 +261,10 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->w_rq_pool) goto free_e_rq_pool; - pblk->line_meta_pool = - mempool_create_slab_pool(PBLK_META_POOL_SIZE, - pblk_line_meta_cache); - if (!pblk->line_meta_pool) - goto free_w_rq_pool; - pblk->close_wq = alloc_workqueue("pblk-close-wq", WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); if (!pblk->close_wq) - goto free_line_meta_pool; + goto free_w_rq_pool; pblk->bb_wq = alloc_workqueue("pblk-bb-wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); @@ -305,8 +284,6 @@ free_bb_wq: destroy_workqueue(pblk->bb_wq); free_close_wq: 
destroy_workqueue(pblk->close_wq); -free_line_meta_pool: - mempool_destroy(pblk->line_meta_pool); free_w_rq_pool: mempool_destroy(pblk->w_rq_pool); free_e_rq_pool: @@ -336,13 +313,11 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->r_rq_pool); mempool_destroy(pblk->e_rq_pool); mempool_destroy(pblk->w_rq_pool); - mempool_destroy(pblk->line_meta_pool); kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); - kmem_cache_destroy(pblk_line_meta_cache); } static void pblk_luns_free(struct pblk *pblk) diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index caf1242795757..de5270712be77 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -987,7 +987,7 @@ next: list_move_tail(&line->list, move_list); spin_unlock(&l_mg->gc_lock); - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); line->map_bitmap = NULL; line->smeta = NULL; line->emeta = NULL; diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index c73b17bca06b0..26c2b83451497 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -411,8 +411,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) if (emeta->mem >= lm->emeta_len[0]) { spin_lock(&l_mg->close_lock); list_del(&meta_line->list); - WARN(!bitmap_full(meta_line->map_bitmap, lm->sec_per_line), - "pblk: corrupt meta line %d\n", meta_line->id); spin_unlock(&l_mg->close_lock); } @@ -456,7 +454,7 @@ retry: return 0; } meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); - if (bitmap_full(meta_line->map_bitmap, lm->sec_per_line)) + if (meta_line->emeta->mem >= lm->emeta_len[0]) goto retry; spin_unlock(&l_mg->close_lock); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 419e1b7328e40..60edcda0fc7f7 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -40,8 +40,6 @@ #define PBLK_MAX_REQ_ADDRS (64) #define PBLK_MAX_REQ_ADDRS_PW (6) -#define PBLK_META_POOL_SIZE (128) - #define PBLK_NR_CLOSE_JOBS (4) #define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) @@ -627,7 +625,6 @@ struct pblk { mempool_t *r_rq_pool; mempool_t *w_rq_pool; mempool_t *e_rq_pool; - mempool_t *line_meta_pool; struct workqueue_struct *close_wq; struct workqueue_struct *bb_wq; -- cgit v1.2.3 From 2942f50fa389a62865572452dce6214a8aed69dc Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:10 +0200 Subject: lightnvm: pblk: remove checks on mempool alloc. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of the mempool audit on pblk, remove unnecessary mempool allocation checks on mempools. 
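For illustration only, a minimal sketch of the pattern these hunks drop (not taken verbatim from the diff): mempool_alloc() called with a sleeping gfp mask such as GFP_KERNEL waits for an element from the pool's pre-allocated reserve and therefore cannot return NULL, so the error branches are dead code.

        /*
         * Sketch, assuming a slab-backed mempool sized at init time:
         * a GFP_KERNEL allocation sleeps until an element is available
         * and never returns NULL, so no "if (!rqd)" branch is needed.
         */
        struct nvm_rq *rqd;

        rqd = mempool_alloc(pblk->w_rq_pool, GFP_KERNEL);
        memset(rqd, 0, pblk_w_rq_size);

The same reasoning applies to bio_alloc() with GFP_KERNEL, which is backed by a bio_set and does not fail for sleeping allocations, so the bio allocation checks can be removed as well.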
Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 4 ---- drivers/lightnvm/pblk-read.c | 8 -------- drivers/lightnvm/pblk-recovery.c | 35 +++++++---------------------------- drivers/lightnvm/pblk-write.c | 24 +++++------------------- 4 files changed, 12 insertions(+), 59 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 215aadb84c6e0..0da58869006b0 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -206,8 +206,6 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, for (i = 0; i < nr_pages; i++) { page = mempool_alloc(pblk->page_bio_pool, flags); - if (!page) - goto err; ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); if (ret != PBLK_EXPOSED_PAGE_SIZE) { @@ -1653,8 +1651,6 @@ void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, struct pblk_line_ws *line_ws; line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask); - if (!line_ws) - return; line_ws->pblk = pblk; line_ws->line = line; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 402c732f0970a..d2b6e2a7d7d50 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -168,10 +168,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, DECLARE_COMPLETION_ONSTACK(wait); new_bio = bio_alloc(GFP_KERNEL, nr_holes); - if (!new_bio) { - pr_err("pblk: could not alloc read bio\n"); - return NVM_IO_ERR; - } if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) goto err; @@ -321,10 +317,6 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) bitmap_zero(&read_bitmap, nr_secs); rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) { - pr_err_ratelimited("pblk: not able to alloc rqd"); - return NVM_IO_ERR; - } rqd->opcode = NVM_OP_PREAD; rqd->bio = bio; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index de5270712be77..6b6b4183b41e7 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -34,10 +34,6 @@ void pblk_submit_rec(struct work_struct *work) max_secs); bio = bio_alloc(GFP_KERNEL, nr_rec_secs); - if (!bio) { - pr_err("pblk: not able to create recovery bio\n"); - return; - } bio->bi_iter.bi_sector = 0; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -85,11 +81,6 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; rec_rqd = pblk_alloc_rqd(pblk, WRITE); - if (IS_ERR(rec_rqd)) { - pr_err("pblk: could not create recovery req.\n"); - return -ENOMEM; - } - rec_ctx = nvm_rq_to_pdu(rec_rqd); /* Copy completion bitmap, but exclude the first X completed entries */ @@ -404,22 +395,18 @@ next_pad_rq: ppa_list = (void *)(meta_list) + pblk_dma_meta_size; dma_ppa_list = dma_meta_list + pblk_dma_meta_size; - rqd = pblk_alloc_rqd(pblk, WRITE); - if (IS_ERR(rqd)) { - ret = PTR_ERR(rqd); - goto fail_free_meta; - } - bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, PBLK_VMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { ret = PTR_ERR(bio); - goto fail_free_rqd; + goto fail_free_meta; } bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + rqd = pblk_alloc_rqd(pblk, WRITE); + rqd->bio = bio; rqd->opcode = NVM_OP_PWRITE; rqd->flags = pblk_set_progr_mode(pblk, WRITE); @@ -490,8 +477,6 @@ free_rq: fail_free_bio: bio_put(bio); -fail_free_rqd: - pblk_free_rqd(pblk, rqd, WRITE); fail_free_meta: 
nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); fail_free_pad: @@ -785,15 +770,9 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) dma_addr_t dma_ppa_list, dma_meta_list; int done, ret = 0; - rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) - return PTR_ERR(rqd); - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); - if (!meta_list) { - ret = -ENOMEM; - goto free_rqd; - } + if (!meta_list) + return -ENOMEM; ppa_list = (void *)(meta_list) + pblk_dma_meta_size; dma_ppa_list = dma_meta_list + pblk_dma_meta_size; @@ -804,6 +783,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) goto free_meta_list; } + rqd = pblk_alloc_rqd(pblk, READ); + p.ppa_list = ppa_list; p.meta_list = meta_list; p.rqd = rqd; @@ -832,8 +813,6 @@ out: kfree(data); free_meta_list: nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); -free_rqd: - pblk_free_rqd(pblk, rqd, READ); return ret; } diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 26c2b83451497..0fb8f26a6311a 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -111,10 +111,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) ppa_list = &rqd->ppa_addr; recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC); - if (!recovery) { - pr_err("pblk: could not allocate recovery context\n"); - return; - } + INIT_LIST_HEAD(&recovery->failed); bit = -1; @@ -375,10 +372,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) int ret; rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) { - pr_err("pblk: cannot allocate write req.\n"); - return PTR_ERR(rqd); - } + m_ctx = nvm_rq_to_pdu(rqd); m_ctx->private = meta_line; @@ -546,19 +540,12 @@ static int pblk_submit_write(struct pblk *pblk) if (!secs_to_flush && secs_avail < pblk->min_write_pgs) return 1; - rqd = pblk_alloc_rqd(pblk, WRITE); - if (IS_ERR(rqd)) { - pr_err("pblk: cannot allocate write req.\n"); - return 1; - } - bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs); - if (!bio) { - pr_err("pblk: cannot allocate write bio\n"); - goto fail_free_rqd; - } + bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + rqd = pblk_alloc_rqd(pblk, WRITE); rqd->bio = bio; secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); @@ -589,7 +576,6 @@ fail_free_bio: pblk_free_write_rqd(pblk, rqd); fail_put_bio: bio_put(bio); -fail_free_rqd: pblk_free_rqd(pblk, rqd, WRITE); return 1; -- cgit v1.2.3 From 3627896a4b12ea6bb9e0ff77724a24f53726db2d Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:11 +0200 Subject: lightnvm: pblk: use constant for GC max inflight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a constant to set the maximum number of inflight GC requests allowed. 
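Condensed from the two hunks that follow (a sketch, not the full diff), the shape of the change is that a single named constant now bounds both the GC semaphore and the GC write queue depth, so the two limits cannot drift apart:

        #define PBLK_GC_RQ_QD 128       /* Queue depth for inflight GC requests */

        /* at GC init time */
        sema_init(&gc->gc_sem, PBLK_GC_RQ_QD);

        /* on the GC write path */
        if (gc->w_entries >= PBLK_GC_RQ_QD) {
                spin_unlock(&gc->w_lock);
                pblk_gc_writer_kick(&pblk->gc);
                usleep_range(128, 256);
                goto retry;
        }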
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 4 ++-- drivers/lightnvm/pblk.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index f163829ecca82..c21b2077432a0 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -93,7 +93,7 @@ static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) retry: spin_lock(&gc->w_lock); - if (gc->w_entries >= PBLK_GC_W_QD) { + if (gc->w_entries >= PBLK_GC_RQ_QD) { spin_unlock(&gc->w_lock); pblk_gc_writer_kick(&pblk->gc); usleep_range(128, 256); @@ -602,7 +602,7 @@ int pblk_gc_init(struct pblk *pblk) spin_lock_init(&gc->w_lock); spin_lock_init(&gc->r_lock); - sema_init(&gc->gc_sem, 128); + sema_init(&gc->gc_sem, PBLK_GC_RQ_QD); INIT_LIST_HEAD(&gc->w_list); INIT_LIST_HEAD(&gc->r_list); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 60edcda0fc7f7..baa6a633990fa 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -816,7 +816,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, * pblk gc */ #define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ -#define PBLK_GC_W_QD 128 /* Queue depth for inflight GC write I/Os */ +#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ #define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ #define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ -- cgit v1.2.3 From 9f6cb13bb40bd9067498e908a3272aba998c0309 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:12 +0200 Subject: lightnvm: pblk: normalize ppa namings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Normalize the way we name ppa variables to improve code readability. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 48 +++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0da58869006b0..b6d7c66601491 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1746,7 +1746,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) { - struct ppa_addr l2p_ppa; + struct ppa_addr ppa_l2p; /* logic error: lba out-of-bounds. 
Ignore update */ if (!(lba < pblk->rl.nr_secs)) { @@ -1755,10 +1755,10 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) } spin_lock(&pblk->trans_lock); - l2p_ppa = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); - if (!pblk_addr_in_cache(l2p_ppa) && !pblk_ppa_empty(l2p_ppa)) - pblk_map_invalidate(pblk, l2p_ppa); + if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)) + pblk_map_invalidate(pblk, ppa_l2p); pblk_trans_map_set(pblk, lba, ppa); spin_unlock(&pblk->trans_lock); @@ -1775,16 +1775,16 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) pblk_update_map(pblk, lba, ppa); } -int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, +int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, struct pblk_line *gc_line) { - struct ppa_addr l2p_ppa; + struct ppa_addr ppa_l2p; int ret = 1; #ifdef CONFIG_NVM_DEBUG /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(!pblk_addr_in_cache(ppa)); - BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); + BUG_ON(!pblk_addr_in_cache(ppa_new)); + BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); #endif /* logic error: lba out-of-bounds. Ignore update */ @@ -1794,36 +1794,38 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, } spin_lock(&pblk->trans_lock); - l2p_ppa = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); /* Prevent updated entries to be overwritten by GC */ - if (pblk_addr_in_cache(l2p_ppa) || pblk_ppa_empty(l2p_ppa) || - pblk_tgt_ppa_to_line(l2p_ppa) != gc_line->id) { + if (pblk_addr_in_cache(ppa_l2p) || pblk_ppa_empty(ppa_l2p) || + pblk_tgt_ppa_to_line(ppa_l2p) != gc_line->id) { + ret = 0; goto out; } - pblk_trans_map_set(pblk, lba, ppa); + pblk_trans_map_set(pblk, lba, ppa_new); out: spin_unlock(&pblk->trans_lock); return ret; } -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, - struct ppa_addr entry_line) +void pblk_update_map_dev(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache) { - struct ppa_addr l2p_line; + struct ppa_addr ppa_l2p; #ifdef CONFIG_NVM_DEBUG /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa)); + BUG_ON(pblk_addr_in_cache(ppa_mapped)); #endif /* Invalidate and discard padded entries */ if (lba == ADDR_EMPTY) { #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->padded_wb); #endif - pblk_map_invalidate(pblk, ppa); + if (!pblk_ppa_empty(ppa_mapped)) + pblk_map_invalidate(pblk, ppa_mapped); return; } @@ -1834,22 +1836,22 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, } spin_lock(&pblk->trans_lock); - l2p_line = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); /* Do not update L2P if the cacheline has been updated. 
In this case, * the mapped ppa must be invalidated */ - if (l2p_line.ppa != entry_line.ppa) { - if (!pblk_ppa_empty(ppa)) - pblk_map_invalidate(pblk, ppa); + if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) { + if (!pblk_ppa_empty(ppa_mapped)) + pblk_map_invalidate(pblk, ppa_mapped); goto out; } #ifdef CONFIG_NVM_DEBUG - WARN_ON(!pblk_addr_in_cache(l2p_line) && !pblk_ppa_empty(l2p_line)); + WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); #endif - pblk_trans_map_set(pblk, lba, ppa); + pblk_trans_map_set(pblk, lba, ppa_mapped); out: spin_unlock(&pblk->trans_lock); } -- cgit v1.2.3 From 84454e6de56bb5c8629c41ed09aaf5750ff56f5f Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:13 +0200 Subject: lightnvm: pblk: refactor read lba sanity check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor lba sanity check on read path to avoid code duplication. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d2b6e2a7d7d50..eaaf9d55ba97d 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -39,21 +39,14 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, } static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned long *read_bitmap) + sector_t blba, unsigned long *read_bitmap) { struct bio *bio = rqd->bio; struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; - sector_t blba = pblk_get_lba(bio); int nr_secs = rqd->nr_ppas; bool advanced_bio = false; int i, j = 0; - /* logic error: lba out-of-bounds. Ignore read request */ - if (blba + nr_secs >= pblk->rl.nr_secs) { - WARN(1, "pblk: read lbas out of bounds\n"); - return; - } - pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs); for (i = 0; i < nr_secs; i++) { @@ -259,17 +252,10 @@ err: } static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned long *read_bitmap) + sector_t lba, unsigned long *read_bitmap) { struct bio *bio = rqd->bio; struct ppa_addr ppa; - sector_t lba = pblk_get_lba(bio); - - /* logic error: lba out-of-bounds. Ignore read request */ - if (lba >= pblk->rl.nr_secs) { - WARN(1, "pblk: read lba out of bounds\n"); - return; - } pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); @@ -305,14 +291,19 @@ retry: int pblk_submit_read(struct pblk *pblk, struct bio *bio) { struct nvm_tgt_dev *dev = pblk->dev; + sector_t blba = pblk_get_lba(bio); unsigned int nr_secs = pblk_get_secs(bio); struct nvm_rq *rqd; unsigned long read_bitmap; /* Max 64 ppas per request */ unsigned int bio_init_idx; int ret = NVM_IO_ERR; - if (nr_secs > PBLK_MAX_REQ_ADDRS) + /* logic error: lba out-of-bounds. 
Ignore read request */ + if (blba >= pblk->rl.nr_secs || nr_secs > PBLK_MAX_REQ_ADDRS) { + WARN(1, "pblk: read lba out of bounds (lba:%llu, nr:%d)\n", + (unsigned long long)blba, nr_secs); return NVM_IO_ERR; + } bitmap_zero(&read_bitmap, nr_secs); @@ -340,9 +331,9 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; - pblk_read_ppalist_rq(pblk, rqd, &read_bitmap); + pblk_read_ppalist_rq(pblk, rqd, blba, &read_bitmap); } else { - pblk_read_rq(pblk, rqd, &read_bitmap); + pblk_read_rq(pblk, rqd, blba, &read_bitmap); } bio_get(bio); -- cgit v1.2.3 From d340121eb770de3b02bfc73c5f2b00f5345090c2 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:14 +0200 Subject: lightnvm: pblk: simplify data validity check on GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a line is selected for recycling by the garbage collector (GC), the line state changes and the invalid bitmap is frozen, preventing invalidations from happening. Throughout the GC, the L2P map is checked to verify that not data being recycled has been updated. The last check is done before the new map is being stored on the L2P table. Though this algorithm works, it requires a number of corner cases to be checked each time the L2P table is being updated. This complicates readability and is error prone in case that the recycling algorithm is modified. Instead, this patch makes the invalid bitmap accessible even when the line is being recycled. When recycled data is being remapped, it is enough to check the invalid bitmap for the line before updating the L2P table. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-cache.c | 20 +++++------ drivers/lightnvm/pblk-core.c | 29 +++++++--------- drivers/lightnvm/pblk-gc.c | 58 +++++++++++++++++-------------- drivers/lightnvm/pblk-rb.c | 6 ++-- drivers/lightnvm/pblk-read.c | 79 +++++++++++++++++++++++-------------------- drivers/lightnvm/pblk.h | 23 ++++--------- 6 files changed, 109 insertions(+), 106 deletions(-) diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 024a8fc93069e..1d6b8e3585f1e 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -73,12 +73,11 @@ out: * On GC the incoming lbas are not necessarily sequential. Also, some of the * lbas might not be valid entries, which are marked as empty by the GC thread */ -int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, - unsigned int nr_entries, unsigned int nr_rec_entries, - struct pblk_line *gc_line, unsigned long flags) +int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq) { struct pblk_w_ctx w_ctx; unsigned int bpos, pos; + void *data = gc_rq->data; int i, valid_entries; /* Update the write buffer head (mem) with the entries that we can @@ -86,28 +85,29 @@ int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, * rollback from here on. 
*/ retry: - if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) { + if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) { io_schedule(); goto retry; } - w_ctx.flags = flags; + w_ctx.flags = PBLK_IOTYPE_GC; pblk_ppa_set_empty(&w_ctx.ppa); - for (i = 0, valid_entries = 0; i < nr_entries; i++) { - if (lba_list[i] == ADDR_EMPTY) + for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) { + if (gc_rq->lba_list[i] == ADDR_EMPTY) continue; - w_ctx.lba = lba_list[i]; + w_ctx.lba = gc_rq->lba_list[i]; pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); - pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos); + pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line, + gc_rq->paddr_list[i], pos); data += PBLK_EXPOSED_PAGE_SIZE; valid_entries++; } - WARN_ONCE(nr_rec_entries != valid_entries, + WARN_ONCE(gc_rq->secs_to_gc != valid_entries, "pblk: inconsistent GC write\n"); #ifdef CONFIG_NVM_DEBUG diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index b6d7c66601491..6dd4866e579c0 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -78,11 +78,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, * that newer updates are not overwritten. */ spin_lock(&line->lock); - if (line->state == PBLK_LINESTATE_GC || - line->state == PBLK_LINESTATE_FREE) { - spin_unlock(&line->lock); - return; - } + WARN_ON(line->state == PBLK_LINESTATE_FREE); if (test_and_set_bit(paddr, line->invalid_bitmap)) { WARN_ONCE(1, "pblk: double invalidate\n"); @@ -99,8 +95,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, spin_lock(&l_mg->gc_lock); spin_lock(&line->lock); /* Prevent moving a line that has just been chosen for GC */ - if (line->state == PBLK_LINESTATE_GC || - line->state == PBLK_LINESTATE_FREE) { + if (line->state == PBLK_LINESTATE_GC) { spin_unlock(&line->lock); spin_unlock(&l_mg->gc_lock); return; @@ -1766,6 +1761,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) { + #ifdef CONFIG_NVM_DEBUG /* Callers must ensure that the ppa points to a cache address */ BUG_ON(!pblk_addr_in_cache(ppa)); @@ -1776,9 +1772,9 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) } int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, - struct pblk_line *gc_line) + struct pblk_line *gc_line, u64 paddr_gc) { - struct ppa_addr ppa_l2p; + struct ppa_addr ppa_l2p, ppa_gc; int ret = 1; #ifdef CONFIG_NVM_DEBUG @@ -1795,10 +1791,13 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, spin_lock(&pblk->trans_lock); ppa_l2p = pblk_trans_map_get(pblk, lba); + ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id); - /* Prevent updated entries to be overwritten by GC */ - if (pblk_addr_in_cache(ppa_l2p) || pblk_ppa_empty(ppa_l2p) || - pblk_tgt_ppa_to_line(ppa_l2p) != gc_line->id) { + if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) { + spin_lock(&gc_line->lock); + WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap), + "pblk: corrupted GC update"); + spin_unlock(&gc_line->lock); ret = 0; goto out; @@ -1870,15 +1869,13 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, u64 *lba_list, int nr_secs) { - sector_t lba; + u64 lba; int i; spin_lock(&pblk->trans_lock); for (i = 0; i < nr_secs; i++) { lba = lba_list[i]; - if (lba == ADDR_EMPTY) { - ppas[i].ppa = 
ADDR_EMPTY; - } else { + if (lba != ADDR_EMPTY) { /* logic error: lba out-of-bounds. Ignore update */ if (!(lba < pblk->rl.nr_secs)) { WARN(1, "pblk: corrupted L2P map request\n"); diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index c21b2077432a0..7ad0cfe58a21d 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -20,7 +20,8 @@ static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) { - vfree(gc_rq->data); + if (gc_rq->data) + vfree(gc_rq->data); kfree(gc_rq); } @@ -41,10 +42,7 @@ static int pblk_gc_write(struct pblk *pblk) spin_unlock(&gc->w_lock); list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { - pblk_write_gc_to_cache(pblk, gc_rq->data, gc_rq->lba_list, - gc_rq->nr_secs, gc_rq->secs_to_gc, - gc_rq->line, PBLK_IOTYPE_GC); - + pblk_write_gc_to_cache(pblk, gc_rq); list_del(&gc_rq->list); kref_put(&gc_rq->line->ref, pblk_line_put); pblk_gc_free_gc_rq(gc_rq); @@ -69,27 +67,23 @@ static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) struct pblk_gc *gc = &pblk->gc; struct pblk_line *line = gc_rq->line; void *data; - unsigned int secs_to_gc; int ret = 0; data = vmalloc(gc_rq->nr_secs * geo->sec_size); if (!data) { ret = -ENOMEM; - goto out; + goto fail; } - /* Read from GC victim block */ - if (pblk_submit_read_gc(pblk, gc_rq->lba_list, data, gc_rq->nr_secs, - &secs_to_gc, line)) { - ret = -EFAULT; - goto free_data; - } + gc_rq->data = data; - if (!secs_to_gc) - goto free_rq; + /* Read from GC victim block */ + ret = pblk_submit_read_gc(pblk, gc_rq); + if (ret) + goto fail; - gc_rq->data = data; - gc_rq->secs_to_gc = secs_to_gc; + if (!gc_rq->secs_to_gc) + goto fail; retry: spin_lock(&gc->w_lock); @@ -107,11 +101,8 @@ retry: return 0; -free_rq: - kfree(gc_rq); -free_data: - vfree(data); -out: +fail: + pblk_gc_free_gc_rq(gc_rq); kref_put(&line->ref, pblk_line_put); return ret; } @@ -167,14 +158,21 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) struct pblk_line_ws *gc_rq_ws; struct pblk_gc_rq *gc_rq; __le64 *lba_list; + unsigned long *invalid_bitmap; int sec_left, nr_secs, bit; int ret; + invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!invalid_bitmap) { + pr_err("pblk: could not allocate GC invalid bitmap\n"); + goto fail_free_ws; + } + emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type, GFP_KERNEL); if (!emeta_buf) { pr_err("pblk: cannot use GC emeta\n"); - return; + goto fail_free_bitmap; } ret = pblk_line_read_emeta(pblk, line, emeta_buf); @@ -193,7 +191,11 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) goto fail_free_emeta; } + spin_lock(&line->lock); + bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line); sec_left = pblk_line_vsc(line); + spin_unlock(&line->lock); + if (sec_left < 0) { pr_err("pblk: corrupted GC line (%d)\n", line->id); goto fail_free_emeta; @@ -207,11 +209,12 @@ next_rq: nr_secs = 0; do { - bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line, + bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line, bit + 1); if (bit > line->emeta_ssec) break; + gc_rq->paddr_list[nr_secs] = bit; gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); } while (nr_secs < pblk->max_write_pgs); @@ -244,6 +247,7 @@ next_rq: out: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); kfree(line_ws); + kfree(invalid_bitmap); kref_put(&line->ref, pblk_line_put); atomic_dec(&gc->inflight_gc); @@ -254,9 +258,13 @@ fail_free_gc_rq: kfree(gc_rq); fail_free_emeta: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); +fail_free_bitmap: + 
kfree(invalid_bitmap); +fail_free_ws: + kfree(line_ws); + pblk_put_line_back(pblk, line); kref_put(&line->ref, pblk_line_put); - kfree(line_ws); atomic_dec(&gc->inflight_gc); pr_err("pblk: Failed to GC line %d\n", line->id); diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 9bc32578a766e..74c768ce09ef0 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -325,8 +325,8 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, } void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, - unsigned int ring_pos) + struct pblk_w_ctx w_ctx, struct pblk_line *line, + u64 paddr, unsigned int ring_pos) { struct pblk *pblk = container_of(rb, struct pblk, rwb); struct pblk_rb_entry *entry; @@ -341,7 +341,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, __pblk_rb_write_entry(rb, data, w_ctx, entry); - if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line)) + if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr)) entry->w_ctx.lba = ADDR_EMPTY; flags = w_ctx.flags | PBLK_WRITTEN_DATA; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index eaaf9d55ba97d..c28d6509312e2 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -388,34 +388,40 @@ fail_rqd_free: static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_line *line, u64 *lba_list, - unsigned int nr_secs) + u64 *paddr_list_gc, unsigned int nr_secs) { - struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; + struct ppa_addr ppa_list_l2p[PBLK_MAX_REQ_ADDRS]; + struct ppa_addr ppa_gc; int valid_secs = 0; int i; - pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs); + pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs); for (i = 0; i < nr_secs; i++) { - if (pblk_addr_in_cache(ppas[i]) || ppas[i].g.blk != line->id || - pblk_ppa_empty(ppas[i])) { - lba_list[i] = ADDR_EMPTY; + if (lba_list[i] == ADDR_EMPTY) + continue; + + ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id); + if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) { + paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY; continue; } - rqd->ppa_list[valid_secs++] = ppas[i]; + rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; } #ifdef CONFIG_NVM_DEBUG atomic_long_add(valid_secs, &pblk->inflight_reads); #endif + return valid_secs; } static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, sector_t lba) + struct pblk_line *line, sector_t lba, + u64 paddr_gc) { - struct ppa_addr ppa; + struct ppa_addr ppa_l2p, ppa_gc; int valid_secs = 0; if (lba == ADDR_EMPTY) @@ -428,15 +434,14 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, } spin_lock(&pblk->trans_lock); - ppa = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); spin_unlock(&pblk->trans_lock); - /* Ignore updated values until the moment */ - if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id || - pblk_ppa_empty(ppa)) + ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id); + if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) goto out; - rqd->ppa_addr = ppa; + rqd->ppa_addr = ppa_l2p; valid_secs = 1; #ifdef CONFIG_NVM_DEBUG @@ -447,15 +452,14 @@ out: return valid_secs; } -int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, - unsigned int nr_secs, unsigned int *secs_to_gc, - struct pblk_line *line) +int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct bio *bio; struct 
nvm_rq rqd; - int ret, data_len; + int data_len; + int ret = NVM_IO_OK; DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); @@ -463,25 +467,29 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &rqd.dma_meta_list); if (!rqd.meta_list) - return NVM_IO_ERR; + return -ENOMEM; - if (nr_secs > 1) { + if (gc_rq->nr_secs > 1) { rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; - *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list, - nr_secs); - if (*secs_to_gc == 1) + gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line, + gc_rq->lba_list, + gc_rq->paddr_list, + gc_rq->nr_secs); + if (gc_rq->secs_to_gc == 1) rqd.ppa_addr = rqd.ppa_list[0]; } else { - *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]); + gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line, + gc_rq->lba_list[0], + gc_rq->paddr_list[0]); } - if (!(*secs_to_gc)) + if (!(gc_rq->secs_to_gc)) goto out; - data_len = (*secs_to_gc) * geo->sec_size; - bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len, + data_len = (gc_rq->secs_to_gc) * geo->sec_size; + bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, PBLK_VMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); @@ -494,13 +502,12 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, rqd.opcode = NVM_OP_PREAD; rqd.end_io = pblk_end_io_sync; rqd.private = &wait; - rqd.nr_ppas = *secs_to_gc; + rqd.nr_ppas = gc_rq->secs_to_gc; rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd.bio = bio; - ret = pblk_submit_read_io(pblk, &rqd); - if (ret) { - bio_endio(bio); + if (pblk_submit_read_io(pblk, &rqd)) { + ret = -EIO; pr_err("pblk: GC read request failed\n"); goto err_free_bio; } @@ -519,19 +526,19 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, } #ifdef CONFIG_NVM_DEBUG - atomic_long_add(*secs_to_gc, &pblk->sync_reads); - atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads); - atomic_long_sub(*secs_to_gc, &pblk->inflight_reads); + atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); + atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); + atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); #endif bio_put(bio); out: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); - return NVM_IO_OK; + return ret; err_free_bio: bio_put(bio); err_free_dma: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); - return NVM_IO_ERR; + return ret; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index baa6a633990fa..876b50f97234a 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -206,6 +206,7 @@ struct pblk_lun { struct pblk_gc_rq { struct pblk_line *line; void *data; + u64 paddr_list[PBLK_MAX_REQ_ADDRS]; u64 lba_list[PBLK_MAX_REQ_ADDRS]; int nr_secs; int secs_to_gc; @@ -658,8 +659,8 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, struct pblk_w_ctx w_ctx, unsigned int pos); void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, - unsigned int pos); + struct pblk_w_ctx w_ctx, struct pblk_line *line, + u64 paddr, unsigned int pos); struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); void pblk_rb_flush(struct pblk_rb *rb); @@ -761,7 +762,7 @@ void 
pblk_update_map_cache(struct pblk *pblk, sector_t lba, void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, struct ppa_addr entry_line); int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, - struct pblk_line *gc_line); + struct pblk_line *gc_line, u64 paddr); void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, u64 *lba_list, int nr_secs); void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, @@ -772,9 +773,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, */ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags); -int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, - unsigned int nr_entries, unsigned int nr_rec_entries, - struct pblk_line *gc_line, unsigned long flags); +int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); /* * pblk map @@ -798,9 +797,7 @@ void pblk_write_should_kick(struct pblk *pblk); */ extern struct bio_set *pblk_bio_set; int pblk_submit_read(struct pblk *pblk, struct bio *bio); -int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, - unsigned int nr_secs, unsigned int *secs_to_gc, - struct pblk_line *line); +int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); /* * pblk recovery */ @@ -893,13 +890,7 @@ static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) static inline int pblk_line_vsc(struct pblk_line *line) { - int vsc; - - spin_lock(&line->lock); - vsc = le32_to_cpu(*line->vsc); - spin_unlock(&line->lock); - - return vsc; + return le32_to_cpu(*line->vsc); } #define NVM_MEM_PAGE_WRITE (8) -- cgit v1.2.3 From 2a19b10d423c6dc47449e905ed3a8eabb49c48a0 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:15 +0200 Subject: lightnvm: pblk: refactor read path on GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify the part of the garbage collector where data is read from the line being recycled and moved into an internal queue before being copied to the memory buffer. This allows to get rid of a dedicated function, which introduces an unnecessary dependency on the code. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 94 +++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 55 deletions(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 7ad0cfe58a21d..7b103bce58bf7 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -56,57 +56,6 @@ static void pblk_gc_writer_kick(struct pblk_gc *gc) wake_up_process(gc->gc_writer_ts); } -/* - * Responsible for managing all memory related to a gc request. 
Also in case of - * failure - */ -static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line = gc_rq->line; - void *data; - int ret = 0; - - data = vmalloc(gc_rq->nr_secs * geo->sec_size); - if (!data) { - ret = -ENOMEM; - goto fail; - } - - gc_rq->data = data; - - /* Read from GC victim block */ - ret = pblk_submit_read_gc(pblk, gc_rq); - if (ret) - goto fail; - - if (!gc_rq->secs_to_gc) - goto fail; - -retry: - spin_lock(&gc->w_lock); - if (gc->w_entries >= PBLK_GC_RQ_QD) { - spin_unlock(&gc->w_lock); - pblk_gc_writer_kick(&pblk->gc); - usleep_range(128, 256); - goto retry; - } - gc->w_entries++; - list_add_tail(&gc_rq->list, &gc->w_list); - spin_unlock(&gc->w_lock); - - pblk_gc_writer_kick(&pblk->gc); - - return 0; - -fail: - pblk_gc_free_gc_rq(gc_rq); - kref_put(&line->ref, pblk_line_put); - return ret; -} - static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -130,18 +79,53 @@ static void pblk_gc_line_ws(struct work_struct *work) struct pblk_line_ws *gc_rq_ws = container_of(work, struct pblk_line_ws, ws); struct pblk *pblk = gc_rq_ws->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; struct pblk_gc *gc = &pblk->gc; struct pblk_line *line = gc_rq_ws->line; struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; + int ret; up(&gc->gc_sem); - if (pblk_gc_move_valid_secs(pblk, gc_rq)) { - pr_err("pblk: could not GC all sectors: line:%d (%d/%d)\n", - line->id, *line->vsc, - gc_rq->nr_secs); + gc_rq->data = vmalloc(gc_rq->nr_secs * geo->sec_size); + if (!gc_rq->data) { + pr_err("pblk: could not GC line:%d (%d/%d)\n", + line->id, *line->vsc, gc_rq->nr_secs); + goto out; + } + + /* Read from GC victim block */ + ret = pblk_submit_read_gc(pblk, gc_rq); + if (ret) { + pr_err("pblk: failed GC read in line:%d (err:%d)\n", + line->id, ret); + goto out; } + if (!gc_rq->secs_to_gc) + goto out; + +retry: + spin_lock(&gc->w_lock); + if (gc->w_entries >= PBLK_GC_RQ_QD) { + spin_unlock(&gc->w_lock); + pblk_gc_writer_kick(&pblk->gc); + usleep_range(128, 256); + goto retry; + } + gc->w_entries++; + list_add_tail(&gc_rq->list, &gc->w_list); + spin_unlock(&gc->w_lock); + + pblk_gc_writer_kick(&pblk->gc); + + kfree(gc_rq_ws); + return; + +out: + pblk_gc_free_gc_rq(gc_rq); + kref_put(&line->ref, pblk_line_put); kfree(gc_rq_ws); } -- cgit v1.2.3 From 55e836d401601e7903b36db015ce899dc11085ab Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:16 +0200 Subject: lightnvm: pblk: put bio on bio completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify put bio by doing it on bio end_io instead of manually putting it on the completion path. 
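The core of the change, condensed from the pblk-core.c hunk below (a sketch, not the complete diff): bios built by pblk_bio_map_addr() now carry an end_io callback that drops the last reference, so the emeta, GC read and recovery completion paths no longer call bio_put() by hand.

        static void pblk_bio_map_addr_endio(struct bio *bio)
        {
                bio_put(bio);   /* release the bio once the I/O completes */
        }

        /* in pblk_bio_map_addr(), after the data pages have been added */
        bio->bi_end_io = pblk_bio_map_addr_endio;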
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 10 +++++++--- drivers/lightnvm/pblk-read.c | 1 - drivers/lightnvm/pblk-recovery.c | 1 - drivers/lightnvm/pblk-write.c | 8 +------- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 6dd4866e579c0..3d27d24baa0b2 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -417,6 +417,11 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) return nvm_submit_io(dev, rqd); } +static void pblk_bio_map_addr_endio(struct bio *bio) +{ + bio_put(bio); +} + struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, int alloc_type, gfp_t gfp_mask) @@ -453,6 +458,8 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, kaddr += PAGE_SIZE; } + + bio->bi_end_io = pblk_bio_map_addr_endio; out: return bio; } @@ -671,9 +678,6 @@ next_rq: atomic_dec(&pblk->inflight_io); reinit_completion(&wait); - if (likely(pblk->l_mg.emeta_alloc_type == PBLK_VMALLOC_META)) - bio_put(bio); - if (rqd.error) { if (dir == WRITE) pblk_log_write_err(pblk, &rqd); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index c28d6509312e2..e7141b1aaded4 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -531,7 +531,6 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); #endif - bio_put(bio); out: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return ret; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 6b6b4183b41e7..e59270e60b58e 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -333,7 +333,6 @@ static void pblk_end_io_recov(struct nvm_rq *rqd) pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); - bio_put(rqd->bio); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, WRITE); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 0fb8f26a6311a..0c0481cf9f5d2 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -188,17 +188,12 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) pblk_log_write_err(pblk, rqd); pr_err("pblk: metadata I/O failed. 
Line %d\n", line->id); } -#ifdef CONFIG_NVM_DEBUG - else - WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); -#endif sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); if (sync == emeta->nr_entries) pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, GFP_ATOMIC, pblk->close_wq); - bio_put(rqd->bio); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, READ); @@ -427,8 +422,7 @@ fail_rollback: nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); fail_free_bio: - if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META)) - bio_put(bio); + bio_put(bio); fail_free_rqd: pblk_free_rqd(pblk, rqd, READ); return ret; -- cgit v1.2.3 From 6ca2f71f3e3d94d188000b420ce0529b07f3ce95 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:17 +0200 Subject: lightnvm: pblk: simplify path on REQ_PREFLUSH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On REQ_PREFLUSH, directly tag the I/O context flags to signal a flush in the write to cache path, instead of finding the correct entry context and imposing a memory barrier. This simplifies the code and might potentially prevent race conditions when adding functionality to the write path. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-cache.c | 4 +++- drivers/lightnvm/pblk-rb.c | 8 +------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 1d6b8e3585f1e..0d227ef7d1b95 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -43,8 +43,10 @@ retry: if (unlikely(!bio_has_data(bio))) goto out; - w_ctx.flags = flags; pblk_ppa_set_empty(&w_ctx.ppa); + w_ctx.flags = flags; + if (bio->bi_opf & REQ_PREFLUSH) + w_ctx.flags |= PBLK_FLUSH_ENTRY; for (i = 0; i < nr_entries; i++) { void *data = bio_data(bio); diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 74c768ce09ef0..05e6b2e9221df 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -355,7 +355,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, { struct pblk_rb_entry *entry; unsigned int subm, sync_point; - int flags; subm = READ_ONCE(rb->subm); @@ -369,12 +368,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); entry = &rb->entries[sync_point]; - flags = READ_ONCE(entry->w_ctx.flags); - flags |= PBLK_FLUSH_ENTRY; - - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - /* Protect syncs */ smp_store_release(&rb->sync_point, sync_point); @@ -454,6 +447,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, /* Protect from read count */ smp_store_release(&rb->mem, mem); + return 1; } -- cgit v1.2.3 From 875d94f3a4838f2243334e5ce55ac8153f9bbf5b Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:18 +0200 Subject: lightnvm: pblk: allocate bio size more accurately MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wait until we know the exact number of ppas to be sent to the device, before allocating the bio. 
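Condensed from the pblk-write.c hunk below (sketch only): the allocation is reordered so that the bio is sized to secs_to_sync, which is only known after the sync calculation, instead of the worst case pblk->max_write_pgs.

        secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
        if (secs_to_sync > pblk->max_write_pgs) {
                pr_err("pblk: bad buffer sync calculation\n");
                return 1;
        }

        /* the exact number of ppas is now known */
        bio = bio_alloc(GFP_KERNEL, secs_to_sync);
        bio->bi_iter.bi_sector = 0;     /* internal bio */
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);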
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rb.c | 5 +++-- drivers/lightnvm/pblk-write.c | 20 ++++++++++---------- drivers/lightnvm/pblk.h | 4 ++-- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 05e6b2e9221df..1173e2380137d 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -552,12 +552,13 @@ out: * persist data on the write buffer to the media. */ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - struct bio *bio, unsigned int pos, - unsigned int nr_entries, unsigned int count) + unsigned int pos, unsigned int nr_entries, + unsigned int count) { struct pblk *pblk = container_of(rb, struct pblk, rwb); struct request_queue *q = pblk->dev->q; struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct bio *bio = rqd->bio; struct pblk_rb_entry *entry; struct page *page; unsigned int pad = 0, to_read = nr_entries; diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 0c0481cf9f5d2..140a26edd1d3d 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -534,24 +534,24 @@ static int pblk_submit_write(struct pblk *pblk) if (!secs_to_flush && secs_avail < pblk->min_write_pgs) return 1; - bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs); - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - rqd = pblk_alloc_rqd(pblk, WRITE); - rqd->bio = bio; - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); if (secs_to_sync > pblk->max_write_pgs) { pr_err("pblk: bad buffer sync calculation\n"); - goto fail_put_bio; + return 1; } secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync; pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); - if (pblk_rb_read_to_bio(&pblk->rwb, rqd, bio, pos, secs_to_sync, + bio = bio_alloc(GFP_KERNEL, secs_to_sync); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + rqd = pblk_alloc_rqd(pblk, WRITE); + rqd->bio = bio; + + if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, secs_avail)) { pr_err("pblk: corrupted write bio\n"); goto fail_put_bio; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 876b50f97234a..9f162057d4971 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -666,8 +666,8 @@ void pblk_rb_flush(struct pblk_rb *rb); void pblk_rb_sync_l2p(struct pblk_rb *rb); unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - struct bio *bio, unsigned int pos, - unsigned int nr_entries, unsigned int count); + unsigned int pos, unsigned int nr_entries, + unsigned int count); unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, struct list_head *list, unsigned int max); -- cgit v1.2.3 From e2cddf2082e700218b898b1c899f6a1c2130074a Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:19 +0200 Subject: lightnvm: pblk: improve naming for internal req. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each request type sent to the LightNVM subsystem requires different metadata. Until now, we have tailored this metadata based on write, read and erase commands. However, pblk uses different metadata for internal writes that do not hit the write buffer. Instead of abusing the metadata for reads, create a new request type - internal write to improve code readability. 
In the process, create internal values for each I/O type instead of abusing the READ/WRITE macros, as suggested by Christoph. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 32 ++++++++++++++++---------------- drivers/lightnvm/pblk-read.c | 6 +++--- drivers/lightnvm/pblk-recovery.c | 12 ++++++------ drivers/lightnvm/pblk-write.c | 16 ++++++++-------- drivers/lightnvm/pblk.h | 11 ++++++++--- 5 files changed, 41 insertions(+), 36 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 3d27d24baa0b2..a492964abea84 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -152,7 +152,7 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) struct nvm_rq *rqd; int rq_size; - if (rw == WRITE) { + if (rw == PBLK_WRITE) { pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; } else { @@ -170,7 +170,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) { mempool_t *pool; - if (rw == WRITE) + if (rw == PBLK_WRITE) pool = pblk->w_rq_pool; else pool = pblk->r_rq_pool; @@ -569,10 +569,10 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, int ret; DECLARE_COMPLETION_ONSTACK(wait); - if (dir == WRITE) { + if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; cmd_op = NVM_OP_PWRITE; - } else if (dir == READ) { + } else if (dir == PBLK_READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; } else @@ -612,10 +612,10 @@ next_rq: rqd.end_io = pblk_end_io_sync; rqd.private = &wait; - if (dir == WRITE) { + if (dir == PBLK_WRITE) { struct pblk_sec_meta *meta_list = rqd.meta_list; - rqd.flags = pblk_set_progr_mode(pblk, WRITE); + rqd.flags = pblk_set_progr_mode(pblk, PBLK_WRITE); for (i = 0; i < rqd.nr_ppas; ) { spin_lock(&line->lock); paddr = __pblk_alloc_page(pblk, line, min); @@ -679,7 +679,7 @@ next_rq: reinit_completion(&wait); if (rqd.error) { - if (dir == WRITE) + if (dir == PBLK_WRITE) pblk_log_write_err(pblk, &rqd); else pblk_log_read_err(pblk, &rqd); @@ -722,12 +722,12 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, int flags; DECLARE_COMPLETION_ONSTACK(wait); - if (dir == WRITE) { + if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; cmd_op = NVM_OP_PWRITE; - flags = pblk_set_progr_mode(pblk, WRITE); + flags = pblk_set_progr_mode(pblk, PBLK_WRITE); lba_list = emeta_to_lbas(pblk, line->emeta->buf); - } else if (dir == READ) { + } else if (dir == PBLK_READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -765,7 +765,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - if (dir == WRITE) { + if (dir == PBLK_WRITE) { __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); meta_list[i].lba = lba_list[paddr] = addr_empty; @@ -791,7 +791,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, atomic_dec(&pblk->inflight_io); if (rqd.error) { - if (dir == WRITE) + if (dir == PBLK_WRITE) pblk_log_write_err(pblk, &rqd); else pblk_log_read_err(pblk, &rqd); @@ -807,14 +807,14 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line) { u64 bpaddr = pblk_line_smeta_start(pblk, line); - return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ); + return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ); } int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, void *emeta_buf) { return pblk_line_submit_emeta_io(pblk, 
line, emeta_buf, - line->emeta_ssec, READ); + line->emeta_ssec, PBLK_READ); } static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, @@ -823,7 +823,7 @@ static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, rqd->opcode = NVM_OP_ERASE; rqd->ppa_addr = ppa; rqd->nr_ppas = 1; - rqd->flags = pblk_set_progr_mode(pblk, ERASE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_ERASE); rqd->bio = NULL; } @@ -1045,7 +1045,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, line->smeta_ssec = off; line->cur_sec = off + lm->smeta_sec; - if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) { + if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) { pr_debug("pblk: line smeta I/O failed. Retry\n"); return 1; } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index e7141b1aaded4..4b1722fbe5a0c 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -142,7 +142,7 @@ static void pblk_end_io_read(struct nvm_rq *rqd) atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); #endif - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_READ); atomic_dec(&pblk->inflight_io); } @@ -307,7 +307,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) bitmap_zero(&read_bitmap, nr_secs); - rqd = pblk_alloc_rqd(pblk, READ); + rqd = pblk_alloc_rqd(pblk, PBLK_READ); rqd->opcode = NVM_OP_PREAD; rqd->bio = bio; @@ -382,7 +382,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) return NVM_IO_OK; fail_rqd_free: - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_READ); return ret; } diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index e59270e60b58e..19f2fb1a9e4b2 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -67,7 +67,7 @@ void pblk_submit_rec(struct work_struct *work) err: bio_put(bio); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); } int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, @@ -80,7 +80,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, struct pblk_c_ctx *rec_ctx; int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; - rec_rqd = pblk_alloc_rqd(pblk, WRITE); + rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); rec_ctx = nvm_rq_to_pdu(rec_rqd); /* Copy completion bitmap, but exclude the first X completed entries */ @@ -334,7 +334,7 @@ static void pblk_end_io_recov(struct nvm_rq *rqd) pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); kref_put(&pad_rq->ref, pblk_recov_complete); @@ -404,11 +404,11 @@ next_pad_rq: bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd = pblk_alloc_rqd(pblk, WRITE); + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); rqd->bio = bio; rqd->opcode = NVM_OP_PWRITE; - rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; rqd->ppa_list = ppa_list; @@ -782,7 +782,7 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) goto free_meta_list; } - rqd = pblk_alloc_rqd(pblk, READ); + rqd = pblk_alloc_rqd(pblk, PBLK_READ); p.ppa_list = ppa_list; p.meta_list = meta_list; diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 140a26edd1d3d..c1b8b83e149d9 
100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -46,7 +46,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); return ret; } @@ -195,7 +195,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) GFP_ATOMIC, pblk->close_wq); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); } @@ -209,7 +209,7 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, /* Setup write request */ rqd->opcode = NVM_OP_PWRITE; rqd->nr_ppas = nr_secs; - rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); rqd->private = pblk; rqd->end_io = end_io; @@ -275,7 +275,7 @@ int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0); rqd->ppa_status = (u64)0; - rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); return ret; } @@ -366,7 +366,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) int i, j; int ret; - rqd = pblk_alloc_rqd(pblk, READ); + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); m_ctx = nvm_rq_to_pdu(rqd); m_ctx->private = meta_line; @@ -424,7 +424,7 @@ fail_rollback: fail_free_bio: bio_put(bio); fail_free_rqd: - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); return ret; } @@ -548,7 +548,7 @@ static int pblk_submit_write(struct pblk *pblk) bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd = pblk_alloc_rqd(pblk, WRITE); + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); rqd->bio = bio; if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, @@ -570,7 +570,7 @@ fail_free_bio: pblk_free_write_rqd(pblk, rqd); fail_put_bio: bio_put(bio); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); return 1; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 9f162057d4971..d01e003d3d742 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -55,11 +55,16 @@ for ((i) = 0, rlun = &(pblk)->luns[0]; \ (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)]) -#define ERASE 2 /* READ = 0, WRITE = 1 */ - /* Static pool sizes */ #define PBLK_GEN_WS_POOL_SIZE (2) +enum { + PBLK_READ = READ, + PBLK_WRITE = WRITE,/* Write from write buffer */ + PBLK_WRITE_INT, /* Internal write - no write buffer */ + PBLK_ERASE, +}; + enum { /* IO Types */ PBLK_IOTYPE_USER = 1 << 0, @@ -1132,7 +1137,7 @@ static inline int pblk_set_progr_mode(struct pblk *pblk, int type) flags = geo->plane_mode >> 1; - if (type == WRITE) + if (type == PBLK_WRITE) flags |= NVM_IO_SCRAMBLE_ENABLE; return flags; -- cgit v1.2.3 From 67bf26a3220e3bd403a62a9289aa1d065d3db82c Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:20 +0200 Subject: lightnvm: pblk: refactor rqd alloc/free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the rqd allocation and free functions so that all I/O types can use these helper functions. 
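With the helpers in place, each I/O path can pair allocation and free on the same request type; a rough usage sketch (as the hunks below show, the free side now also releases the per-request DMA metadata list):

	rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
	/* ... build and submit the request ... */
	pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); /* also frees rqd->meta_list DMA region */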
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 40 ++++++++++++++++++++++++++++++---------- drivers/lightnvm/pblk-read.c | 2 -- drivers/lightnvm/pblk-recovery.c | 2 -- drivers/lightnvm/pblk-write.c | 7 ------- drivers/lightnvm/pblk.h | 4 ++-- 5 files changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index a492964abea84..0de3875211b17 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -146,18 +146,26 @@ static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, spin_unlock(&pblk->trans_lock); } -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) +/* Caller must guarantee that the request is a valid type */ +struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type) { mempool_t *pool; struct nvm_rq *rqd; int rq_size; - if (rw == PBLK_WRITE) { + switch (type) { + case PBLK_WRITE: + case PBLK_WRITE_INT: pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; - } else { + break; + case PBLK_READ: pool = pblk->r_rq_pool; rq_size = pblk_g_rq_size; + break; + default: + pool = pblk->e_rq_pool; + rq_size = pblk_g_rq_size; } rqd = mempool_alloc(pool, GFP_KERNEL); @@ -166,15 +174,30 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) return rqd; } -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) +/* Typically used on completion path. Cannot guarantee request consistency */ +void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) { + struct nvm_tgt_dev *dev = pblk->dev; mempool_t *pool; - if (rw == PBLK_WRITE) + switch (type) { + case PBLK_WRITE: + kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); + case PBLK_WRITE_INT: pool = pblk->w_rq_pool; - else + break; + case PBLK_READ: pool = pblk->r_rq_pool; + break; + case PBLK_ERASE: + pool = pblk->e_rq_pool; + break; + default: + pr_err("pblk: trying to free unknown rqd type\n"); + return; + } + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); mempool_free(rqd, pool); } @@ -1470,8 +1493,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) struct nvm_rq *rqd; int err; - rqd = mempool_alloc(pblk->e_rq_pool, GFP_KERNEL); - memset(rqd, 0, pblk_g_rq_size); + rqd = pblk_alloc_rqd(pblk, PBLK_ERASE); pblk_setup_e_rq(pblk, rqd, ppa); @@ -1739,8 +1761,6 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, rlun = &pblk->luns[bit]; up(&rlun->wr_sem); } - - kfree(lun_bitmap); } void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 4b1722fbe5a0c..d7c90c303540f 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -124,8 +124,6 @@ static void pblk_end_io_read(struct nvm_rq *rqd) WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); #endif - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - bio_put(bio); if (r_ctx->private) { struct bio *orig_bio = r_ctx->private; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 19f2fb1a9e4b2..686bc17f080ff 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -329,11 +329,9 @@ static void pblk_end_io_recov(struct nvm_rq *rqd) { struct pblk_pad_rq *pad_rq = rqd->private; struct pblk *pblk = pad_rq->pblk; - struct nvm_tgt_dev *dev = pblk->dev; pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); - nvm_dev_dma_free(dev->parent, rqd->meta_list, 
rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index c1b8b83e149d9..f2e846fe9242a 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -20,7 +20,6 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx) { - struct nvm_tgt_dev *dev = pblk->dev; struct bio *original_bio; unsigned long ret; int i; @@ -43,8 +42,6 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - bio_put(rqd->bio); pblk_free_rqd(pblk, rqd, PBLK_WRITE); @@ -176,7 +173,6 @@ static void pblk_end_io_write(struct nvm_rq *rqd) static void pblk_end_io_write_meta(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; - struct nvm_tgt_dev *dev = pblk->dev; struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); struct pblk_line *line = m_ctx->private; struct pblk_emeta *emeta = line->emeta; @@ -194,7 +190,6 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, GFP_ATOMIC, pblk->close_wq); - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); @@ -419,8 +414,6 @@ fail_rollback: pblk_dealloc_page(pblk, meta_line, rq_ppas); list_add(&meta_line->list, &meta_line->list); spin_unlock(&l_mg->close_lock); - - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); fail_free_bio: bio_put(bio); fail_free_rqd: diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index d01e003d3d742..12a20f800c261 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -699,11 +699,11 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); /* * pblk core */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw); +struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type); +void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type); void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx); -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw); void pblk_wait_for_meta(struct pblk *pblk); struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba); void pblk_discard(struct pblk *pblk, struct bio *bio); -- cgit v1.2.3 From 26532ee52b77185b095d29b54c83386f737a74ba Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:21 +0200 Subject: lightnvm: pblk: use rqd->end_io for completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For consistency with the rest of pblk, use rqd->end_io to point to the function taking care of ending the request on the completion path. 
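For the synchronous partial-read case, the shape of the change is roughly (a sketch; the wait itself is unchanged and assumed to follow as in the existing sync paths):

	DECLARE_COMPLETION_ONSTACK(wait);

	/* before: new_bio->bi_private = &wait; new_bio->bi_end_io = pblk_end_bio_sync; */
	rqd->end_io = pblk_end_io_sync;	/* completes rqd->private */
	rqd->private = &wait;
	/* ... submit, then wait on &wait as before ... */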
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 7 ------- drivers/lightnvm/pblk-read.c | 5 ++--- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0de3875211b17..08d166ac4f3c0 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -261,13 +261,6 @@ void pblk_write_should_kick(struct pblk *pblk) pblk_write_kick(pblk); } -void pblk_end_bio_sync(struct bio *bio) -{ - struct completion *waiting = bio->bi_private; - - complete(waiting); -} - void pblk_end_io_sync(struct nvm_rq *rqd) { struct completion *waiting = rqd->private; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d7c90c303540f..0299fc08291d7 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -170,13 +170,12 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, new_bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(new_bio, REQ_OP_READ, 0); - new_bio->bi_private = &wait; - new_bio->bi_end_io = pblk_end_bio_sync; rqd->bio = new_bio; rqd->nr_ppas = nr_holes; rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); - rqd->end_io = NULL; + rqd->end_io = pblk_end_io_sync; + rqd->private = &wait; if (unlikely(nr_secs > 1 && nr_holes == 1)) { ppa_ptr = rqd->ppa_list; -- cgit v1.2.3 From a4809fee4e774fdf3296cc69c22ce6e6acef36b2 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:22 +0200 Subject: lightnvm: pblk: check lba sanity on read path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of pblk's recovery scheme, we store the lba mapped to each physical sector on the device's out-of-bound (OOB) area. On the read path, we can use this information to validate that the data being delivered to the upper layers corresponds to the lba being requested. The cost of this check is an extra copy on the DMA region on the device and an extra comparison in the host, given that (i) the OOB area is being read together with the data in the media, and (ii) the DMA region allocated for the ppa list can be reused for the metadata stored on the OOB area. 
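The check itself is small: for a read starting at logical block blba, sector i must carry lba == blba + i in its OOB metadata unless the sector was empty or padded (condensed from pblk_read_check() in the hunks below):

	u64 lba = le64_to_cpu(meta_list[i].lba);

	if (lba != ADDR_EMPTY)	/* empty/padded sectors carry no mapping */
		WARN(lba != blba + i, "pblk: corrupted read LBA\n");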
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 51 +++++++++++++++++++++++++++++++++++++++++--- drivers/lightnvm/pblk.h | 4 +++- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 0299fc08291d7..a465d9980df4c 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -41,6 +41,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, sector_t blba, unsigned long *read_bitmap) { + struct pblk_sec_meta *meta_list = rqd->meta_list; struct bio *bio = rqd->bio; struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; int nr_secs = rqd->nr_ppas; @@ -56,6 +57,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, retry: if (pblk_ppa_empty(p)) { WARN_ON(test_and_set_bit(i, read_bitmap)); + meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); if (unlikely(!advanced_bio)) { bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); @@ -75,6 +77,7 @@ retry: goto retry; } WARN_ON(test_and_set_bit(i, read_bitmap)); + meta_list[i].lba = cpu_to_le64(lba); advanced_bio = true; #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->cache_reads); @@ -110,10 +113,26 @@ static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd) return NVM_IO_OK; } +static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd, + sector_t blba) +{ + struct pblk_sec_meta *meta_list = rqd->meta_list; + int nr_lbas = rqd->nr_ppas; + int i; + + for (i = 0; i < nr_lbas; i++) { + u64 lba = le64_to_cpu(meta_list[i].lba); + + if (lba == ADDR_EMPTY) + continue; + + WARN(lba != blba + i, "pblk: corrupted read LBA\n"); + } +} + static void pblk_end_io_read(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; - struct nvm_tgt_dev *dev = pblk->dev; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct bio *bio = rqd->bio; @@ -124,6 +143,8 @@ static void pblk_end_io_read(struct nvm_rq *rqd) WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); #endif + pblk_read_check(pblk, rqd, r_ctx->lba); + bio_put(bio); if (r_ctx->private) { struct bio *orig_bio = r_ctx->private; @@ -149,15 +170,21 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, unsigned long *read_bitmap) { struct bio *new_bio, *bio = rqd->bio; + struct pblk_sec_meta *meta_list = rqd->meta_list; struct bio_vec src_bv, dst_bv; void *ppa_ptr = NULL; void *src_p, *dst_p; dma_addr_t dma_ppa_list = 0; + __le64 *lba_list_mem, *lba_list_media; int nr_secs = rqd->nr_ppas; int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); int i, ret, hole; DECLARE_COMPLETION_ONSTACK(wait); + /* Re-use allocated memory for intermediate lbas */ + lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); + lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); + new_bio = bio_alloc(GFP_KERNEL, nr_holes); if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) @@ -168,6 +195,9 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, goto err; } + for (i = 0; i < nr_secs; i++) + lba_list_mem[i] = meta_list[i].lba; + new_bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(new_bio, REQ_OP_READ, 0); @@ -207,10 +237,17 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->dma_ppa_list = dma_ppa_list; } + for (i = 0; i < nr_secs; i++) { + lba_list_media[i] = meta_list[i].lba; + meta_list[i].lba = lba_list_mem[i]; + } + /* Fill the 
holes in the original bio */ i = 0; hole = find_first_zero_bit(read_bitmap, nr_secs); do { + meta_list[hole].lba = lba_list_media[i]; + src_bv = new_bio->bi_io_vec[i++]; dst_bv = bio->bi_io_vec[bio_init_idx + hole]; @@ -251,6 +288,7 @@ err: static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, sector_t lba, unsigned long *read_bitmap) { + struct pblk_sec_meta *meta_list = rqd->meta_list; struct bio *bio = rqd->bio; struct ppa_addr ppa; @@ -263,6 +301,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, retry: if (pblk_ppa_empty(ppa)) { WARN_ON(test_and_set_bit(0, read_bitmap)); + meta_list[0].lba = cpu_to_le64(ADDR_EMPTY); return; } @@ -274,6 +313,9 @@ retry: pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); goto retry; } + + meta_list[0].lba = cpu_to_le64(lba); + WARN_ON(test_and_set_bit(0, read_bitmap)); #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->cache_reads); @@ -290,9 +332,10 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) struct nvm_tgt_dev *dev = pblk->dev; sector_t blba = pblk_get_lba(bio); unsigned int nr_secs = pblk_get_secs(bio); + struct pblk_g_ctx *r_ctx; struct nvm_rq *rqd; - unsigned long read_bitmap; /* Max 64 ppas per request */ unsigned int bio_init_idx; + unsigned long read_bitmap; /* Max 64 ppas per request */ int ret = NVM_IO_ERR; /* logic error: lba out-of-bounds. Ignore read request */ @@ -312,6 +355,9 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd->private = pblk; rqd->end_io = pblk_end_io_read; + r_ctx = nvm_rq_to_pdu(rqd); + r_ctx->lba = blba; + /* Save the index for this bio's start. This is needed in case * we need to fill a partial read. */ @@ -344,7 +390,6 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) /* All sectors are to be read from the device */ if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { struct bio *int_bio = NULL; - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); /* Clone read bio to deal with read errors internally */ int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 12a20f800c261..4a51e6d4d0361 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -99,6 +99,7 @@ enum { }; #define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) +#define pblk_dma_ppa_size (sizeof(u64) * PBLK_MAX_REQ_ADDRS) /* write buffer completion context */ struct pblk_c_ctx { @@ -110,9 +111,10 @@ struct pblk_c_ctx { unsigned int nr_padded; }; -/* generic context */ +/* read context */ struct pblk_g_ctx { void *private; + u64 lba; }; /* Pad context */ -- cgit v1.2.3 From 7bd4d370db6090004a06deb526f0f01fa99a3f9f Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:23 +0200 Subject: lightnvm: pblk: guarantee line integrity on reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a line is recycled during garbage collection, reads can still be issued to the line. If the line is freed in the middle of this process, data corruption might occur. This patch guarantees that lines are not freed in the middle of reads that target them (lines). Specifically, we use the existing line reference to decide when a line is eligible for being freed after the recycle process. 
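In outline, the reference flow is (condensed from the hunks below; the final put is deferred to the new r_end_wq workqueue rather than performed inline in the completion):

	/* L2P lookup: a device-resident ppa implies a valid line reference, pin it */
	if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa))
		kref_get(&pblk->lines[pblk_dev_ppa_to_line(ppa)].ref);

	/* read completion: drop the reference; if last, the line is freed via r_end_wq */
	kref_put(&line->ref, pblk_line_put_wq);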
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 56 ++++++++++++++++++++++++++++++---- drivers/lightnvm/pblk-init.c | 14 +++++++-- drivers/lightnvm/pblk-read.c | 71 +++++++++++++++++++++++++++++++++----------- drivers/lightnvm/pblk.h | 2 ++ 4 files changed, 118 insertions(+), 25 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 08d166ac4f3c0..0a41fb998d556 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1460,10 +1460,8 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line) line->emeta = NULL; } -void pblk_line_put(struct kref *ref) +static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) { - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; struct pblk_line_mgmt *l_mg = &pblk->l_mg; spin_lock(&line->lock); @@ -1481,6 +1479,43 @@ void pblk_line_put(struct kref *ref) pblk_rl_free_lines_inc(&pblk->rl, line); } +static void pblk_line_put_ws(struct work_struct *work) +{ + struct pblk_line_ws *line_put_ws = container_of(work, + struct pblk_line_ws, ws); + struct pblk *pblk = line_put_ws->pblk; + struct pblk_line *line = line_put_ws->line; + + __pblk_line_put(pblk, line); + mempool_free(line_put_ws, pblk->gen_ws_pool); +} + +void pblk_line_put(struct kref *ref) +{ + struct pblk_line *line = container_of(ref, struct pblk_line, ref); + struct pblk *pblk = line->pblk; + + __pblk_line_put(pblk, line); +} + +void pblk_line_put_wq(struct kref *ref) +{ + struct pblk_line *line = container_of(ref, struct pblk_line, ref); + struct pblk *pblk = line->pblk; + struct pblk_line_ws *line_put_ws; + + line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC); + if (!line_put_ws) + return; + + line_put_ws->pblk = pblk; + line_put_ws->line = line; + line_put_ws->priv = NULL; + + INIT_WORK(&line_put_ws->ws, pblk_line_put_ws); + queue_work(pblk->r_end_wq, &line_put_ws->ws); +} + int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) { struct nvm_rq *rqd; @@ -1878,8 +1913,19 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, int i; spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) - ppas[i] = pblk_trans_map_get(pblk, blba + i); + for (i = 0; i < nr_secs; i++) { + struct ppa_addr ppa; + + ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i); + + /* If the L2P entry maps to a line, the reference is valid */ + if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { + int line_id = pblk_dev_ppa_to_line(ppa); + struct pblk_line *line = &pblk->lines[line_id]; + + kref_get(&line->ref); + } + } spin_unlock(&pblk->trans_lock); } diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 4d719782f65b4..34527646c01be 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -271,15 +271,22 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->bb_wq) goto free_close_wq; - if (pblk_set_ppaf(pblk)) + pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!pblk->r_end_wq) goto free_bb_wq; + if (pblk_set_ppaf(pblk)) + goto free_r_end_wq; + if (pblk_rwb_init(pblk)) - goto free_bb_wq; + goto free_r_end_wq; INIT_LIST_HEAD(&pblk->compl_list); return 0; +free_r_end_wq: + destroy_workqueue(pblk->r_end_wq); free_bb_wq: destroy_workqueue(pblk->bb_wq); free_close_wq: @@ -304,6 +311,9 @@ static void pblk_core_free(struct pblk *pblk) if (pblk->close_wq) destroy_workqueue(pblk->close_wq); + if 
(pblk->r_end_wq) + destroy_workqueue(pblk->r_end_wq); + if (pblk->bb_wq) destroy_workqueue(pblk->bb_wq); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index a465d9980df4c..402f8eff6a2e8 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -130,9 +130,34 @@ static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd, } } -static void pblk_end_io_read(struct nvm_rq *rqd) +static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct ppa_addr *ppa_list; + int i; + + ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; + + for (i = 0; i < rqd->nr_ppas; i++) { + struct ppa_addr ppa = ppa_list[i]; + struct pblk_line *line; + + line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + kref_put(&line->ref, pblk_line_put_wq); + } +} + +static void pblk_end_user_read(struct bio *bio) +{ +#ifdef CONFIG_NVM_DEBUG + WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n"); +#endif + bio_endio(bio); + bio_put(bio); +} + +static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, + bool put_line) { - struct pblk *pblk = rqd->private; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct bio *bio = rqd->bio; @@ -146,15 +171,11 @@ static void pblk_end_io_read(struct nvm_rq *rqd) pblk_read_check(pblk, rqd, r_ctx->lba); bio_put(bio); - if (r_ctx->private) { - struct bio *orig_bio = r_ctx->private; + if (r_ctx->private) + pblk_end_user_read((struct bio *)r_ctx->private); -#ifdef CONFIG_NVM_DEBUG - WARN_ONCE(orig_bio->bi_status, "pblk: corrupted read bio\n"); -#endif - bio_endio(orig_bio); - bio_put(orig_bio); - } + if (put_line) + pblk_read_put_rqd_kref(pblk, rqd); #ifdef CONFIG_NVM_DEBUG atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); @@ -165,6 +186,13 @@ static void pblk_end_io_read(struct nvm_rq *rqd) atomic_dec(&pblk->inflight_io); } +static void pblk_end_io_read(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + + __pblk_end_io_read(pblk, rqd, true); +} + static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, unsigned int bio_init_idx, unsigned long *read_bitmap) @@ -233,8 +261,12 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, } if (unlikely(nr_secs > 1 && nr_holes == 1)) { + struct ppa_addr ppa; + + ppa = rqd->ppa_addr; rqd->ppa_list = ppa_ptr; rqd->dma_ppa_list = dma_ppa_list; + rqd->ppa_list[0] = ppa; } for (i = 0; i < nr_secs; i++) { @@ -246,6 +278,11 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, i = 0; hole = find_first_zero_bit(read_bitmap, nr_secs); do { + int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]); + struct pblk_line *line = &pblk->lines[line_id]; + + kref_put(&line->ref, pblk_line_put); + meta_list[hole].lba = lba_list_media[i]; src_bv = new_bio->bi_io_vec[i++]; @@ -269,19 +306,17 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, bio_put(new_bio); /* Complete the original bio and associated request */ + bio_endio(bio); rqd->bio = bio; rqd->nr_ppas = nr_secs; - rqd->private = pblk; - bio_endio(bio); - pblk_end_io_read(rqd); + __pblk_end_io_read(pblk, rqd, false); return NVM_IO_OK; err: /* Free allocated pages in new bio */ pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); - rqd->private = pblk; - pblk_end_io_read(rqd); + __pblk_end_io_read(pblk, rqd, false); return NVM_IO_ERR; } @@ -314,11 +349,11 @@ retry: goto retry; } + WARN_ON(test_and_set_bit(0, read_bitmap)); meta_list[0].lba = cpu_to_le64(lba); - WARN_ON(test_and_set_bit(0, read_bitmap)); #ifdef 
CONFIG_NVM_DEBUG - atomic_long_inc(&pblk->cache_reads); + atomic_long_inc(&pblk->cache_reads); #endif } else { rqd->ppa_addr = ppa; @@ -383,7 +418,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) if (bitmap_full(&read_bitmap, nr_secs)) { bio_endio(bio); atomic_inc(&pblk->inflight_io); - pblk_end_io_read(rqd); + __pblk_end_io_read(pblk, rqd, false); return NVM_IO_OK; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 4a51e6d4d0361..e4704373398bc 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -636,6 +636,7 @@ struct pblk { struct workqueue_struct *close_wq; struct workqueue_struct *bb_wq; + struct workqueue_struct *r_end_wq; struct timer_list wtimer; @@ -741,6 +742,7 @@ int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, void *emeta_buf); int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); void pblk_line_put(struct kref *ref); +void pblk_line_put_wq(struct kref *ref); struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -- cgit v1.2.3 From 0f9248cf1e22333b2a0458540aafb1ad3b2b3337 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:24 +0200 Subject: lightnvm: pblk: remove redundant check on read path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A partial read I/O in pblk is an I/O where some sectors reside in the write buffer in main memory and some are persisted on the device. Such an I/O must at least contain 2 lbas, therefore checking for the case where a single lba is mapped is not necessary. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 402f8eff6a2e8..71c58503f1a45 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -235,7 +235,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->end_io = pblk_end_io_sync; rqd->private = &wait; - if (unlikely(nr_secs > 1 && nr_holes == 1)) { + if (unlikely(nr_holes == 1)) { ppa_ptr = rqd->ppa_list; dma_ppa_list = rqd->dma_ppa_list; rqd->ppa_addr = rqd->ppa_list[0]; @@ -260,7 +260,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, #endif } - if (unlikely(nr_secs > 1 && nr_holes == 1)) { + if (unlikely(nr_holes == 1)) { struct ppa_addr ppa; ppa = rqd->ppa_addr; -- cgit v1.2.3 From 1e82123da6a4c6019ef03bcd47e4b3dc18dd136e Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:25 +0200 Subject: lightnvm: pblk: remove I/O dependency on write path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk schedules user I/O, metadata I/O and erases on the write path in order to minimize collisions at the media level. Until now, there has been a dependency between user and metadata I/Os that could lead to a deadlock as both take the per-LUN semaphore to schedule submission. This path removes this dependency and guarantees forward progress at a per I/O granurality. 
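The resulting submission order in pblk_submit_io_set() is, roughly (condensed from the hunks below; error handling elided):

	err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);		/* map lbas to ppas */
	meta_line = pblk_should_submit_meta_io(pblk, rqd);	/* may be NULL */

	pblk_submit_io(pblk, rqd);				/* data I/O first */

	if (!ppa_empty(erase_ppa))
		pblk_blk_erase_async(pblk, erase_ppa);		/* erase for the next line */

	if (meta_line)
		pblk_submit_meta_io(pblk, meta_line);		/* metadata last */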
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-write.c | 145 +++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 80 deletions(-) diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index f2e846fe9242a..6c1cafafef532 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -220,15 +220,16 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, } static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx, struct ppa_addr *erase_ppa) + struct ppa_addr *erase_ppa) { struct pblk_line_meta *lm = &pblk->lm; struct pblk_line *e_line = pblk_line_get_erase(pblk); + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); unsigned int valid = c_ctx->nr_valid; unsigned int padded = c_ctx->nr_padded; unsigned int nr_secs = valid + padded; unsigned long *lun_bitmap; - int ret = 0; + int ret; lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); if (!lun_bitmap) @@ -294,55 +295,6 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, return secs_to_sync; } -static inline int pblk_valid_meta_ppa(struct pblk *pblk, - struct pblk_line *meta_line, - struct ppa_addr *ppa_list, int nr_ppas) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line *data_line; - struct ppa_addr ppa, ppa_opt; - u64 paddr; - int i; - - data_line = &pblk->lines[pblk_dev_ppa_to_line(ppa_list[0])]; - paddr = pblk_lookup_page(pblk, meta_line); - ppa = addr_to_gen_ppa(pblk, paddr, 0); - - if (test_bit(pblk_ppa_to_pos(geo, ppa), data_line->blk_bitmap)) - return 1; - - /* Schedule a metadata I/O that is half the distance from the data I/O - * with regards to the number of LUNs forming the pblk instance. This - * balances LUN conflicts across every I/O. - * - * When the LUN configuration changes (e.g., due to GC), this distance - * can align, which would result on a LUN deadlock. In this case, modify - * the distance to not be optimal, but allow metadata I/Os to succeed. - */ - ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); - if (unlikely(ppa_opt.ppa == ppa.ppa)) { - data_line->meta_distance--; - return 0; - } - - for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) - if (ppa_list[i].g.ch == ppa_opt.g.ch && - ppa_list[i].g.lun == ppa_opt.g.lun) - return 1; - - if (test_bit(pblk_ppa_to_pos(geo, ppa_opt), data_line->blk_bitmap)) { - for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) - if (ppa_list[i].g.ch == ppa.g.ch && - ppa_list[i].g.lun == ppa.g.lun) - return 0; - - return 1; - } - - return 0; -} - int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) { struct nvm_tgt_dev *dev = pblk->dev; @@ -421,8 +373,44 @@ fail_free_rqd: return ret; } -static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, - int prev_n) +static inline bool pblk_valid_meta_ppa(struct pblk *pblk, + struct pblk_line *meta_line, + struct nvm_rq *data_rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd); + struct pblk_line *data_line = pblk_line_get_data(pblk); + struct ppa_addr ppa, ppa_opt; + u64 paddr; + int pos_opt; + + /* Schedule a metadata I/O that is half the distance from the data I/O + * with regards to the number of LUNs forming the pblk instance. This + * balances LUN conflicts across every I/O. 
+ * + * When the LUN configuration changes (e.g., due to GC), this distance + * can align, which would result on metadata and data I/Os colliding. In + * this case, modify the distance to not be optimal, but move the + * optimal in the right direction. + */ + paddr = pblk_lookup_page(pblk, meta_line); + ppa = addr_to_gen_ppa(pblk, paddr, 0); + ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); + pos_opt = pblk_ppa_to_pos(geo, ppa_opt); + + if (test_bit(pos_opt, data_c_ctx->lun_bitmap) || + test_bit(pos_opt, data_line->blk_bitmap)) + return true; + + if (unlikely(pblk_ppa_comp(ppa_opt, ppa))) + data_line->meta_distance--; + + return false; +} + +static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk, + struct nvm_rq *data_rqd) { struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -432,57 +420,45 @@ static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, retry: if (list_empty(&l_mg->emeta_list)) { spin_unlock(&l_mg->close_lock); - return 0; + return NULL; } meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); if (meta_line->emeta->mem >= lm->emeta_len[0]) goto retry; spin_unlock(&l_mg->close_lock); - if (!pblk_valid_meta_ppa(pblk, meta_line, prev_list, prev_n)) - return 0; + if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd)) + return NULL; - return pblk_submit_meta_io(pblk, meta_line); + return meta_line; } static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) { - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); struct ppa_addr erase_ppa; + struct pblk_line *meta_line; int err; ppa_set_empty(&erase_ppa); /* Assign lbas to ppas and populate request structure */ - err = pblk_setup_w_rq(pblk, rqd, c_ctx, &erase_ppa); + err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); if (err) { pr_err("pblk: could not setup write request: %d\n", err); return NVM_IO_ERR; } - if (likely(ppa_empty(erase_ppa))) { - /* Submit metadata write for previous data line */ - err = pblk_sched_meta_io(pblk, rqd->ppa_list, rqd->nr_ppas); - if (err) { - pr_err("pblk: metadata I/O submission failed: %d", err); - return NVM_IO_ERR; - } + meta_line = pblk_should_submit_meta_io(pblk, rqd); - /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd); - if (err) { - pr_err("pblk: data I/O submission failed: %d\n", err); - return NVM_IO_ERR; - } - } else { - /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd); - if (err) { - pr_err("pblk: data I/O submission failed: %d\n", err); - return NVM_IO_ERR; - } + /* Submit data write for current data line */ + err = pblk_submit_io(pblk, rqd); + if (err) { + pr_err("pblk: data I/O submission failed: %d\n", err); + return NVM_IO_ERR; + } - /* Submit available erase for next data line */ + if (!ppa_empty(erase_ppa)) { + /* Submit erase for next data line */ if (pblk_blk_erase_async(pblk, erase_ppa)) { struct pblk_line *e_line = pblk_line_get_erase(pblk); struct nvm_tgt_dev *dev = pblk->dev; @@ -495,6 +471,15 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) } } + if (meta_line) { + /* Submit metadata write for previous data line */ + err = pblk_submit_meta_io(pblk, meta_line); + if (err) { + pr_err("pblk: metadata I/O submission failed: %d", err); + return NVM_IO_ERR; + } + } + return NVM_IO_OK; } -- cgit v1.2.3 From 21d2287119e843929c29fb1adbd271bde1fac7ae Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:26 +0200 Subject: lightnvm: pblk: enable 1 LUN configuration MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Metadata I/Os are scheduled to minimize their impact on user data I/Os. When there are enough LUNs instantiated (i.e., enough bandwidth), it is easy to interleave metadata and data one after the other so that metadata I/Os are the ones being blocked and not vice-versa. We do this by calculating the distance between the I/Os in terms of the LUNs that are not in used, and selecting a free LUN that satisfies a the simple heuristic that metadata is scheduled behind. The per-LUN semaphores guarantee consistency. This works fine on >1 LUN configuration. However, when a single LUN is instantiated, this design leads to a deadlock, where metadata waits to be scheduled on a free LUN. This patch implements the 1 LUN case by simply scheduling the metadada I/O after the data I/O. In the process, we refactor the way a line is replaced to ensure that metadata writes are submitted after data writes in order to guarantee block sequentiality. Note that, since there is only one LUN, both I/Os will block each other by design. However, such configuration only pursues tight read latencies, not write bandwidth. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 17 ++++++++++------- drivers/lightnvm/pblk-init.c | 8 ++++++-- drivers/lightnvm/pblk-map.c | 21 ++++++++++++--------- drivers/lightnvm/pblk.h | 2 +- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0a41fb998d556..e38e91897246d 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1372,17 +1372,17 @@ void pblk_pipeline_stop(struct pblk *pblk) spin_unlock(&l_mg->free_lock); } -void pblk_line_replace_data(struct pblk *pblk) +struct pblk_line *pblk_line_replace_data(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *cur, *new; + struct pblk_line *cur, *new = NULL; unsigned int left_seblks; int is_next = 0; cur = l_mg->data_line; new = l_mg->data_next; if (!new) - return; + goto out; l_mg->data_line = new; spin_lock(&l_mg->free_lock); @@ -1390,7 +1390,7 @@ void pblk_line_replace_data(struct pblk *pblk) l_mg->data_line = NULL; l_mg->data_next = NULL; spin_unlock(&l_mg->free_lock); - return; + goto out; } pblk_line_setup_metadata(new, l_mg, &pblk->lm); @@ -1402,7 +1402,7 @@ retry_erase: /* If line is not fully erased, erase it */ if (atomic_read(&new->left_eblks)) { if (pblk_line_erase(pblk, new)) - return; + goto out; } else { io_schedule(); } @@ -1413,7 +1413,7 @@ retry_setup: if (!pblk_line_init_metadata(pblk, new, cur)) { new = pblk_line_retry(pblk, new); if (!new) - return; + goto out; goto retry_setup; } @@ -1421,7 +1421,7 @@ retry_setup: if (!pblk_line_init_bb(pblk, new, 1)) { new = pblk_line_retry(pblk, new); if (!new) - return; + goto out; goto retry_setup; } @@ -1445,6 +1445,9 @@ retry_setup: if (is_next) pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); + +out: + return new; } void pblk_line_free(struct pblk *pblk, struct pblk_line *line) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 34527646c01be..3c37491860537 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -711,8 +711,12 @@ add_emeta_page: } lm->emeta_bb = geo->nr_luns - i; - lm->min_blk_line = 1 + DIV_ROUND_UP(lm->smeta_sec + lm->emeta_sec[0], - geo->sec_per_blk); + + lm->min_blk_line = 1; + if (geo->nr_luns > 1) + lm->min_blk_line += 
DIV_ROUND_UP(lm->smeta_sec + + lm->emeta_sec[0], geo->sec_per_blk); + if (lm->min_blk_line > lm->blk_per_line) { pr_err("pblk: config. not supported. Min. LUN in line:%d\n", lm->blk_per_line); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index fddb924f6dde7..3bc4c94f9cf2d 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -25,13 +25,23 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, unsigned int valid_secs) { struct pblk_line *line = pblk_line_get_data(pblk); - struct pblk_emeta *emeta = line->emeta; + struct pblk_emeta *emeta; struct pblk_w_ctx *w_ctx; - __le64 *lba_list = emeta_to_lbas(pblk, emeta->buf); + __le64 *lba_list; u64 paddr; int nr_secs = pblk->min_write_pgs; int i; + if (pblk_line_is_full(line)) { + struct pblk_line *prev_line = line; + + line = pblk_line_replace_data(pblk); + pblk_line_close_meta(pblk, prev_line); + } + + emeta = line->emeta; + lba_list = emeta_to_lbas(pblk, emeta->buf); + paddr = pblk_alloc_page(pblk, line, nr_secs); for (i = 0; i < nr_secs; i++, paddr++) { @@ -60,13 +70,6 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, } } - if (pblk_line_is_full(line)) { - struct pblk_line *prev_line = line; - - pblk_line_replace_data(pblk); - pblk_line_close_meta(pblk, prev_line); - } - pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index e4704373398bc..191b1ec0627b0 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -719,7 +719,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, int alloc_type, gfp_t gfp_mask); struct pblk_line *pblk_line_get(struct pblk *pblk); struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); -void pblk_line_replace_data(struct pblk *pblk); +struct pblk_line *pblk_line_replace_data(struct pblk *pblk); int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); struct pblk_line *pblk_line_get_data(struct pblk *pblk); -- cgit v1.2.3 From e6b754c252bacebdfbe3c57e790431ab8f445d1f Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:27 +0200 Subject: lightnvm: pblk: ensure right bad block calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that the variable controlling block threshold for allocating extra metadata sectors in case of a line with bad blocks does not get a negative value. Otherwise, the line will be marked as corrupted and wasted. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 3c37491860537..c7239c41ba40f 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -710,7 +710,7 @@ add_emeta_page: goto add_emeta_page; } - lm->emeta_bb = geo->nr_luns - i; + lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0; lm->min_blk_line = 1; if (geo->nr_luns > 1) -- cgit v1.2.3 From 27b978725d895e704aab44b99242a0514485d798 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:28 +0200 Subject: lightnvm: pblk: fix changing GC group list for a line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk_line_gc_list seems to had a bug since the introduction of pblk in getting GC list for a line. 
In b20ba1bc7 while redesigning the GC algorithm, the naming for the GC thresholds was altered, but the values for high_thrs and mid_thrs were not. The result is that when moving to the GC lists, the mid threshold is never evaluated. Fixes: a4bd217b4("lightnvm: physical block device (pblk) target") Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index c7239c41ba40f..56ece7dfac0e6 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -678,8 +678,8 @@ static int pblk_lines_init(struct pblk *pblk) lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); - lm->high_thrs = lm->sec_per_line / 2; - lm->mid_thrs = lm->sec_per_line / 4; + lm->mid_thrs = lm->sec_per_line / 2; + lm->high_thrs = lm->sec_per_line / 4; lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; /* Calculate necessary pages for smeta. See comment over struct -- cgit v1.2.3 From e480689bd1cc35f6ed3fa628bc8d913177b0726a Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:29 +0200 Subject: lightnvm: pblk: remove useless line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index e38e91897246d..43866ad87586a 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1071,7 +1071,6 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, /* Mark emeta metadata sectors as bad sectors. We need to consider bad * blocks to make sure that there are enough sectors to store emeta */ - bit = lm->sec_per_line; off = lm->sec_per_line - lm->emeta_sec[0]; bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]); while (nr_bb) { -- cgit v1.2.3 From ef56b9ce562753cacf518f081a4ff3227efdab25 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:30 +0200 Subject: lightnvm: remove unused argument from nvm_set_tgt_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vblk isn't being used anyway and if we ever have a usecase we can introduce this again. This makes the logic easier and removes unnecessary checks. 
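With the vblk special-casing gone, the plane expansion in nvm_set_rqd_ppalist() always lays the ppa list out grouped by plane; for example, with two ppas and a plane_mode of 2 (condensed from the hunks below):

	/* nr_ppas = 2, plane_cnt = geo->plane_mode = 2  =>  rqd->nr_ppas = 4 */
	rqd->ppa_list[0] = ppas[0];	/* plane 0 */
	rqd->ppa_list[1] = ppas[1];	/* plane 0 */
	rqd->ppa_list[2] = ppas[0];	/* plane 1 */
	rqd->ppa_list[3] = ppas[1];	/* plane 1 */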
Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 29 ++++++++++++----------------- include/linux/lightnvm.h | 2 +- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 798964f511cd0..231c928994315 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -633,7 +633,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, memset(&rqd, 0, sizeof(struct nvm_rq)); - nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); + nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); nvm_rq_tgt_to_dev(tgt_dev, &rqd); ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); @@ -697,7 +697,7 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, rqd.private = &wait; rqd.flags = geo->plane_mode >> 1; - ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); + ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); if (ret) return ret; @@ -793,14 +793,14 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) EXPORT_SYMBOL(nvm_put_area); int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas, int vblk) + const struct ppa_addr *ppas, int nr_ppas) { struct nvm_dev *dev = tgt_dev->parent; struct nvm_geo *geo = &tgt_dev->geo; int i, plane_cnt, pl_idx; struct ppa_addr ppa; - if ((!vblk || geo->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) { + if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { rqd->nr_ppas = nr_ppas; rqd->ppa_addr = ppas[0]; @@ -814,19 +814,14 @@ int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, return -ENOMEM; } - if (!vblk) { - for (i = 0; i < nr_ppas; i++) - rqd->ppa_list[i] = ppas[i]; - } else { - plane_cnt = geo->plane_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } + plane_cnt = geo->plane_mode; + rqd->nr_ppas *= plane_cnt; + + for (i = 0; i < nr_ppas; i++) { + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; } } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7b80ac911d26e..32ec35b5e18be 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -481,7 +481,7 @@ extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, - const struct ppa_addr *, int, int); + const struct ppa_addr *, int); extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From eb6f168f97438bf1cac8b9b1301c662eace9e39f Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:31 +0200 Subject: lightnvm: remove stale extern and unused exported symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all exported symbols are being used outside core and there were some stale entries in lightnvm.h Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 129 +++++++++++++++++++++++------------------------ 
include/linux/lightnvm.h | 7 --- 2 files changed, 64 insertions(+), 72 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 231c928994315..0e5f77234c79a 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -226,6 +226,24 @@ static const struct block_device_operations nvm_fops = { .owner = THIS_MODULE, }; +static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) +{ + struct nvm_tgt_type *tmp, *tt = NULL; + + if (lock) + down_write(&nvm_tgtt_lock); + + list_for_each_entry(tmp, &nvm_tgt_types, list) + if (!strcmp(name, tmp->name)) { + tt = tmp; + break; + } + + if (lock) + up_write(&nvm_tgtt_lock); + return tt; +} + static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) { struct nvm_ioctl_create_simple *s = &create->conf.s; @@ -549,25 +567,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, } EXPORT_SYMBOL(nvm_part_to_tgt); -struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) -{ - struct nvm_tgt_type *tmp, *tt = NULL; - - if (lock) - down_write(&nvm_tgtt_lock); - - list_for_each_entry(tmp, &nvm_tgt_types, list) - if (!strcmp(name, tmp->name)) { - tt = tmp; - break; - } - - if (lock) - up_write(&nvm_tgtt_lock); - return tt; -} -EXPORT_SYMBOL(nvm_find_target_type); - int nvm_register_tgt_type(struct nvm_tgt_type *tt) { int ret = 0; @@ -619,6 +618,52 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } +static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, + const struct ppa_addr *ppas, int nr_ppas) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_geo *geo = &tgt_dev->geo; + int i, plane_cnt, pl_idx; + struct ppa_addr ppa; + + if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { + rqd->nr_ppas = nr_ppas; + rqd->ppa_addr = ppas[0]; + + return 0; + } + + rqd->nr_ppas = nr_ppas; + rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); + if (!rqd->ppa_list) { + pr_err("nvm: failed to allocate dma memory\n"); + return -ENOMEM; + } + + plane_cnt = geo->plane_mode; + rqd->nr_ppas *= plane_cnt; + + for (i = 0; i < nr_ppas; i++) { + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; + } + } + + return 0; +} + +static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, + struct nvm_rq *rqd) +{ + if (!rqd->ppa_list) + return; + + nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); +} + + int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas, int type) { @@ -792,52 +837,6 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) } EXPORT_SYMBOL(nvm_put_area); -int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_geo *geo = &tgt_dev->geo; - int i, plane_cnt, pl_idx; - struct ppa_addr ppa; - - if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { - rqd->nr_ppas = nr_ppas; - rqd->ppa_addr = ppas[0]; - - return 0; - } - - rqd->nr_ppas = nr_ppas; - rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("nvm: failed to allocate dma memory\n"); - return -ENOMEM; - } - - plane_cnt = geo->plane_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } - 
} - - return 0; -} -EXPORT_SYMBOL(nvm_set_rqd_ppalist); - -void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - if (!rqd->ppa_list) - return; - - nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); -} -EXPORT_SYMBOL(nvm_free_rqd_ppalist); - void nvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_dev *tgt_dev = rqd->dev; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 32ec35b5e18be..4f0e4a0fd2047 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -463,8 +463,6 @@ struct nvm_tgt_type { struct module *owner; }; -extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); - extern int nvm_register_tgt_type(struct nvm_tgt_type *); extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); @@ -480,9 +478,6 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); -extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, - const struct ppa_addr *, int); -extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); @@ -491,8 +486,6 @@ extern void nvm_end_io(struct nvm_rq *); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -extern int nvm_dev_factory(struct nvm_dev *, int flags); - extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); #else /* CONFIG_NVM */ -- cgit v1.2.3 From 05ed3447698203219319ec9d1c46303aff5932a2 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:32 +0200 Subject: lightnvm: pblk: reduce arguments in __pblk_rb_update_l2p MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already pass the structure pointer so no need to pass the member. 
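For illustration, a minimal sketch of the same refactor pattern outside of pblk (the struct and field names below are invented): once a helper already receives the containing structure, a separate pointer to one of its members is redundant and can be dropped.

struct ring {
	unsigned int head;
	unsigned int nr_entries;	/* power of two */
};

/* before: the caller passes the ring and a pointer into it */
static void advance_old(struct ring *rb, unsigned int *head, unsigned int n)
{
	*head = (*head + n) & (rb->nr_entries - 1);
}

/* after: the member is reached through the structure itself */
static void advance_new(struct ring *rb, unsigned int n)
{
	rb->head = (rb->head + n) & (rb->nr_entries - 1);
}

The patch applies exactly this to rb->l2p_update.
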
Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rb.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 1173e2380137d..b8f78e4014823 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -201,8 +201,7 @@ unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) return subm; } -static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, - unsigned int to_update) +static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) { struct pblk *pblk = container_of(rb, struct pblk, rwb); struct pblk_line *line; @@ -213,7 +212,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, int flags; for (i = 0; i < to_update; i++) { - entry = &rb->entries[*l2p_upd]; + entry = &rb->entries[rb->l2p_update]; w_ctx = &entry->w_ctx; flags = READ_ONCE(entry->w_ctx.flags); @@ -230,7 +229,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; kref_put(&line->ref, pblk_line_put); clean_wctx(w_ctx); - *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1); + rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); } pblk_rl_out(&pblk->rl, user_io, gc_io); @@ -258,7 +257,7 @@ static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries, count = nr_entries - space; /* l2p_update used exclusively under rb->w_lock */ - ret = __pblk_rb_update_l2p(rb, &rb->l2p_update, count); + ret = __pblk_rb_update_l2p(rb, count); out: return ret; @@ -280,7 +279,7 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb) sync = smp_load_acquire(&rb->sync); to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); - __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update); + __pblk_rb_update_l2p(rb, to_update); spin_unlock(&rb->w_lock); } -- cgit v1.2.3 From 22a4e061ea11cc754785a12b3ba5f3e135bc0c63 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:33 +0200 Subject: lightnvm: pblk: fix releases of kmem cache in error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If pblk_core_init fails lets destroy all global caches. 
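The destroy sequence is also factored into one helper so the error path and the normal teardown cannot drift apart. A condensed sketch of the same goto-unwind convention, with invented names (example_* is not pblk code):

static struct kmem_cache *cache_a, *cache_b;

static void example_free_caches(void)
{
	kmem_cache_destroy(cache_a);
	kmem_cache_destroy(cache_b);
}

static int example_init(void)
{
	cache_a = kmem_cache_create("example_a", 128, 0, 0, NULL);
	if (!cache_a)
		return -ENOMEM;

	cache_b = kmem_cache_create("example_b", 256, 0, 0, NULL);
	if (!cache_b)
		goto free_cache_a;

	return 0;

free_cache_a:
	kmem_cache_destroy(cache_a);
	return -ENOMEM;
}
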
Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 56ece7dfac0e6..2e599738372dd 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -220,6 +220,14 @@ static int pblk_init_global_caches(struct pblk *pblk) return 0; } +static void pblk_free_global_caches(struct pblk *pblk) +{ + kmem_cache_destroy(pblk_ws_cache); + kmem_cache_destroy(pblk_rec_cache); + kmem_cache_destroy(pblk_g_rq_cache); + kmem_cache_destroy(pblk_w_rq_cache); +} + static int pblk_core_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; @@ -235,7 +243,7 @@ static int pblk_core_init(struct pblk *pblk) pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev), 0); if (!pblk->page_bio_pool) - return -ENOMEM; + goto free_global_caches; pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE, pblk_ws_cache); @@ -303,6 +311,8 @@ free_gen_ws_pool: mempool_destroy(pblk->gen_ws_pool); free_page_bio_pool: mempool_destroy(pblk->page_bio_pool); +free_global_caches: + pblk_free_global_caches(pblk); return -ENOMEM; } @@ -324,10 +334,7 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->e_rq_pool); mempool_destroy(pblk->w_rq_pool); - kmem_cache_destroy(pblk_ws_cache); - kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_g_rq_cache); - kmem_cache_destroy(pblk_w_rq_cache); + pblk_free_global_caches(pblk); } static void pblk_luns_free(struct pblk *pblk) -- cgit v1.2.3 From 3e3a5b8ebd5d3b1d68facc58b0674a2564653222 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:34 +0200 Subject: lightnvm: pblk: prevent gc kicks when gc is not operational MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GC can be kicked after it has been shut down when closing the last line during exit, resulting in accesses to freed structures. Make sure that GC is not triggered while it is not operational. Also make sure that GC won't be re-activated during exit when running on another processor by using timer_del_sync. 
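A rough sketch of that ordering with invented names (the call used in the hunk below is del_timer_sync()): the kick path re-arms the timer only while GC is enabled, and the exit path clears the flag before the synchronous deletion, so a handler still running on another CPU cannot re-arm it afterwards.

struct example_gc {
	struct timer_list timer;
	int enabled;
};

static void example_gc_kick(struct example_gc *gc)
{
	/* don't re-arm the timer once shutdown has begun */
	if (gc->enabled)
		mod_timer(&gc->timer, jiffies + msecs_to_jiffies(1000));
}

static void example_gc_exit(struct example_gc *gc)
{
	gc->enabled = 0;
	del_timer_sync(&gc->timer);	/* waits out a running handler */
}
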
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 9 +++++---- drivers/lightnvm/pblk-init.c | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 7b103bce58bf7..81efac18ff571 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -478,10 +478,10 @@ void pblk_gc_should_start(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - if (gc->gc_enabled && !gc->gc_active) + if (gc->gc_enabled && !gc->gc_active) { pblk_gc_start(pblk); - - pblk_gc_kick(pblk); + pblk_gc_kick(pblk); + } } /* @@ -620,7 +620,8 @@ void pblk_gc_exit(struct pblk *pblk) flush_workqueue(gc->gc_reader_wq); flush_workqueue(gc->gc_line_reader_wq); - del_timer(&gc->gc_timer); + gc->gc_enabled = 0; + del_timer_sync(&gc->gc_timer); pblk_gc_stop(pblk, 1); if (gc->gc_ts) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 2e599738372dd..27eb430958ff6 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -931,6 +931,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, pblk->dev = dev; pblk->disk = tdisk; pblk->state = PBLK_STATE_RUNNING; + pblk->gc.gc_enabled = 0; spin_lock_init(&pblk->trans_lock); spin_lock_init(&pblk->lock); -- cgit v1.2.3 From 92957091e93931c91fccb7cce456312edeeea36c Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:35 +0200 Subject: lightnvm: pblk: recover partially written lines correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When recovering partially written lines, the valid sector count must be decreased by the number of padded sectors in the line. Update line recovery to take all ADDR_EMPTY(padded) sectors into account. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-recovery.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 686bc17f080ff..a080cf8889821 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -133,8 +133,8 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) struct pblk_emeta *emeta = line->emeta; struct line_emeta *emeta_buf = emeta->buf; __le64 *lba_list; - int data_start; - int nr_data_lbas, nr_valid_lbas, nr_lbas = 0; + int data_start, data_end; + int nr_valid_lbas, nr_lbas = 0; int i; lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); @@ -142,10 +142,10 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) return 1; data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - nr_data_lbas = lm->sec_per_line - lm->emeta_sec[0]; + data_end = lm->sec_per_line - lm->emeta_sec[0]; nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); - for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) { + for (i = data_start; i < data_end; i++) { struct ppa_addr ppa; int pos; -- cgit v1.2.3 From 37ce33d5756f4ba8bdd45371a1918ceeeba5b158 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:36 +0200 Subject: lightnvm: pblk: free full lines during recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When rebuilding the L2P table, any full lines (lines without any valid sectors) will be identified. 
If these lines are not freed, we risk not being able to allocate the first data line. This patch refactors the part of GC that frees empty lines into a separate function and adds a call to this after the L2P table has been rebuilt. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 32 ++++++++++++++++++++------------ drivers/lightnvm/pblk-init.c | 3 +++ drivers/lightnvm/pblk.h | 1 + 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 81efac18ff571..374089fe4326b 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -330,26 +330,16 @@ static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free)); } -/* - * Lines with no valid sectors will be returned to the free list immediately. If - * GC is activated - either because the free block count is under the determined - * threshold, or because it is being forced from user space - only lines with a - * high count of invalid sectors will be recycled. - */ -static void pblk_gc_run(struct pblk *pblk) +void pblk_gc_free_full_lines(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; struct pblk_line *line; - struct list_head *group_list; - bool run_gc; - int inflight_gc, gc_group = 0, prev_group = 0; do { spin_lock(&l_mg->gc_lock); if (list_empty(&l_mg->gc_full_list)) { spin_unlock(&l_mg->gc_lock); - break; + return; } line = list_first_entry(&l_mg->gc_full_list, @@ -365,6 +355,24 @@ static void pblk_gc_run(struct pblk *pblk) kref_put(&line->ref, pblk_line_put); } while (1); +} + +/* + * Lines with no valid sectors will be returned to the free list immediately. If + * GC is activated - either because the free block count is under the determined + * threshold, or because it is being forced from user space - only lines with a + * high count of invalid sectors will be recycled. 
+ */ +static void pblk_gc_run(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line; + struct list_head *group_list; + bool run_gc; + int inflight_gc, gc_group = 0, prev_group = 0; + + pblk_gc_free_full_lines(pblk); run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD)) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 27eb430958ff6..f08fa2083fbcc 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -508,6 +508,9 @@ static int pblk_lines_configure(struct pblk *pblk, int flags) } } + /* Free full lines directly as GC has not been started yet */ + pblk_gc_free_full_lines(pblk); + if (!line) { /* Configure next line for user data */ line = pblk_line_get_first_data(pblk); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 191b1ec0627b0..21438d1550a24 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -831,6 +831,7 @@ void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); void pblk_gc_kick(struct pblk *pblk); +void pblk_gc_free_full_lines(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); int pblk_gc_sysfs_force(struct pblk *pblk, int force); -- cgit v1.2.3 From 03661b5f756c92b9924869334a2afa19753c4fe7 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:37 +0200 Subject: lightnvm: pblk: start gc if needed during init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start GC if needed, directly after init, as we might need to garbage collect in order to make room for user writes. Create a helper function that allows to kick GC without exposing the internals of the GC/rate-limiter interaction. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 5 +++++ drivers/lightnvm/pblk-init.c | 4 ++++ drivers/lightnvm/pblk-rl.c | 2 +- drivers/lightnvm/pblk.h | 2 ++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 374089fe4326b..4bac9e1531f5d 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -510,6 +510,11 @@ void pblk_gc_should_stop(struct pblk *pblk) pblk_gc_stop(pblk, 0); } +void pblk_gc_should_kick(struct pblk *pblk) +{ + pblk_rl_update_rates(&pblk->rl); +} + void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active) { diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index f08fa2083fbcc..ad9f014a086b3 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -1025,6 +1025,10 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, pblk->rwb.nr_entries); wake_up_process(pblk->writer_ts); + + /* Check if we need to start GC */ + pblk_gc_should_kick(pblk); + return pblk; fail_stop_writer: diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 0896439a91b0f..739f855d42165 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -96,7 +96,7 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) * * Only the total number of free blocks is used to configure the rate limiter. 
*/ -static void pblk_rl_update_rates(struct pblk_rl *rl) +void pblk_rl_update_rates(struct pblk_rl *rl) { struct pblk *pblk = container_of(rl, struct pblk, rl); unsigned long free_blocks = pblk_rl_nr_free_blks(rl); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 21438d1550a24..29ba7ec32b204 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -830,6 +830,7 @@ int pblk_gc_init(struct pblk *pblk); void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); +void pblk_gc_should_kick(struct pblk *pblk); void pblk_gc_kick(struct pblk *pblk); void pblk_gc_free_full_lines(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, @@ -841,6 +842,7 @@ int pblk_gc_sysfs_force(struct pblk *pblk, int force); */ void pblk_rl_init(struct pblk_rl *rl, int budget); void pblk_rl_free(struct pblk_rl *rl); +void pblk_rl_update_rates(struct pblk_rl *rl); int pblk_rl_high_thrs(struct pblk_rl *rl); int pblk_rl_low_thrs(struct pblk_rl *rl); unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); -- cgit v1.2.3 From 75610cd974aba4fadc9a8500d5470e8f28a3626f Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:38 +0200 Subject: lightnvm: pblk: consider bad sectors in emeta during recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When recovering lines we need to consider that bad blocks in a line affect the emeta area size. Previously it was assumed that the emeta area would grow by the number of sectors per page * number of bad blocks in the line. This assumption is not correct - the number of "extra" pages that are consumed could be both smaller (depending on emeta size) and bigger (depending on the placement of the bad blocks). Fix this by calculating the emeta start by iterating backwards through the line, skipping ppas that map to bad blocks. Also fix the data types used for ppa indices/counts in pblk_recov_l2p_from_emeta - we should use u64. 
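A simplified, standalone illustration of the backward scan (the real function resolves each sector to its block through the device's ppa format; sec_per_blk below is just a stand-in for that mapping): reserve the emeta sectors from the end of the line, skipping any sector that falls in a block marked bad.

static unsigned long find_emeta_start(const unsigned long *bad_blk_bitmap,
				      unsigned long sec_per_line,
				      unsigned long sec_per_blk,
				      unsigned long emeta_secs)
{
	unsigned long start = sec_per_line;

	while (emeta_secs) {
		start--;
		if (!test_bit(start / sec_per_blk, bad_blk_bitmap))
			emeta_secs--;
	}

	return start;
}
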
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-recovery.c | 44 +++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index a080cf8889821..9772a947ca4fb 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -133,16 +133,16 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) struct pblk_emeta *emeta = line->emeta; struct line_emeta *emeta_buf = emeta->buf; __le64 *lba_list; - int data_start, data_end; - int nr_valid_lbas, nr_lbas = 0; - int i; + u64 data_start, data_end; + u64 nr_valid_lbas, nr_lbas = 0; + u64 i; lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); if (!lba_list) return 1; data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - data_end = lm->sec_per_line - lm->emeta_sec[0]; + data_end = line->emeta_ssec; nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); for (i = data_start; i < data_end; i++) { @@ -172,8 +172,8 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) } if (nr_valid_lbas != nr_lbas) - pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n", - line->id, emeta_buf->nr_valid_lbas, nr_lbas); + pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n", + line->id, nr_valid_lbas, nr_lbas); line->left_msecs = 0; @@ -827,10 +827,32 @@ static void pblk_recov_line_add_ordered(struct list_head *head, __list_add(&line->list, t->list.prev, &t->list); } -struct pblk_line *pblk_recov_l2p(struct pblk *pblk) +static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + unsigned int emeta_secs; + u64 emeta_start; + struct ppa_addr ppa; + int pos; + + emeta_secs = lm->emeta_sec[0]; + emeta_start = lm->sec_per_line; + + while (emeta_secs) { + emeta_start--; + ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + if (!test_bit(pos, line->blk_bitmap)) + emeta_secs--; + } + + return emeta_start; +} + +struct pblk_line *pblk_recov_l2p(struct pblk *pblk) +{ struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line, *tline, *data_line = NULL; @@ -930,15 +952,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) /* Verify closed blocks and recover this portion of L2P table*/ list_for_each_entry_safe(line, tline, &recov_list, list) { - int off, nr_bb; - recovered_lines++; - /* Calculate where emeta starts based on the line bb */ - off = lm->sec_per_line - lm->emeta_sec[0]; - nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); - off -= nr_bb * geo->sec_per_pl; - line->emeta_ssec = off; + line->emeta_ssec = pblk_line_emeta_start(pblk, line); line->emeta = emeta; memset(line->emeta->buf, 0, lm->emeta_len[0]); -- cgit v1.2.3 From 1edebacf8b736774d2f160512aec721f47e1f5ac Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:39 +0200 Subject: lightnvm: pblk: shut down gc gracefully during exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shut down the GC workqueues and tasks in the right order. 
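The rule the reordering follows, sketched with placeholder names: stop the kthreads that produce work before flushing and destroying the workqueues that consume it, so nothing can be queued to a workqueue that is about to disappear.

struct example_gc {
	struct task_struct *reader_ts;		/* queues work items */
	struct workqueue_struct *reader_wq;	/* consumes them */
};

static void example_gc_exit(struct example_gc *gc)
{
	if (gc->reader_ts)
		kthread_stop(gc->reader_ts);	/* no new work after this */

	if (gc->reader_wq) {
		flush_workqueue(gc->reader_wq);	/* drain what is queued */
		destroy_workqueue(gc->reader_wq);
	}
}
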
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 4bac9e1531f5d..e00e5a0743e96 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -422,10 +422,15 @@ void pblk_gc_kick(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - wake_up_process(gc->gc_ts); pblk_gc_writer_kick(gc); pblk_gc_reader_kick(gc); - mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + + /* If we're shutting down GC, let's not start it up again */ + if (gc->gc_enabled) { + wake_up_process(gc->gc_ts); + mod_timer(&gc->gc_timer, + jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + } } static void pblk_gc_timer(unsigned long data) @@ -630,9 +635,6 @@ void pblk_gc_exit(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - flush_workqueue(gc->gc_reader_wq); - flush_workqueue(gc->gc_line_reader_wq); - gc->gc_enabled = 0; del_timer_sync(&gc->gc_timer); pblk_gc_stop(pblk, 1); @@ -640,15 +642,17 @@ void pblk_gc_exit(struct pblk *pblk) if (gc->gc_ts) kthread_stop(gc->gc_ts); + if (gc->gc_reader_ts) + kthread_stop(gc->gc_reader_ts); + + flush_workqueue(gc->gc_reader_wq); if (gc->gc_reader_wq) destroy_workqueue(gc->gc_reader_wq); + flush_workqueue(gc->gc_line_reader_wq); if (gc->gc_line_reader_wq) destroy_workqueue(gc->gc_line_reader_wq); if (gc->gc_writer_ts) kthread_stop(gc->gc_writer_ts); - - if (gc->gc_reader_ts) - kthread_stop(gc->gc_reader_ts); } -- cgit v1.2.3 From c55861926a78bf129e06bd3372b34225f4968757 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:40 +0200 Subject: lightnvm: pblk: add l2p crc debug printouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Print the CRC of the logical-to-physical mapping during exit and after recovering the L2P table to facilitate detection of meta data corruption/recovery issues. The CRC printed after recovery should match the CRC printed during the previous exit - if it doesn't this indicates that either the meta data written to the disk is corrupt or recovery failed. 
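The check itself is a plain CRC over the in-memory table; a minimal sketch (helper name invented) of how the two printouts become comparable:

#include <linux/crc32.h>

/* same seed on both sides, so exit and recovery CRCs can be compared */
static u32 example_table_crc(const void *table, size_t len)
{
	return crc32_le(~(u32)0, table, len);
}
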
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index ad9f014a086b3..52c85f4f672de 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -76,6 +76,28 @@ static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } +static size_t pblk_trans_map_size(struct pblk *pblk) +{ + int entry_size = 8; + + if (pblk->ppaf_bitsize < 32) + entry_size = 4; + + return entry_size * pblk->rl.nr_secs; +} + +#ifdef CONFIG_NVM_DEBUG +static u32 pblk_l2p_crc(struct pblk *pblk) +{ + size_t map_size; + u32 crc = ~(u32)0; + + map_size = pblk_trans_map_size(pblk); + crc = crc32_le(crc, pblk->trans_map, map_size); + return crc; +} +#endif + static void pblk_l2p_free(struct pblk *pblk) { vfree(pblk->trans_map); @@ -85,12 +107,10 @@ static int pblk_l2p_init(struct pblk *pblk) { sector_t i; struct ppa_addr ppa; - int entry_size = 8; - - if (pblk->ppaf_bitsize < 32) - entry_size = 4; + size_t map_size; - pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs); + map_size = pblk_trans_map_size(pblk); + pblk->trans_map = vmalloc(map_size); if (!pblk->trans_map) return -ENOMEM; @@ -508,6 +528,10 @@ static int pblk_lines_configure(struct pblk *pblk, int flags) } } +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + /* Free full lines directly as GC has not been started yet */ pblk_gc_free_full_lines(pblk); @@ -901,6 +925,11 @@ static void pblk_exit(void *private) down_write(&pblk_lock); pblk_gc_exit(pblk); pblk_tear_down(pblk); + +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + pblk_free(pblk); up_write(&pblk_lock); } -- cgit v1.2.3 From d6b992f7ab6279884238d4e2babf100c0879b3d6 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:41 +0200 Subject: lightnvm: pblk: gc all lines in the pipeline before exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finish garbage collect of the lines that are in the gc pipeline before exiting. Ensure that all lines already in in the pipeline goes through, from read to write. Do this by keeping track of how many lines are in the pipeline and waiting for that number to reach zero before exiting the gc reader task. Since we're adding a new gc line counter, change the name of inflight_gc to read_inflight_gc to make the distinction clear. 
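The exit-time drain is then a plain wait on that counter; a condensed sketch with placeholder names:

static atomic_t pipeline_gc = ATOMIC_INIT(0);

/* producers: atomic_inc(&pipeline_gc) when a line enters the pipeline;
 * completion: atomic_dec(&pipeline_gc) once it is fully written back. */
static void example_pipeline_drain(void)
{
	while (atomic_read(&pipeline_gc))
		schedule();
}
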
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 3 +++ drivers/lightnvm/pblk-gc.c | 31 ++++++++++++++++++++++++------- drivers/lightnvm/pblk-sysfs.c | 2 +- drivers/lightnvm/pblk.h | 5 ++++- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 43866ad87586a..1cd27e38fc46a 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1465,6 +1465,7 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line) static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; spin_lock(&line->lock); WARN_ON(line->state != PBLK_LINESTATE_GC); @@ -1473,6 +1474,8 @@ static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) pblk_line_free(pblk, line); spin_unlock(&line->lock); + atomic_dec(&gc->pipeline_gc); + spin_lock(&l_mg->free_lock); list_add_tail(&line->list, &l_mg->free_list); l_mg->nr_free_lines++; diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index e00e5a0743e96..e6fae1959e255 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -234,7 +234,7 @@ out: kfree(invalid_bitmap); kref_put(&line->ref, pblk_line_put); - atomic_dec(&gc->inflight_gc); + atomic_dec(&gc->read_inflight_gc); return; @@ -249,7 +249,7 @@ fail_free_ws: pblk_put_line_back(pblk, line); kref_put(&line->ref, pblk_line_put); - atomic_dec(&gc->inflight_gc); + atomic_dec(&gc->read_inflight_gc); pr_err("pblk: Failed to GC line %d\n", line->id); } @@ -268,6 +268,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) line_ws->pblk = pblk; line_ws->line = line; + atomic_inc(&gc->pipeline_gc); INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); queue_work(gc->gc_reader_wq, &line_ws->ws); @@ -333,6 +334,7 @@ static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) void pblk_gc_free_full_lines(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; struct pblk_line *line; do { @@ -353,6 +355,7 @@ void pblk_gc_free_full_lines(struct pblk *pblk) list_del(&line->list); spin_unlock(&l_mg->gc_lock); + atomic_inc(&gc->pipeline_gc); kref_put(&line->ref, pblk_line_put); } while (1); } @@ -370,12 +373,12 @@ static void pblk_gc_run(struct pblk *pblk) struct pblk_line *line; struct list_head *group_list; bool run_gc; - int inflight_gc, gc_group = 0, prev_group = 0; + int read_inflight_gc, gc_group = 0, prev_group = 0; pblk_gc_free_full_lines(pblk); run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD)) + if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD)) return; next_gc_group: @@ -402,14 +405,14 @@ next_gc_group: list_add_tail(&line->list, &gc->r_list); spin_unlock(&gc->r_lock); - inflight_gc = atomic_inc_return(&gc->inflight_gc); + read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc); pblk_gc_reader_kick(gc); prev_group = 1; /* No need to queue up more GC lines than we can handle */ run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || inflight_gc >= PBLK_GC_L_QD) + if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD) break; } while (1); @@ -470,6 +473,7 @@ static int pblk_gc_writer_ts(void *data) static int pblk_gc_reader_ts(void *data) { struct pblk *pblk = data; + struct pblk_gc *gc = &pblk->gc; while (!kthread_should_stop()) { if (!pblk_gc_read(pblk)) @@ -478,6 +482,18 
@@ static int pblk_gc_reader_ts(void *data) io_schedule(); } +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk: flushing gc pipeline, %d lines left\n", + atomic_read(&gc->pipeline_gc)); +#endif + + do { + if (!atomic_read(&gc->pipeline_gc)) + break; + + schedule(); + } while (1); + return 0; } @@ -586,7 +602,8 @@ int pblk_gc_init(struct pblk *pblk) gc->gc_forced = 0; gc->gc_enabled = 1; gc->w_entries = 0; - atomic_set(&gc->inflight_gc, 0); + atomic_set(&gc->read_inflight_gc, 0); + atomic_set(&gc->pipeline_gc, 0); /* Workqueue that reads valid sectors from a line and submit them to the * GC writer to be recycled. diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index 95fb434e2f01a..cd49e8875d4ec 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -253,7 +253,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) sz += snprintf(page + sz, PAGE_SIZE - sz, "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n", gc_full, gc_high, gc_mid, gc_low, gc_empty, - atomic_read(&pblk->gc.inflight_gc)); + atomic_read(&pblk->gc.read_inflight_gc)); sz += snprintf(page + sz, PAGE_SIZE - sz, "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 29ba7ec32b204..c6f8841973a0c 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -238,7 +238,10 @@ struct pblk_gc { struct timer_list gc_timer; struct semaphore gc_sem; - atomic_t inflight_gc; + atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */ + atomic_t pipeline_gc; /* Number of lines in the GC pipeline - + * started reads to finished writes + */ int w_entries; struct list_head w_list; -- cgit v1.2.3 From 03e868eb8adb28e34f6e695667d230786bfdb653 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:42 +0200 Subject: lightnvm: pblk: correct valid lba count calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During garbage collect, lbas being written can end up being invalidated. Make sure that this is reflected in the valid lba count. 
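Concretely, entries that end up mapped to the empty address marker must not be counted as valid; a standalone illustration of the counting rule (the sentinel below is an example value, not pblk's definition):

#define EXAMPLE_ADDR_EMPTY (~0ULL)	/* stand-in for the padding marker */

static unsigned int count_valid_lbas(const u64 *lba_list, unsigned int nr)
{
	unsigned int i, valid = 0;

	for (i = 0; i < nr; i++)
		if (lba_list[i] != EXAMPLE_ADDR_EMPTY)
			valid++;

	return valid;
}
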
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-map.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 3bc4c94f9cf2d..6f3ecde2140f3 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -45,6 +45,8 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, paddr = pblk_alloc_page(pblk, line, nr_secs); for (i = 0; i < nr_secs; i++, paddr++) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + /* ppa to be sent to the device */ ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); @@ -61,10 +63,9 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, w_ctx->ppa = ppa_list[i]; meta_list[i].lba = cpu_to_le64(w_ctx->lba); lba_list[paddr] = cpu_to_le64(w_ctx->lba); - line->nr_valid_lbas++; + if (lba_list[paddr] != addr_empty) + line->nr_valid_lbas++; } else { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - lba_list[paddr] = meta_list[i].lba = addr_empty; __pblk_map_invalidate(pblk, line, paddr); } -- cgit v1.2.3 From 28bd109411eaa4c541f2e240d1285c154de4dfb7 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:43 +0200 Subject: lightnvm: pblk: remove spinlock when freeing line metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lockdep complains about being in atomic context while freeing line metadata - and rightly so as we take a spinlock and end up calling vfree that might sleep(in pblk_mfree). There is no need for holding the line manager free_lock while freeing line metadata as the pipeline as stopped, so remove the lock. Fixes: 588726d3ec68 ("lightnvm: pblk: fail gracefully on irrec. error") Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 52c85f4f672de..f62112ba5482a 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -393,13 +393,11 @@ static void pblk_line_meta_free(struct pblk *pblk) kfree(l_mg->bb_aux); kfree(l_mg->vsc_list); - spin_lock(&l_mg->free_lock); for (i = 0; i < PBLK_DATA_LINES; i++) { kfree(l_mg->sline_meta[i]); pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type); kfree(l_mg->eline_meta[i]); } - spin_unlock(&l_mg->free_lock); kfree(pblk->lines); } -- cgit v1.2.3 From 8bd400204bd500bb2aea7b551f7c33bad2455340 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:44 +0200 Subject: lightnvm: pblk: cleanup unused and static functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup up unused and static functions across the whole codebase. 
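For each helper that stays, the change is mechanical: the prototype is dropped from pblk.h and the definition becomes static, restricting it to its own .c file, e.g.:

/* before: declared driver-wide in pblk.h */
void pblk_line_mark_bb(struct work_struct *work);

/* after: no header prototype, local to pblk-core.c */
static void pblk_line_mark_bb(struct work_struct *work);
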
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 133 ++++++++++++++++++++----------------------- drivers/lightnvm/pblk-gc.c | 40 ++++++------- drivers/lightnvm/pblk-rl.c | 10 ---- drivers/lightnvm/pblk.h | 14 ++--- 4 files changed, 86 insertions(+), 111 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 1cd27e38fc46a..4199119a0754a 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -18,6 +18,31 @@ #include "pblk.h" +static void pblk_line_mark_bb(struct work_struct *work) +{ + struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, + ws); + struct pblk *pblk = line_ws->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + struct ppa_addr *ppa = line_ws->priv; + int ret; + + ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); + if (ret) { + struct pblk_line *line; + int pos; + + line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; + pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); + + pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", + line->id, pos); + } + + kfree(ppa); + mempool_free(line_ws, pblk->gen_ws_pool); +} + static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, struct ppa_addr *ppa) { @@ -268,7 +293,7 @@ void pblk_end_io_sync(struct nvm_rq *rqd) complete(waiting); } -void pblk_wait_for_meta(struct pblk *pblk) +static void pblk_wait_for_meta(struct pblk *pblk) { do { if (!atomic_read(&pblk->inflight_io)) @@ -345,17 +370,6 @@ void pblk_discard(struct pblk *pblk, struct bio *bio) pblk_invalidate_range(pblk, slba, nr_secs); } -struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba) -{ - struct ppa_addr ppa; - - spin_lock(&pblk->trans_lock); - ppa = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - return ppa; -} - void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) { atomic_long_inc(&pblk->write_failed); @@ -1338,6 +1352,41 @@ static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) pblk->state = PBLK_STATE_STOPPING; } +static void pblk_line_close_meta_sync(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *line, *tline; + LIST_HEAD(list); + + spin_lock(&l_mg->close_lock); + if (list_empty(&l_mg->emeta_list)) { + spin_unlock(&l_mg->close_lock); + return; + } + + list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); + spin_unlock(&l_mg->close_lock); + + list_for_each_entry_safe(line, tline, &list, list) { + struct pblk_emeta *emeta = line->emeta; + + while (emeta->mem < lm->emeta_len[0]) { + int ret; + + ret = pblk_submit_meta_io(pblk, line); + if (ret) { + pr_err("pblk: sync meta line %d failed (%d)\n", + line->id, ret); + return; + } + } + } + + pblk_wait_for_meta(pblk); + flush_workqueue(pblk->close_wq); +} + void pblk_pipeline_stop(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -1565,41 +1614,6 @@ int pblk_line_is_full(struct pblk_line *line) return (line->left_msecs == 0); } -void pblk_line_close_meta_sync(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line, *tline; - LIST_HEAD(list); - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return; - } - - list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); - spin_unlock(&l_mg->close_lock); - - list_for_each_entry_safe(line, tline, &list, list) { - struct 
pblk_emeta *emeta = line->emeta; - - while (emeta->mem < lm->emeta_len[0]) { - int ret; - - ret = pblk_submit_meta_io(pblk, line); - if (ret) { - pr_err("pblk: sync meta line %d failed (%d)\n", - line->id, ret); - return; - } - } - } - - pblk_wait_for_meta(pblk); - flush_workqueue(pblk->close_wq); -} - static void pblk_line_should_sync_meta(struct pblk *pblk) { if (pblk_rl_is_limit(&pblk->rl)) @@ -1673,31 +1687,6 @@ void pblk_line_close_ws(struct work_struct *work) mempool_free(line_ws, pblk->gen_ws_pool); } -void pblk_line_mark_bb(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa = line_ws->priv; - int ret; - - ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); - if (ret) { - struct pblk_line *line; - int pos; - - line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; - pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); - - pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", - line->id, pos); - } - - kfree(ppa); - mempool_free(line_ws, pblk->gen_ws_pool); -} - void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, void (*work)(struct work_struct *), gfp_t gfp_mask, struct workqueue_struct *wq) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index e6fae1959e255..b8323e34d1bcc 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -275,6 +275,26 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) return 0; } +static void pblk_gc_reader_kick(struct pblk_gc *gc) +{ + wake_up_process(gc->gc_reader_ts); +} + +static void pblk_gc_kick(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + pblk_gc_writer_kick(gc); + pblk_gc_reader_kick(gc); + + /* If we're shutting down GC, let's not start it up again */ + if (gc->gc_enabled) { + wake_up_process(gc->gc_ts); + mod_timer(&gc->gc_timer, + jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + } +} + static int pblk_gc_read(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; @@ -298,11 +318,6 @@ static int pblk_gc_read(struct pblk *pblk) return 0; } -static void pblk_gc_reader_kick(struct pblk_gc *gc) -{ - wake_up_process(gc->gc_reader_ts); -} - static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, struct list_head *group_list) { @@ -421,21 +436,6 @@ next_gc_group: goto next_gc_group; } -void pblk_gc_kick(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - pblk_gc_writer_kick(gc); - pblk_gc_reader_kick(gc); - - /* If we're shutting down GC, let's not start it up again */ - if (gc->gc_enabled) { - wake_up_process(gc->gc_ts); - mod_timer(&gc->gc_timer, - jiffies + msecs_to_jiffies(GC_TIME_MSECS)); - } -} - static void pblk_gc_timer(unsigned long data) { struct pblk *pblk = (struct pblk *)data; diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 739f855d42165..abae31fd434e5 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -153,16 +153,6 @@ int pblk_rl_high_thrs(struct pblk_rl *rl) return rl->high; } -int pblk_rl_low_thrs(struct pblk_rl *rl) -{ - return rl->low; -} - -int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) -{ - return rl->rb_user_max; -} - int pblk_rl_max_io(struct pblk_rl *rl) { return rl->rb_max_io; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index c6f8841973a0c..6c9ea9a93704f 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -710,8 +710,6 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int 
type); void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx); -void pblk_wait_for_meta(struct pblk *pblk); -struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba); void pblk_discard(struct pblk *pblk, struct bio *bio); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); @@ -732,10 +730,8 @@ int pblk_line_is_full(struct pblk_line *line); void pblk_line_free(struct pblk *pblk, struct pblk_line *line); void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); void pblk_line_close(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close_meta_sync(struct pblk *pblk); void pblk_line_close_ws(struct work_struct *work); void pblk_pipeline_stop(struct pblk *pblk); -void pblk_line_mark_bb(struct work_struct *work); void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, void (*work)(struct work_struct *), gfp_t gfp_mask, struct workqueue_struct *wq); @@ -759,7 +755,6 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, unsigned long *lun_bitmap); -void pblk_end_bio_sync(struct bio *bio); void pblk_end_io_sync(struct nvm_rq *rqd); int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, int nr_pages); @@ -834,7 +829,6 @@ void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); void pblk_gc_should_kick(struct pblk *pblk); -void pblk_gc_kick(struct pblk *pblk); void pblk_gc_free_full_lines(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); @@ -847,7 +841,6 @@ void pblk_rl_init(struct pblk_rl *rl, int budget); void pblk_rl_free(struct pblk_rl *rl); void pblk_rl_update_rates(struct pblk_rl *rl); int pblk_rl_high_thrs(struct pblk_rl *rl); -int pblk_rl_low_thrs(struct pblk_rl *rl); unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); @@ -855,11 +848,9 @@ void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); -int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); int pblk_rl_max_io(struct pblk_rl *rl); void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); -void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left); int pblk_rl_is_limit(struct pblk_rl *rl); /* @@ -868,6 +859,11 @@ int pblk_rl_is_limit(struct pblk_rl *rl); int pblk_sysfs_init(struct gendisk *tdisk); void pblk_sysfs_exit(struct gendisk *tdisk); +static inline void test(size_t a) +{ + a += 1; +} + static inline void *pblk_malloc(size_t size, int type, gfp_t flags) { if (type == PBLK_KMALLOC_META) -- cgit v1.2.3 From 8da10cce7c7f7f9f5edc77271cf6e0c45b762004 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:45 +0200 Subject: lightnvm: pblk: avoid being reported as hung on rated GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The amount of 
GC I/O on the write buffer is managed by the rate-limiter, which is calculated as a function of the number of available free blocks. When reaching the stable point, we risk having scheduled more I/Os for GC than are allowed on the write buffer. This would result on the GC semaphore balancing the outstanding read GC I/Os to be reported as "hung", though the behavior is normal. Solve this by allowing to schedule when we detect that the read GC path is not moving forward. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index b8323e34d1bcc..00d5698d64a9a 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -218,7 +218,13 @@ next_rq: gc_rq_ws->line = line; gc_rq_ws->priv = gc_rq; - down(&gc->gc_sem); + /* The write GC path can be much slower than the read GC one due to + * the budget imposed by the rate-limiter. Balance in case that we get + * back pressure from the write GC path. + */ + while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000))) + io_schedule(); + kref_get(&line->ref); INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); -- cgit v1.2.3 From 1b839187db62aae22a9254fd9e3ce376149b0918 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:46 +0200 Subject: lightnvm: fail fast on passthrough commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make LightNVM passhtrough commands fail fast. User space will then take care of re-submitting. Fixes: 84d4add793c6 ('lightnvm: add ioctls for vector I/Os') Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 1f79e3f141e64..6017153c24394 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -600,8 +600,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; - rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - if (ppa_buf && ppa_len) { ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); if (!ppa_list) { -- cgit v1.2.3 From 1a94b2d484677dc559c96251dd0e7c7b8811c378 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:47 +0200 Subject: lightnvm: implement generic path for sync I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a generic path for sending sync I/O on LightNVM. This allows to reuse the standard synchronous path trough blk_execute_rq(), instead of implementing a wait_for_completion on the target side (e.g., pblk). 
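As a generic illustration of why this simplifies the targets (all example_* names below are placeholders; the real nvme path goes through blk_execute_rq() rather than a completion wrapper), the change amounts to hoisting the per-caller completion handling into a single synchronous submit helper:

struct example_req {
	void (*end_io)(struct example_req *req);
	void *private;
	int error;
};

static void example_end_io_sync(struct example_req *req)
{
	complete(req->private);
}

/* callers no longer declare their own on-stack completion and timeout */
static int example_submit_sync(struct example_req *req,
			       int (*submit)(struct example_req *))
{
	DECLARE_COMPLETION_ONSTACK(wait);
	int ret;

	req->end_io = example_end_io_sync;
	req->private = &wait;

	ret = submit(req);
	if (ret)
		return ret;

	wait_for_completion_io(&wait);
	return req->error;
}
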
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 25 +++++++++----- drivers/lightnvm/pblk-core.c | 74 ++++++++++++---------------------------- drivers/lightnvm/pblk-read.c | 21 ++---------- drivers/lightnvm/pblk-recovery.c | 31 ++--------------- drivers/lightnvm/pblk.h | 42 +++++++++++++++++++++-- drivers/nvme/host/lightnvm.c | 70 +++++++++++++++++++++++++++++-------- include/linux/lightnvm.h | 3 ++ 7 files changed, 143 insertions(+), 123 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 0e5f77234c79a..fe21f4dd33e91 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -720,12 +720,25 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_submit_io); -static void nvm_end_io_sync(struct nvm_rq *rqd) +int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { - struct completion *waiting = rqd->private; + struct nvm_dev *dev = tgt_dev->parent; + int ret; + + if (!dev->ops->submit_io_sync) + return -ENODEV; + + nvm_rq_tgt_to_dev(tgt_dev, rqd); - complete(waiting); + rqd->dev = tgt_dev; + + /* In case of error, fail with right address format */ + ret = dev->ops->submit_io_sync(dev, rqd); + nvm_rq_dev_to_tgt(tgt_dev, rqd); + + return ret; } +EXPORT_SYMBOL(nvm_submit_io_sync); int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas) @@ -733,25 +746,21 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, struct nvm_geo *geo = &tgt_dev->geo; struct nvm_rq rqd; int ret; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); rqd.opcode = NVM_OP_ERASE; - rqd.end_io = nvm_end_io_sync; - rqd.private = &wait; rqd.flags = geo->plane_mode >> 1; ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); if (ret) return ret; - ret = nvm_submit_io(tgt_dev, &rqd); + ret = nvm_submit_io_sync(tgt_dev, &rqd); if (ret) { pr_err("rrpr: erase I/O submission failed: %d\n", ret); goto free_ppa_list; } - wait_for_completion_io(&wait); free_ppa_list: nvm_free_rqd_ppalist(tgt_dev, &rqd); diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 4199119a0754a..ce90213a42fae 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -412,39 +412,33 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) struct nvm_tgt_dev *dev = pblk->dev; #ifdef CONFIG_NVM_DEBUG - struct ppa_addr *ppa_list; + int ret; - ppa_list = (rqd->nr_ppas > 1) ? 
rqd->ppa_list : &rqd->ppa_addr; - if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { - WARN_ON(1); - return -EINVAL; - } + ret = pblk_check_io(pblk, rqd); + if (ret) + return ret; +#endif - if (rqd->opcode == NVM_OP_PWRITE) { - struct pblk_line *line; - struct ppa_addr ppa; - int i; + atomic_inc(&pblk->inflight_io); - for (i = 0; i < rqd->nr_ppas; i++) { - ppa = ppa_list[i]; - line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + return nvm_submit_io(dev, rqd); +} - spin_lock(&line->lock); - if (line->state != PBLK_LINESTATE_OPEN) { - pr_err("pblk: bad ppa: line:%d,state:%d\n", - line->id, line->state); - WARN_ON(1); - spin_unlock(&line->lock); - return -EINVAL; - } - spin_unlock(&line->lock); - } - } +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + +#ifdef CONFIG_NVM_DEBUG + int ret; + + ret = pblk_check_io(pblk, rqd); + if (ret) + return ret; #endif atomic_inc(&pblk->inflight_io); - return nvm_submit_io(dev, rqd); + return nvm_submit_io_sync(dev, rqd); } static void pblk_bio_map_addr_endio(struct bio *bio) @@ -597,7 +591,6 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, int cmd_op, bio_op; int i, j; int ret; - DECLARE_COMPLETION_ONSTACK(wait); if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; @@ -639,8 +632,6 @@ next_rq: rqd.dma_ppa_list = dma_ppa_list; rqd.opcode = cmd_op; rqd.nr_ppas = rq_ppas; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; if (dir == PBLK_WRITE) { struct pblk_sec_meta *meta_list = rqd.meta_list; @@ -694,19 +685,14 @@ next_rq: } } - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { pr_err("pblk: emeta I/O submission failed: %d\n", ret); bio_put(bio); goto free_rqd_dma; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: emeta I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); if (rqd.error) { if (dir == PBLK_WRITE) @@ -750,7 +736,6 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, int i, ret; int cmd_op, bio_op; int flags; - DECLARE_COMPLETION_ONSTACK(wait); if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; @@ -787,8 +772,6 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, rqd.opcode = cmd_op; rqd.flags = flags; rqd.nr_ppas = lm->smeta_sec; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; for (i = 0; i < lm->smeta_sec; i++, paddr++) { struct pblk_sec_meta *meta_list = rqd.meta_list; @@ -807,17 +790,13 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, * the write thread is the only one sending write and erase commands, * there is no need to take the LUN semaphore. */ - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { pr_err("pblk: smeta I/O submission failed: %d\n", ret); bio_put(bio); goto free_ppa_list; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: smeta I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); if (rqd.error) { @@ -861,19 +840,15 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) { struct nvm_rq rqd; int ret = 0; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); pblk_setup_e_rq(pblk, &rqd, ppa); - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; - /* The write thread schedules erases so that it minimizes disturbances * with writes. 
Thus, there is no need to take the LUN semaphore. */ - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; @@ -886,11 +861,6 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) goto out; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: sync erase timed out\n"); - } - out: rqd.private = pblk; __pblk_end_io_erase(pblk, &rqd); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 71c58503f1a45..ca79d8fb3e607 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -207,7 +207,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, int nr_secs = rqd->nr_ppas; int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); int i, ret, hole; - DECLARE_COMPLETION_ONSTACK(wait); /* Re-use allocated memory for intermediate lbas */ lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); @@ -232,8 +231,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->bio = new_bio; rqd->nr_ppas = nr_holes; rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (unlikely(nr_holes == 1)) { ppa_ptr = rqd->ppa_list; @@ -241,18 +238,13 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->ppa_addr = rqd->ppa_list[0]; } - ret = pblk_submit_read_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { bio_put(rqd->bio); - pr_err("pblk: read IO submission failed\n"); + pr_err("pblk: sync read IO submission failed\n"); goto err; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: partial read I/O timed out\n"); - } - if (rqd->error) { atomic_long_inc(&pblk->read_failed); #ifdef CONFIG_NVM_DEBUG @@ -537,7 +529,6 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) struct nvm_rq rqd; int data_len; int ret = NVM_IO_OK; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); @@ -577,22 +568,16 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) bio_set_op_attrs(bio, REQ_OP_READ, 0); rqd.opcode = NVM_OP_PREAD; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; rqd.nr_ppas = gc_rq->secs_to_gc; rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd.bio = bio; - if (pblk_submit_read_io(pblk, &rqd)) { + if (pblk_submit_io_sync(pblk, &rqd)) { ret = -EIO; pr_err("pblk: GC read request failed\n"); goto err_free_bio; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: GC read I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); if (rqd.error) { diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 9772a947ca4fb..eadb3eb5d4dc8 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -216,7 +216,6 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, int rq_ppas, rq_len; int i, j; int ret = 0; - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -253,8 +252,6 @@ next_read_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -280,19 
+277,13 @@ next_read_rq: } /* If read fails, more padding is needed */ - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - return -EINTR; - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* At this point, the read should not fail. If it does, it is a problem * we cannot recover from here. Need FTL log. @@ -504,7 +495,6 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, int ret = 0; int rec_round; int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec; - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -539,8 +529,6 @@ next_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -566,18 +554,13 @@ next_rq: addr_to_gen_ppa(pblk, w_ptr, line->id); } - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* This should not happen since the read failed during normal recovery, * but the media works funny sometimes... @@ -645,7 +628,6 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, int i, j; int ret = 0; int left_ppas = pblk_calc_sec_in_line(pblk, line); - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -678,8 +660,6 @@ next_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -705,19 +685,14 @@ next_rq: addr_to_gen_ppa(pblk, paddr, line->id); } - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); bio_put(bio); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* Reached the end of the written line */ if (rqd->error) { diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 6c9ea9a93704f..6b64288de6f76 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -714,6 +714,7 @@ void pblk_discard(struct pblk *pblk, struct bio *bio); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd); int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, @@ -1203,7 +1204,6 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); } -#endif static inline int 
pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas) @@ -1224,14 +1224,50 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, ppa->g.sec < geo->sec_per_pg) continue; -#ifdef CONFIG_NVM_DEBUG print_ppa(ppa, "boundary", i); -#endif + return 1; } return 0; } +static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct ppa_addr *ppa_list; + + ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; + + if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { + WARN_ON(1); + return -EINVAL; + } + + if (rqd->opcode == NVM_OP_PWRITE) { + struct pblk_line *line; + struct ppa_addr ppa; + int i; + + for (i = 0; i < rqd->nr_ppas; i++) { + ppa = ppa_list[i]; + line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + + spin_lock(&line->lock); + if (line->state != PBLK_LINESTATE_OPEN) { + pr_err("pblk: bad ppa: line:%d,state:%d\n", + line->id, line->state); + WARN_ON(1); + spin_unlock(&line->lock); + return -EINVAL; + } + spin_unlock(&line->lock); + } + } + + return 0; +} +#endif + static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) { struct pblk_line_meta *lm = &pblk->lm; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 6017153c24394..8fc949c5b49b7 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -492,34 +492,47 @@ static void nvme_nvm_end_io(struct request *rq, blk_status_t status) blk_mq_free_request(rq); } -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static struct request *nvme_nvm_alloc_request(struct request_queue *q, + struct nvm_rq *rqd, + struct nvme_nvm_command *cmd) { - struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct request *rq; - struct bio *bio = rqd->bio; - struct nvme_nvm_command *cmd; - - cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); - if (!cmd) - return -ENOMEM; nvme_nvm_rqtocmd(rqd, ns, cmd); rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); - if (IS_ERR(rq)) { - kfree(cmd); - return PTR_ERR(rq); - } + if (IS_ERR(rq)) + return rq; + rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - if (bio) { - blk_init_request_from_bio(rq, bio); + if (rqd->bio) { + blk_init_request_from_bio(rq, rqd->bio); } else { rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); rq->__data_len = 0; } + return rq; +} + +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct nvme_nvm_command *cmd; + struct request *rq; + + cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + rq = nvme_nvm_alloc_request(q, rqd, cmd); + if (IS_ERR(rq)) { + kfree(cmd); + return PTR_ERR(rq); + } + rq->end_io_data = rqd; blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); @@ -527,6 +540,34 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) return 0; } +static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct request *rq; + struct nvme_nvm_command cmd; + int ret = 0; + + memset(&cmd, 0, sizeof(struct nvme_nvm_command)); + + rq = nvme_nvm_alloc_request(q, rqd, &cmd); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + /* I/Os can fail and the error is signaled through rqd. Callers must + * handle the error accordingly. 
+ */ + blk_execute_rq(q, NULL, rq, 0); + if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + + rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); + rqd->error = nvme_req(rq)->status; + + blk_mq_free_request(rq); + + return ret; +} + static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { struct nvme_ns *ns = nvmdev->q->queuedata; @@ -562,6 +603,7 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .set_bb_tbl = nvme_nvm_set_bb_tbl, .submit_io = nvme_nvm_submit_io, + .submit_io_sync = nvme_nvm_submit_io_sync, .create_dma_pool = nvme_nvm_create_dma_pool, .destroy_dma_pool = nvme_nvm_destroy_dma_pool, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 4f0e4a0fd2047..b7f111ff4d3b3 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -56,6 +56,7 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -69,6 +70,7 @@ struct nvm_dev_ops { nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; + nvm_submit_io_sync_fn *submit_io_sync; nvm_create_dma_pool_fn *create_dma_pool; nvm_destroy_dma_pool_fn *destroy_dma_pool; @@ -477,6 +479,7 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); +extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From cdd094fd0ad750c94ccaa5b2ee84fd8264f6f433 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Oct 2017 09:36:06 -0600 Subject: Revert "lightnvm: prevent bd removal if busy" Christoph correctly points out that this issue is no different for other block devices, and poking at cross layer internals is not the right way to solve it. This reverts commit bb6aa6f08268bbce4e0185b18cab9e04505d6695. 
Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index fe21f4dd33e91..83249b43dd069 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -390,7 +390,6 @@ static void __nvm_remove_target(struct nvm_target *t) static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) { struct nvm_target *t; - struct block_device *bdev; mutex_lock(&dev->mlock); t = nvm_find_target(dev, remove->tgtname); @@ -398,19 +397,6 @@ static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) mutex_unlock(&dev->mlock); return 1; } - bdev = bdget_disk(t->disk, 0); - if (!bdev) { - pr_err("nvm: removal failed, allocating bd failed\n"); - mutex_unlock(&dev->mlock); - return -ENOMEM; - } - if (bdev->bd_super || bdev->bd_part_count) { - pr_err("nvm: removal failed, block device busy\n"); - bdput(bdev); - mutex_unlock(&dev->mlock); - return -EBUSY; - } - bdput(bdev); __nvm_remove_target(t); mutex_unlock(&dev->mlock); -- cgit v1.2.3 From 23c4490d2b7f78a855f9c5675e26735fb61afe84 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 14 Oct 2017 00:26:28 +0800 Subject: null_blk: update usage hints for submit_queues update the range of submits_queues, and correct usage hints. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- Documentation/block/null_blk.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index 3140dbd860d8c..1f8d92c704585 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -55,10 +55,10 @@ irqmode=[0-2]: Default: 1-Soft-irq completion_nsec=[ns]: Default: 10.000ns Combined with irqmode=2 (timer). The time each completion event must wait. -submit_queues=[0..nr_cpus]: +submit_queues=[1..nr_cpus]: The number of submission queues attached to the device driver. If unset, it - defaults to 1 on single-queue and bio-based instances. For multi-queue, - it is ignored when use_per_node_hctx module parameter is 1. + defaults to 1. For multi-queue, it is ignored when use_per_node_hctx module + parameter is 1. hw_queue_depth=[0..qdepth]: Default: 64 The hardware queue depth of the device. -- cgit v1.2.3 From fc186311f2233bb3f69f22b96babdf16a57c88a9 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 14 Oct 2017 00:26:54 +0800 Subject: null_blk: add usage hints for no_sched This parameter provide an option to disable io scheduler when nullb* in multi-queue mode. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- Documentation/block/null_blk.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index 1f8d92c704585..b7161c8e70f2e 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -73,3 +73,7 @@ use_per_node_hctx=[0/1]: Default: 0 use_lightnvm=[0/1]: Default: 0 Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled. + +no_sched=[0/1]: Default: 0 + 0: nullb* use default blk-mq io scheduler. + 1: nullb* doesn't use io scheduler. -- cgit v1.2.3 From 515c24c13c7ff1262cdb40fe631c6391e99c0996 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 14 Oct 2017 16:38:27 +0800 Subject: mm/page-writeback.c: make changes of dirty_writeback_centisecs take effect immediately This patch is the followup of the prvious patch: [writeback: schedule periodic writeback with sysctl]. 
There's another issue to fix. For example, - When the tunable was set to one hour and is reset to one second, the new setting will not take effect for up to one hour. Kicking the flusher threads immediately fixes it. Cc: Jens Axboe Cc: Jan Kara Cc: Andrew Morton Signed-off-by: Yafang Shao Signed-off-by: Jens Axboe --- mm/page-writeback.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 622a18c114caa..c518c845f2025 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1976,7 +1976,16 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, int ret; ret = proc_dointvec(table, write, buffer, length, ppos); - if (!ret && !old_interval && dirty_writeback_interval) + + /* + * Writing 0 to dirty_writeback_interval will disable periodic writeback + * and a different non-zero value will wakeup the writeback threads. + * wb_wakeup_delayed() would be more appropriate, but it's a pain to + * iterate over all bdis and wbs. + * The reason we do this is to make the change take effect immediately. + */ + if (!ret && write && dirty_writeback_interval && + dirty_writeback_interval != old_interval) wakeup_flusher_threads(WB_REASON_PERIODIC); return ret; -- cgit v1.2.3 From 761f2e1ed8822cf12378c857de337290d0c82a81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 18:46:46 +0200 Subject: nvme: simplify compat_ioctl handling We can just use our normal ioctl handler for the compat case and remove the boilerplate code for it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy --- drivers/nvme/host/core.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 26c8913435b25..573cc3b59bfad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1052,16 +1052,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, } } -#ifdef CONFIG_COMPAT -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif - static int nvme_open(struct block_device *bdev, fmode_t mode) { return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO; @@ -1380,7 +1370,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, + .compat_ioctl = nvme_ioctl, .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, -- cgit v1.2.3 From 58f913dce2814a9ea7260e93ed3a949e0d5565e3 Mon Sep 17 00:00:00 2001 From: Peter Foley Date: Fri, 13 Oct 2017 16:35:28 -0700 Subject: bcache: Avoid nested function definition Fixes below error with clang: ../drivers/md/bcache/sysfs.c:759:3: error: function definition is not allowed here { return *((uint16_t *) r) - *((uint16_t *) l); } ^ ../drivers/md/bcache/sysfs.c:789:32: error: use of undeclared identifier 'cmp' sort(p, n, sizeof(uint16_t), cmp, NULL); ^ 2 errors generated. 
v2: rename function to __bch_cache_cmp Signed-off-by: Peter Foley Reviewed-by: Coly Li Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 104c57cd666ce..69f355b9650ca 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -745,6 +745,11 @@ static struct attribute *bch_cache_set_internal_files[] = { }; KTYPE(bch_cache_set_internal); +static int __bch_cache_cmp(const void *l, const void *r) +{ + return *((uint16_t *)r) - *((uint16_t *)l); +} + SHOW(__bch_cache) { struct cache *ca = container_of(kobj, struct cache, kobj); @@ -769,9 +774,6 @@ SHOW(__bch_cache) CACHE_REPLACEMENT(&ca->sb)); if (attr == &sysfs_priority_stats) { - int cmp(const void *l, const void *r) - { return *((uint16_t *) r) - *((uint16_t *) l); } - struct bucket *b; size_t n = ca->sb.nbuckets, i; size_t unused = 0, available = 0, dirty = 0, meta = 0; @@ -800,7 +802,7 @@ SHOW(__bch_cache) p[i] = ca->buckets[i].prio; mutex_unlock(&ca->set->bucket_lock); - sort(p, n, sizeof(uint16_t), cmp, NULL); + sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL); while (n && !cached[n - 1]) -- cgit v1.2.3 From 91af8300d9c1d7c6b6a2fd754109e08d4798b8d8 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 13 Oct 2017 16:35:29 -0700 Subject: bcache: check ca->alloc_thread initialized before wake up it In bcache code, sysfs entries are created before all resources get allocated, e.g. allocation thread of a cache set. There is posibility for NULL pointer deference if a resource is accessed but which is not initialized yet. Indeed Jorg Bornschein catches one on cache set allocation thread and gets a kernel oops. The reason for this bug is, when bch_bucket_alloc() is called during cache set registration and attaching, ca->alloc_thread is not properly allocated and initialized yet, call wake_up_process() on ca->alloc_thread triggers NULL pointer deference failure. A simple and fast fix is, before waking up ca->alloc_thread, checking whether it is allocated, and only wake up ca->alloc_thread when it is not NULL. Signed-off-by: Coly Li Reported-by: Jorg Bornschein Cc: Kent Overstreet Cc: stable@vger.kernel.org Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index cacbe2dbd5c31..e1a4205caa2e1 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -406,7 +406,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) finish_wait(&ca->set->bucket_wait, &w); out: - wake_up_process(ca->alloc_thread); + if (ca->alloc_thread) + wake_up_process(ca->alloc_thread); trace_bcache_alloc(ca, reserve); -- cgit v1.2.3 From b1e8139e48b58e3bc1234e619c750ffd1394be2f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 13 Oct 2017 16:35:30 -0700 Subject: bcache: fix a comments typo in bch_alloc_sectors() Code comments in alloc.c:bch_alloc_sectors() mentions a function name find_data_bucket(), the correct function name should be pick_data_bucket() indeed. bch_alloc_sectors() is a quite important function in bcache allocation code, fixing the typo may help other people to have less confusion. 
Signed-off-by: Coly Li Reviewed-by: Tang Junhui Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e1a4205caa2e1..4c40870e99f5a 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -601,7 +601,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, /* * If we had to allocate, we might race and not need to allocate the - * second time we call find_data_bucket(). If we allocated a bucket but + * second time we call pick_data_bucket(). If we allocated a bucket but * didn't use it, drop the refcount bch_bucket_alloc_set() took: */ if (KEY_PTRS(&alloc.key)) -- cgit v1.2.3 From 1dbe32ad0a82f39c6dfb7667c5da5c23b9333664 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 13 Oct 2017 16:35:31 -0700 Subject: bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. 
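To make the index/minor separation concrete, here is a minimal worked example (illustrative values only; it assumes the BCACHE_MINORS of 128 and the usual 20-bit MINORBITS, matching the helpers added to super.c below):

/* Sketch of the index <-> first_minor mapping; numbers are examples. */
#define BCACHE_MINORS          128     /* partitions per bcache device */
#define BCACHE_DEVICE_IDX_MAX  ((1U << MINORBITS) / BCACHE_MINORS)  /* 8192 */

/*
 * Device index 2 is named /dev/bcache2 and gets
 *      first_minor = idx_to_first_minor(2) = 2 * 128 = 256,
 * reserving minors 256..383 for its partitions.  On removal,
 *      first_minor_to_idx(256) = 256 / 128 = 2
 * recovers the single bit to release from the bcache_device_idx ida.
 */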
[minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li Cc: Eric Wheeler Cc: Junhui Tang Reviewed-by: Michael Lyle Signed-off-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fc0a31b13ac42..a478d1ac0480c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -53,12 +53,15 @@ LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); static int bcache_major; -static DEFINE_IDA(bcache_minor); +static DEFINE_IDA(bcache_device_idx); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) -#define BCACHE_MINORS 16 /* partition support */ +/* limitation of partitions number on single bcache device */ +#define BCACHE_MINORS 128 +/* limitation of bcache devices number on single system */ +#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS) /* Superblock */ @@ -721,6 +724,16 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, closure_get(&c->caching); } +static inline int first_minor_to_idx(int first_minor) +{ + return (first_minor/BCACHE_MINORS); +} + +static inline int idx_to_first_minor(int idx) +{ + return (idx * BCACHE_MINORS); +} + static void bcache_device_free(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); @@ -734,7 +747,8 @@ static void bcache_device_free(struct bcache_device *d) if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); if (d->disk) { - ida_simple_remove(&bcache_minor, d->disk->first_minor); + ida_simple_remove(&bcache_device_idx, + first_minor_to_idx(d->disk->first_minor)); put_disk(d->disk); } @@ -751,7 +765,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, { struct request_queue *q; size_t n; - int minor; + int idx; if (!d->stripe_size) d->stripe_size = 1 << 31; @@ -776,25 +790,24 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->full_dirty_stripes) return -ENOMEM; - minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); - if (minor < 0) - return minor; - - minor *= BCACHE_MINORS; + idx = ida_simple_get(&bcache_device_idx, 0, + BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); + if (idx < 0) + return idx; if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS | BIOSET_NEED_RESCUER)) || !(d->disk = alloc_disk(BCACHE_MINORS))) { - ida_simple_remove(&bcache_minor, minor); + ida_simple_remove(&bcache_device_idx, idx); return -ENOMEM; } set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); d->disk->major = bcache_major; - d->disk->first_minor = minor; + d->disk->first_minor = idx_to_first_minor(idx); d->disk->fops = &bcache_ops; d->disk->private_data = d; -- cgit v1.2.3 From e89d67596e202119ea846cc997f4cf75cb284490 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Fri, 13 Oct 2017 16:35:32 -0700 Subject: bcache: Remove redundant set_capacity set_capacity() has been called in bcache_device_init(), remove the redundant one. 
Signed-off-by: Yijing Wang Reviewed-by: Eric Wheeler Acked-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a478d1ac0480c..72c3b2929ef02 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1142,9 +1142,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) if (ret) return ret; - set_capacity(dc->disk.disk, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - dc->disk.disk->queue->backing_dev_info->ra_pages = max(dc->disk.disk->queue->backing_dev_info->ra_pages, q->backing_dev_info->ra_pages); -- cgit v1.2.3 From b41c9b0266e8370033a7799f6806bfc70b7fd75f Mon Sep 17 00:00:00 2001 From: Eric Wheeler Date: Fri, 13 Oct 2017 16:35:33 -0700 Subject: bcache: update bio->bi_opf bypass/writeback REQ_ flag hints Flag for bypass if the IO is for read-ahead or background, unless the read-ahead request is for metadata (eg, from gfs2). Bypass if: bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && !(bio->bi_opf & REQ_META)) Writeback if: op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) Signed-off-by: Eric Wheeler Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 8 ++++++++ drivers/md/bcache/writeback.h | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 681b4f12b05a0..9ee137e8d387b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -384,6 +384,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) op_is_write(bio_op(bio)))) goto skip; + /* + * Flag for bypass if the IO is for read-ahead or background, + * unless the read-ahead request is for metadata (eg, for gfs2). + */ + if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && + !(bio->bi_opf & REQ_META)) + goto skip; + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || bio_sectors(bio) & (c->sb.block_size - 1)) { pr_debug("skipping unaligned io"); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index e35421d20d2eb..34bcf49d737bd 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -76,7 +76,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; - return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK; + return (op_is_sync(bio->bi_opf) || + bio->bi_opf & (REQ_META|REQ_PRIO) || + in_use <= CUTOFF_WRITEBACK); } static inline void bch_writeback_queue(struct cached_dev *dc) -- cgit v1.2.3 From 238501027abf0386fa5f5dcaf589f406eb187bc3 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Fri, 13 Oct 2017 16:35:34 -0700 Subject: bcache: remove unused parameter Parameter bio is no longer used, clean it. 
Signed-off-by: Yijing Wang Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 9ee137e8d387b..163a17a808741 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -26,12 +26,12 @@ struct kmem_cache *bch_search_cache; static void bch_data_insert_start(struct closure *); -static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) +static unsigned cache_mode(struct cached_dev *dc) { return BDEV_CACHE_MODE(&dc->sb); } -static bool verify(struct cached_dev *dc, struct bio *bio) +static bool verify(struct cached_dev *dc) { return dc->verify; } @@ -369,7 +369,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) { struct cache_set *c = dc->disk.c; - unsigned mode = cache_mode(dc, bio); + unsigned mode = cache_mode(dc); unsigned sectors, congested = bch_get_congested(c); struct task_struct *task = current; struct io *i; @@ -747,7 +747,7 @@ static void cached_dev_read_done(struct closure *cl) s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) + if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); bio_complete(s); @@ -772,7 +772,7 @@ static void cached_dev_read_done_bh(struct closure *cl) if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); - else if (s->iop.bio || verify(dc, &s->bio.bio)) + else if (s->iop.bio || verify(dc)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); @@ -899,7 +899,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) s->iop.bypass = true; if (should_writeback(dc, s->orig_bio, - cache_mode(dc, bio), + cache_mode(dc), s->iop.bypass)) { s->iop.bypass = false; s->iop.writeback = true; -- cgit v1.2.3 From 5fa89fb9a86bcc0f0b3f21ab6087a8a4170dcd2c Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:35 -0700 Subject: bcache: don't write back data if reading it failed If an IO operation fails, and we didn't successfully read data from the cache, don't writeback invalid/partial data to the backing disk. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index e663ca082183e..5e65a392287d0 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -179,13 +179,21 @@ static void write_dirty(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; - dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); - io->bio.bi_iter.bi_sector = KEY_START(&w->key); - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; + /* + * IO errors are signalled using the dirty bit on the key. + * If we failed to read, we should not attempt to write to the + * backing device. Instead, immediately go to write_dirty_finish + * to clean up. 
+ */ + if (KEY_DIRTY(&w->key)) { + dirty_init(w); + bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); + io->bio.bi_iter.bi_sector = KEY_START(&w->key); + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; - closure_bio_submit(&io->bio, cl); + closure_bio_submit(&io->bio, cl); + } continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } -- cgit v1.2.3 From 1d316e658374f700fdfff9299c70ce65d8d145e6 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:36 -0700 Subject: bcache: implement PI controller for writeback rate bcache uses a control system to attempt to keep the amount of dirty data in cache at a user-configured level, while not responding excessively to transients and variations in write rate. Previously, the system was a PD controller; but the output from it was integrated, turning the Proportional term into an Integral term, and turning the Derivative term into a crude Proportional term. Performance of the controller has been uneven in production, and it has tended to respond slowly, oscillate, and overshoot. This patch set replaces the current control system with an explicit PI controller and tuning that should be correct for most hardware. By default, it attempts to write at a rate that would retire 1/40th of the current excess blocks per second. An integral term in turn works to remove steady state errors. IMO, this yields benefits in simplicity (removing weighted average filtering, etc) and system performance. Another small change is a tunable parameter is introduced to allow the user to specify a minimum rate at which dirty blocks are retired. There is a slight difference from earlier versions of the patch in integral handling to prevent excessive negative integral windup. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 9 ++--- drivers/md/bcache/sysfs.c | 18 +++++---- drivers/md/bcache/writeback.c | 91 ++++++++++++++++++++++++------------------- 3 files changed, 66 insertions(+), 52 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 2ed9bd231d84b..eb83be693d60b 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -265,9 +265,6 @@ struct bcache_device { atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; - unsigned long sectors_dirty_last; - long sectors_dirty_derivative; - struct bio_set *bio_split; unsigned data_csum:1; @@ -362,12 +359,14 @@ struct cached_dev { uint64_t writeback_rate_target; int64_t writeback_rate_proportional; - int64_t writeback_rate_derivative; + int64_t writeback_rate_integral; + int64_t writeback_rate_integral_scaled; int64_t writeback_rate_change; unsigned writeback_rate_update_seconds; - unsigned writeback_rate_d_term; + unsigned writeback_rate_i_term_inverse; unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; }; enum alloc_reserve { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 69f355b9650ca..2290bffd49228 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -81,8 +81,9 @@ rw_attribute(writeback_delay); rw_attribute(writeback_rate); rw_attribute(writeback_rate_update_seconds); -rw_attribute(writeback_rate_d_term); +rw_attribute(writeback_rate_i_term_inverse); rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_minimum); read_attribute(writeback_rate_debug); read_attribute(stripe_size); @@ -130,15 +131,16 @@ SHOW(__bch_cached_dev) sysfs_hprint(writeback_rate, dc->writeback_rate.rate 
<< 9); var_print(writeback_rate_update_seconds); - var_print(writeback_rate_d_term); + var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_minimum); if (attr == &sysfs_writeback_rate_debug) { char rate[20]; char dirty[20]; char target[20]; char proportional[20]; - char derivative[20]; + char integral[20]; char change[20]; s64 next_io; @@ -146,7 +148,7 @@ SHOW(__bch_cached_dev) bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); bch_hprint(proportional,dc->writeback_rate_proportional << 9); - bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); bch_hprint(change, dc->writeback_rate_change << 9); next_io = div64_s64(dc->writeback_rate.next - local_clock(), @@ -157,11 +159,11 @@ SHOW(__bch_cached_dev) "dirty:\t\t%s\n" "target:\t\t%s\n" "proportional:\t%s\n" - "derivative:\t%s\n" + "integral:\t%s\n" "change:\t\t%s/sec\n" "next io:\t%llims\n", rate, dirty, target, proportional, - derivative, change, next_io); + integral, change, next_io); } sysfs_hprint(dirty_data, @@ -213,7 +215,7 @@ STORE(__cached_dev) dc->writeback_rate.rate, 1, INT_MAX); d_strtoul_nonzero(writeback_rate_update_seconds); - d_strtoul(writeback_rate_d_term); + d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); d_strtoi_h(sequential_cutoff); @@ -319,7 +321,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_percent, &sysfs_writeback_rate, &sysfs_writeback_rate_update_seconds, - &sysfs_writeback_rate_d_term, + &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_debug, &sysfs_dirty_data, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 5e65a392287d0..cac8678da5d06 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -25,48 +25,62 @@ static void __update_writeback_rate(struct cached_dev *dc) bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); - int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), c->cached_dev_sectors); - /* PD controller */ - + /* + * PI controller: + * Figures out the amount that should be written per second. + * + * First, the error (number of sectors that are dirty beyond our + * target) is calculated. The error is accumulated (numerically + * integrated). + * + * Then, the proportional value and integral value are scaled + * based on configured values. These are stored as inverses to + * avoid fixed point math and to make configuration easy-- e.g. + * the default value of 40 for writeback_rate_p_term_inverse + * attempts to write at a rate that would retire all the dirty + * blocks in 40 seconds. + * + * The writeback_rate_i_inverse value of 10000 means that 1/10000th + * of the error is accumulated in the integral term per second. + * This acts as a slow, long-term average that is not subject to + * variations in usage like the p term. 
+ */ int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); - int64_t derivative = dirty - dc->disk.sectors_dirty_last; - int64_t proportional = dirty - target; - int64_t change; - - dc->disk.sectors_dirty_last = dirty; - - /* Scale to sectors per second */ - - proportional *= dc->writeback_rate_update_seconds; - proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); - - derivative = div_s64(derivative, dc->writeback_rate_update_seconds); - - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - (dc->writeback_rate_d_term / - dc->writeback_rate_update_seconds) ?: 1, 0); - - derivative *= dc->writeback_rate_d_term; - derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); - - change = proportional + derivative; + int64_t error = dirty - target; + int64_t proportional_scaled = + div_s64(error, dc->writeback_rate_p_term_inverse); + int64_t integral_scaled, new_rate; + + if ((error < 0 && dc->writeback_rate_integral > 0) || + (error > 0 && time_before64(local_clock(), + dc->writeback_rate.next + NSEC_PER_MSEC))) { + /* + * Only decrease the integral term if it's more than + * zero. Only increase the integral term if the device + * is keeping up. (Don't wind up the integral + * ineffectively in either case). + * + * It's necessary to scale this by + * writeback_rate_update_seconds to keep the integral + * term dimensioned properly. + */ + dc->writeback_rate_integral += error * + dc->writeback_rate_update_seconds; + } - /* Don't increase writeback rate if the device isn't keeping up */ - if (change > 0 && - time_after64(local_clock(), - dc->writeback_rate.next + NSEC_PER_MSEC)) - change = 0; + integral_scaled = div_s64(dc->writeback_rate_integral, + dc->writeback_rate_i_term_inverse); - dc->writeback_rate.rate = - clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, - 1, NSEC_PER_MSEC); + new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_MSEC); - dc->writeback_rate_proportional = proportional; - dc->writeback_rate_derivative = derivative; - dc->writeback_rate_change = change; + dc->writeback_rate_proportional = proportional_scaled; + dc->writeback_rate_integral_scaled = integral_scaled; + dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; + dc->writeback_rate.rate = new_rate; dc->writeback_rate_target = target; } @@ -499,8 +513,6 @@ void bch_sectors_dirty_init(struct bcache_device *d) bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - - d->sectors_dirty_last = bcache_dev_sectors_dirty(d); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -514,10 +526,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_percent = 10; dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; + dc->writeback_rate_minimum = 1; dc->writeback_rate_update_seconds = 5; - dc->writeback_rate_d_term = 30; - dc->writeback_rate_p_term_inverse = 6000; + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } -- cgit v1.2.3 From ae82ddbfeb359fcffa97be5fb5bcd59165f2864f Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:37 -0700 Subject: bcache: smooth writeback rate control This works in conjunction with the new PI controller. Currently, in real-world workloads, the rate controller attempts to write back 1 sector per second. 
In practice, these minimum-rate writebacks are between 4k and 60k in test scenarios, since bcache aggregates and attempts to do contiguous writes and because filesystems on top of bcachefs typically write 4k or more. Previously, bcache used to guarantee to write at least once per second. This means that the actual writeback rate would exceed the configured amount by a factor of 8-120 or more. This patch adjusts to be willing to sleep up to 2.5 seconds, and to target writing 4k/second. On the smallest writes, it will sleep 1 second like before, but many times it will sleep longer and load the backing device less. This keeps the loading on the cache and backing device related to writeback more consistent when writing back at low rates. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/util.c | 10 ++++++++-- drivers/md/bcache/writeback.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 176d3c2ef5f5c..4dbe37e828778 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) d->next += div_u64(done * NSEC_PER_SEC, d->rate); - if (time_before64(now + NSEC_PER_SEC, d->next)) - d->next = now + NSEC_PER_SEC; + /* Bound the time. Don't let us fall further than 2 seconds behind + * (this prevents unnecessary backlog that would make it impossible + * to catch up). If we're ahead of the desired writeback rate, + * don't let us sleep more than 2.5 seconds (so we can notice/respond + * if the control system tells us to speed up!). + */ + if (time_before64(now + NSEC_PER_SEC * 5 / 2, d->next)) + d->next = now + NSEC_PER_SEC * 5 / 2; if (time_after64(now - NSEC_PER_SEC * 2, d->next)) d->next = now - NSEC_PER_SEC * 2; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index cac8678da5d06..8deb721c355e9 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -526,7 +526,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_percent = 10; dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; - dc->writeback_rate_minimum = 1; + dc->writeback_rate_minimum = 8; dc->writeback_rate_update_seconds = 5; dc->writeback_rate_p_term_inverse = 40; -- cgit v1.2.3 From e41166c5c44e30dbd620f7c77a27efe5d5cc551a Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:38 -0700 Subject: bcache: writeback rate shouldn't artifically clamp The previous code artificially limited writeback rate to 1000000 blocks/second (NSEC_PER_MSEC), which is a rate that can be met on fast hardware. The rate limiting code works fine (though with decreased precision) up to 3 orders of magnitude faster, so use NSEC_PER_SEC. Additionally, ensure that uint32_t is used as a type for rate throughout the rate management so that type checking/clamp_t can work properly. bch_next_delay should be rewritten for increased precision and better handling of high rates and long sleep periods, but this is adequate for now. 
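A rough back-of-the-envelope check of the new bound (illustrative numbers only; it assumes the rate is expressed in sectors per second, as elsewhere in this series):

/*
 * With the clamp raised from NSEC_PER_MSEC to NSEC_PER_SEC the rate still
 * fits a uint32_t (10^9 < 2^32), and the delay bch_next_delay() adds per
 * call stays plain integer math:
 *
 *      delta_ns = div_u64(done * NSEC_PER_SEC, rate);
 *
 * e.g. done = 1024 sectors at rate = 10^9 gives delta_ns = 1024 ns, while
 * at the old 10^6 ceiling the same work accounted for ~1 us per sector.
 * Precision drops near the top of the range, but for realistic per-call
 * 'done' counts the 64-bit multiply does not overflow.
 */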
Signed-off-by: Michael Lyle Reported-by: Coly Li Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/util.h | 4 ++-- drivers/md/bcache/writeback.c | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index eb83be693d60b..d77c4829c4972 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -361,7 +361,7 @@ struct cached_dev { int64_t writeback_rate_proportional; int64_t writeback_rate_integral; int64_t writeback_rate_integral_scaled; - int64_t writeback_rate_change; + int32_t writeback_rate_change; unsigned writeback_rate_update_seconds; unsigned writeback_rate_i_term_inverse; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cb8d2ccbb6c6e..8f509290bb02e 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -441,10 +441,10 @@ struct bch_ratelimit { uint64_t next; /* - * Rate at which we want to do work, in units per nanosecond + * Rate at which we want to do work, in units per second * The units here correspond to the units passed to bch_next_delay() */ - unsigned rate; + uint32_t rate; }; static inline void bch_ratelimit_reset(struct bch_ratelimit *d) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8deb721c355e9..897d28050656b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -52,7 +52,8 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t error = dirty - target; int64_t proportional_scaled = div_s64(error, dc->writeback_rate_p_term_inverse); - int64_t integral_scaled, new_rate; + int64_t integral_scaled; + uint32_t new_rate; if ((error < 0 && dc->writeback_rate_integral > 0) || (error > 0 && time_before64(local_clock(), @@ -74,8 +75,8 @@ static void __update_writeback_rate(struct cached_dev *dc) integral_scaled = div_s64(dc->writeback_rate_integral, dc->writeback_rate_i_term_inverse); - new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled), - dc->writeback_rate_minimum, NSEC_PER_MSEC); + new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_SEC); dc->writeback_rate_proportional = proportional_scaled; dc->writeback_rate_integral_scaled = integral_scaled; -- cgit v1.2.3 From a8500fc816b19795756d27c762daa5e19f5e1b6f Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:39 -0700 Subject: bcache: rearrange writeback main thread ratelimit The time spent searching for things to write back "counts" for the actual rate achieved, so don't flush the accumulated rate with each chunk. This will maintain better fidelity to user-commanded rates, but it may slightly increase the burstiness of writeback. The writeback lock needs improvement to help mitigate this. 
Signed-off-by: Michael Lyle Reviewed-by: Kent Overstreet Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 897d28050656b..9b770b13bdf62 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -440,6 +440,8 @@ static int bch_writeback_thread(void *arg) struct cached_dev *dc = arg; bool searched_full_index; + bch_ratelimit_reset(&dc->writeback_rate); + while (!kthread_should_stop()) { down_write(&dc->writeback_lock); if (!atomic_read(&dc->has_dirty) || @@ -467,7 +469,6 @@ static int bch_writeback_thread(void *arg) up_write(&dc->writeback_lock); - bch_ratelimit_reset(&dc->writeback_rate); read_dirty(dc); if (searched_full_index) { @@ -477,6 +478,8 @@ static int bch_writeback_thread(void *arg) !kthread_should_stop() && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); + + bch_ratelimit_reset(&dc->writeback_rate); } } -- cgit v1.2.3 From 6446c684f9418d0175c9c3e5134e7744fe79181a Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Fri, 13 Oct 2017 16:35:40 -0700 Subject: bcache: safeguard a dangerous addressing in closure_queue The use of the union reduces the size of closure struct by taking advantage of the current size of its members. The offset of func in work_struct equals the size of the first three members, so that work.work_func will just reference the forth member - fn. This is smart but dangerous. It can be broken if work_struct or the other structs get changed, and can be a bit difficult to debug. Signed-off-by: Liang Chen Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/closure.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 295b7e43f92c2..00fb314cce57c 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -251,6 +251,12 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, static inline void closure_queue(struct closure *cl) { struct workqueue_struct *wq = cl->wq; + /** + * Changes made to closure, work_struct, or a couple of other structs + * may cause work.func not pointing to the right location. + */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); if (wq) { INIT_WORK(&cl->work, cl->work.func); BUG_ON(!queue_work(wq, &cl->work)); -- cgit v1.2.3 From 77c77a98f8a4dd4d33810e25a8780f65d6fa00e9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Oct 2017 16:35:41 -0700 Subject: bcache: Add Michael Lyle to MAINTAINERS Signed-off-by: Kent Overstreet Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6671f375f7fcd..4654cf5004990 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2562,6 +2562,7 @@ S: Maintained F: drivers/net/hamradio/baycom* BCACHE (BLOCK LAYER CACHE) +M: Michael Lyle M: Kent Overstreet L: linux-bcache@vger.kernel.org W: http://bcache.evilpiepirate.org -- cgit v1.2.3 From 52b69ff54aa0d163253dbd7ff80aaecf179dfbe3 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:42 -0700 Subject: bcache: MAINTAINERS: set bcache to MAINTAINED Also add URL for IRC channel. 
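The aliasing being guarded is easier to see in a simplified sketch (the placeholder members p1/p2 stand in for the real intermediate fields of struct closure; only wq, fn and work matter here):

struct closure {                                /* simplified sketch */
        union {
                struct {
                        struct workqueue_struct *wq;    /* 1st */
                        void                    *p1;    /* 2nd (placeholder) */
                        void                    *p2;    /* 3rd (placeholder) */
                        closure_fn              *fn;    /* 4th */
                };
                struct work_struct              work;
        };
};

/*
 * work_struct is laid out as { atomic_long_t data; struct list_head entry;
 * work_func_t func; }, so offsetof(struct work_struct, func) happens to
 * equal the combined size of the three members that precede fn.  That is
 * why INIT_WORK(&cl->work, cl->work.func) reads fn through the union --
 * and why it silently breaks if either struct changes, which is exactly
 * what the BUILD_BUG_ON() added below catches at compile time.
 */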
Signed-off-by: Michael Lyle Signed-off-by: Jens Axboe --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4654cf5004990..c0bbcf1c81884 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2566,7 +2566,8 @@ M: Michael Lyle M: Kent Overstreet L: linux-bcache@vger.kernel.org W: http://bcache.evilpiepirate.org -S: Orphan +C: irc://irc.oftc.net/bcache +S: Maintained F: drivers/md/bcache/ BDISP ST MEDIA DRIVER -- cgit v1.2.3 From 9ce762e85bc95fd7aa1fb5f8cfc38ce5a228fc95 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Mon, 16 Oct 2017 10:34:47 -0700 Subject: bcache: writeback rate clamping: make 32 bit safe Sorry this got through to linux-block, was detected by the kbuilds test robot. NSEC_PER_SEC is a long constant; 2.5 * 10^9 doesn't fit in a signed long constant. Fixes: e41166c5c44e ("bcache: writeback rate shouldn't artifically clamp") Reviewed-by: Coly Li Signed-off-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 4dbe37e828778..e548b8b513227 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -238,8 +238,8 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) * don't let us sleep more than 2.5 seconds (so we can notice/respond * if the control system tells us to speed up!). */ - if (time_before64(now + NSEC_PER_SEC * 5 / 2, d->next)) - d->next = now + NSEC_PER_SEC * 5 / 2; + if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next)) + d->next = now + NSEC_PER_SEC * 5LLU / 2LLU; if (time_after64(now - NSEC_PER_SEC * 2, d->next)) d->next = now - NSEC_PER_SEC * 2; -- cgit v1.2.3 From 519c8e9ffd86143fedd84cf833a09f36b47d0f5c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Oct 2017 11:01:00 -0700 Subject: block: fix Sphinx kernel-doc warning Sphinx treats symbols that end with '_' as a kind of special documentation indicator, so fix that by adding an ending '*' to it. ../block/bio.c:404: ERROR: Unknown target name: "gfp". Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index bf0dbe8f78f8e..ae9ad34e6a71f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -400,7 +400,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs) /** * bio_alloc_bioset - allocate a bio for I/O - * @gfp_mask: the GFP_ mask given to the slab allocator + * @gfp_mask: the GFP_* mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * -- cgit v1.2.3 From 30c516d750396c5f3ec9cb04c9e025c25e91495e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 17 Oct 2017 12:11:46 +0000 Subject: nullb: fix error return code in null_init() Fix to return error code -ENOMEM from the null_alloc_dev() error handling case instead of 0, as done elsewhere in this function. 
Fixes: 2984c8684f96 ("nullb: factor disk parameters") Signed-off-by: Wei Yongjun Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index bf2c8ca3242aa..50c83c4b2ea02 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -1991,8 +1991,10 @@ static int __init null_init(void) for (i = 0; i < nr_devices; i++) { dev = null_alloc_dev(); - if (!dev) + if (!dev) { + ret = -ENOMEM; goto err_dev; + } ret = null_add_dev(dev); if (ret) { null_free_dev(dev); -- cgit v1.2.3 From 8cf466602028196b939255f1eb4e9817efd1db6d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 11 Oct 2017 10:39:15 -0700 Subject: kyber: fix hang on domain token wait queue When we're getting a domain token, if we fail to get a token on our first attempt, we put the current hardware queue on a wait queue and then try again just in case a token was freed after our initial attempt but before we got on the wait queue. If this second attempt succeeds, we currently leave the hardware queue on the wait queue. Usually this is okay; we'll just run the hardware queue one extra time when another token is freed. However, if the hardware queue doesn't have any other requests waiting, then when it it gets the extra wakeup, it won't have anything to free and therefore won't wake up any other hardware queues. If tokens are limited, then we won't make forward progress and the device will hang. Reported-by: Bin Zha Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index f58cab82105ba..db5bfc6342d34 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -541,9 +541,17 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, /* * Try again in case a token was freed before we got on the wait - * queue. + * queue. The waker may have already removed the entry from the + * wait queue, but list_del_init() is okay with that. */ nr = __sbitmap_queue_get(domain_tokens); + if (nr >= 0) { + unsigned long flags; + + spin_lock_irqsave(&ws->wait.lock, flags); + list_del_init(&wait->entry); + spin_unlock_irqrestore(&ws->wait.lock, flags); + } } return nr; } -- cgit v1.2.3 From 149e10f8ff71439014dff97987e90e87e2684a16 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:06 +0300 Subject: block: introduce blk_mq_tagset_iter Iterator helper to apply a function on all the tags in a given tagset. export it as it will be used outside the block layer later on. 
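A usage sketch for a caller outside the block layer (my_dev, my_reinit_one() and my_dev_reset_rq_state() are hypothetical names; only blk_mq_tagset_iter() itself comes from this patch):

static int my_reinit_one(void *data, struct request *rq)
{
        struct my_dev *dev = data;

        /* Re-initialize whatever per-request state the driver keeps. */
        return my_dev_reset_rq_state(dev, rq);  /* 0 on success */
}

static int my_dev_reinit_requests(struct my_dev *dev)
{
        /*
         * Walks every pre-allocated (static) request in every hw queue of
         * the tag set; iteration stops at the first non-zero return.
         */
        return blk_mq_tagset_iter(&dev->tag_set, dev, my_reinit_one);
}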
Reviewed-by: Bart Van Assche Reviewed-by: Jens Axboe Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- block/blk-mq-tag.c | 16 +++++++++++----- include/linux/blk-mq.h | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 6714507aa6c75..bce1c76fc768d 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -298,12 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (fn)(void *, struct request *)) { int i, j, ret = 0; - if (WARN_ON_ONCE(!reinit_request)) + if (WARN_ON_ONCE(!fn)) goto out; for (i = 0; i < set->nr_hw_queues; i++) { @@ -316,8 +316,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, if (!tags->static_rqs[j]) continue; - ret = reinit_request(set->driver_data, - tags->static_rqs[j]); + ret = fn(data, tags->static_rqs[j]); if (ret) goto out; } @@ -326,6 +325,13 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, out: return ret; } +EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); + +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)) +{ + return blk_mq_tagset_iter(set, set->driver_data, reinit_request); +} EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f5..6bc29f73a9aa4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -259,6 +259,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (reinit_request)(void *, struct request *)); int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, int (reinit_request)(void *, struct request *)); -- cgit v1.2.3 From 31b8446079757575e576b0516f0e4c0fcdfbd3dd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:07 +0300 Subject: nvme: introduce nvme_reinit_tagset Move blk_mq_reinit_tagset from blk-mq to nvme core as the only user of it. Current transports that use it (rdma, fc) simply implement .reinit_request op. This patch does not change any functionality. 
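Schematically (illustrative fragment, not part of the patch; the my_transport_* names are made up), a transport now publishes the callback in its ctrl_ops and lets the core helper decide whether there is anything to do -- nvme_reinit_tagset() is a no-op for transports that do not set .reinit_request:

static int my_transport_reinit_request(void *data, struct request *rq)
{
        /* re-arm per-request transport resources (MRs, mappings, ...) */
        return 0;
}

static const struct nvme_ctrl_ops my_transport_ctrl_ops = {
        /* ... */
        .reinit_request = my_transport_reinit_request,
};

        /* on reset/reconnect, instead of calling blk_mq_reinit_tagset() directly: */
        ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);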
Reviewed-by: Jens Axboe Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 10 ++++++++++ drivers/nvme/host/fc.c | 3 ++- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/rdma.c | 7 +++---- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 573cc3b59bfad..c95155696741b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2934,6 +2934,16 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); +int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set) +{ + if (!ctrl->ops->reinit_request) + return 0; + + return blk_mq_tagset_iter(set, set->driver_data, + ctrl->ops->reinit_request); +} +EXPORT_SYMBOL_GPL(nvme_reinit_tagset); + int __init nvme_core_init(void) { int result; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b8e0822127c1a..372a0685b12a6 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2393,7 +2393,7 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); - ret = blk_mq_reinit_tagset(&ctrl->tag_set, nvme_fc_reinit_request); + ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); if (ret) goto out_free_io_queues; @@ -2753,6 +2753,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_del_nvme_ctrl, .get_address = nvmf_get_address, + .reinit_request = nvme_fc_reinit_request, }; static void diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index df787f39f4c1e..cb9d93048f3db 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -237,6 +237,7 @@ struct nvme_ctrl_ops { void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); int (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); + int (*reinit_request)(void *data, struct request *rq); }; static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) @@ -311,6 +312,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); void nvme_start_freeze(struct nvme_ctrl *ctrl); +int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 92a03ff5fb4d4..039f38cb6987c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -774,8 +774,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, goto out_free_tagset; } } else { - error = blk_mq_reinit_tagset(&ctrl->admin_tag_set, - nvme_rdma_reinit_request); + error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); if (error) goto out_free_queue; } @@ -855,8 +854,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_free_tag_set; } } else { - ret = blk_mq_reinit_tagset(&ctrl->tag_set, - nvme_rdma_reinit_request); + ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); if (ret) goto out_free_io_queues; @@ -1845,6 +1843,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = nvme_rdma_del_ctrl, .get_address = nvmf_get_address, + .reinit_request = nvme_rdma_reinit_request, }; static struct nvme_ctrl 
*nvme_rdma_create_ctrl(struct device *dev, -- cgit v1.2.3 From dab7487bdf63c6f2d70aac604494d023b189a9fd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:08 +0300 Subject: block: remove blk_mq_reinit_tagset No callers left. Reviewed-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- block/blk-mq-tag.c | 7 ------- include/linux/blk-mq.h | 2 -- 2 files changed, 9 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index bce1c76fc768d..c81b40ecd3f11 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -327,13 +327,6 @@ out: } EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) -{ - return blk_mq_tagset_iter(set, set->driver_data, reinit_request); -} -EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); - void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6bc29f73a9aa4..cfd64e500d82c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -261,8 +261,6 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, int (reinit_request)(void *, struct request *)); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.3 From 60070c78ef7b624680ffdde30bca55e515504585 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:06 +0300 Subject: nvme-rdma: pass tagset to directly nvme_rdma_free_tagset Instead of flagging admin/io. Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 039f38cb6987c..93c8578a2ddc2 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -670,11 +670,10 @@ out_free_queues: return ret; } -static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, bool admin) +static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, + struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - struct blk_mq_tag_set *set = admin ? 
- &ctrl->admin_tag_set : &ctrl->tag_set; blk_mq_free_tag_set(set); nvme_rdma_dev_put(ctrl->device); @@ -744,7 +743,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, nvme_rdma_stop_queue(&ctrl->queues[0]); if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); - nvme_rdma_free_tagset(&ctrl->ctrl, true); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); } nvme_rdma_free_queue(&ctrl->queues[0]); } @@ -818,7 +817,7 @@ out_cleanup_queue: blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_tagset: if (new) - nvme_rdma_free_tagset(&ctrl->ctrl, true); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; @@ -830,7 +829,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, nvme_rdma_stop_io_queues(ctrl); if (remove) { blk_cleanup_queue(ctrl->ctrl.connect_q); - nvme_rdma_free_tagset(&ctrl->ctrl, false); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); } nvme_rdma_free_io_queues(ctrl); } @@ -873,7 +872,7 @@ out_cleanup_connect_q: blk_cleanup_queue(ctrl->ctrl.connect_q); out_free_tag_set: if (new) - nvme_rdma_free_tagset(&ctrl->ctrl, false); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); out_free_io_queues: nvme_rdma_free_io_queues(ctrl); return ret; -- cgit v1.2.3 From d8bfceebc41762f7dde960996a24180f9a67cd51 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:07 +0300 Subject: nvme-rdma: fix wrong logging message Not necessarily address resolution failed. Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 93c8578a2ddc2..02b3388af308c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -544,7 +544,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, ret = nvme_rdma_wait_for_cm(queue); if (ret) { dev_info(ctrl->ctrl.device, - "rdma_resolve_addr wait failed (%d).\n", ret); + "rdma connection establishment failed (%d)\n", ret); goto out_destroy_cm_id; } -- cgit v1.2.3 From 0c5b43b9c1fafc5af37915dde6670aaf1224e8cc Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:08 +0300 Subject: nvme-rdma: move assignment to declaration No need for the extra line for trivial assignments. Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 02b3388af308c..221ace23aaa5e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -434,11 +434,9 @@ out_err: static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) { - struct nvme_rdma_device *dev; - struct ib_device *ibdev; + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; - dev = queue->device; - ibdev = dev->dev; rdma_destroy_qp(queue->cm_id); ib_free_cq(queue->ib_cq); -- cgit v1.2.3 From 0fc176dfdafc78e66bd74f54e487ca1fc59d01bf Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:09 +0300 Subject: nvme-rdma: Check that reinit_request got a proper mr Warn if req->mr is NULL as it should never happen. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 221ace23aaa5e..7e61e6447d2cc 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -274,6 +274,9 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq) struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int ret = 0; + if (WARN_ON_ONCE(!req->mr)) + return 0; + ib_dereg_mr(req->mr); req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, -- cgit v1.2.3 From 5e1fe61d4170b1f498e20de92c7ce4cd5e40c3c5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:11 +0300 Subject: nvme-rdma: teardown admin/io queues once on error recovery Relying on the queue state while tearing down on every reconnect attempt is not a good design. We should do it once in err_work and simply try to establish the queues for each reconnect attempt. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 7e61e6447d2cc..95837c5317b41 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -925,10 +925,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ++ctrl->ctrl.nr_reconnects; - if (ctrl->ctrl.queue_count > 1) - nvme_rdma_destroy_io_queues(ctrl, false); - - nvme_rdma_destroy_admin_queue(ctrl, false); ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) goto requeue; @@ -936,7 +932,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) - goto requeue; + goto destroy_admin; } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -946,14 +942,17 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) return; } - ctrl->ctrl.nr_reconnects = 0; - nvme_start_ctrl(&ctrl->ctrl); - dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); + dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", + ctrl->ctrl.nr_reconnects); + + ctrl->ctrl.nr_reconnects = 0; return; +destroy_admin: + nvme_rdma_destroy_admin_queue(ctrl, false); requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", ctrl->ctrl.nr_reconnects); @@ -969,17 +968,15 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); - nvme_rdma_stop_io_queues(ctrl); - } - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_stop_queue(&ctrl->queues[0]); - - /* We must take care of fastfail/requeue all our inflight requests */ - if (ctrl->ctrl.queue_count > 1) blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_io_queues(ctrl, false); + } + + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl, false); /* * queues are not a live anymore, so restart the queues to fail fast -- cgit v1.2.3 From 60a518863368c7e124cd8411c2481d0938f53e08 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:10 +0300 Subject: nvme-rdma: Don't local invalidate if the queue is not live No chance for the local invalidate to succeed if the queue-pair is in error state. 
Most likely the target will do a remote invalidation of our mr so not a big loss on the test_bit. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 95837c5317b41..a58dd0c77a295 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1052,7 +1052,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, if (!blk_rq_bytes(rq)) return; - if (req->mr->need_inval) { + if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) { res = nvme_rdma_inv_rkey(queue, req); if (unlikely(res < 0)) { dev_err(ctrl->ctrl.device, -- cgit v1.2.3 From 5013e98b5e8db50144e8f1ca5a96aed95d4d48a0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:12 +0300 Subject: nvme-rdma: change queue flag semantics DELETING -> ALLOCATED Instead of marking we are deleting, mark we are allocated and check that instead. This makes the logic symmetrical to connected mark check. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a58dd0c77a295..ff67375982fe8 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -79,8 +79,8 @@ struct nvme_rdma_request { }; enum nvme_rdma_queue_flags { - NVME_RDMA_Q_LIVE = 0, - NVME_RDMA_Q_DELETING = 1, + NVME_RDMA_Q_ALLOCATED = 0, + NVME_RDMA_Q_LIVE = 1, }; struct nvme_rdma_queue { @@ -549,7 +549,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, goto out_destroy_cm_id; } - clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); + set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags); return 0; @@ -569,7 +569,7 @@ static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) { - if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) + if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) return; nvme_rdma_destroy_queue_ib(queue); -- cgit v1.2.3 From 0ad0bfa29822904b9186a1c04dbfb3014bc5dc8a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:49:51 +0300 Subject: nvme-rdma: stop controller reset if the controller is deleting If the controller is deleting (in case the user decided to delete it), we have no point to continue reset sequence. Signed-off-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ff67375982fe8..34d22ff42afba 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1818,7 +1818,11 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!changed) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + return; + } nvme_start_ctrl(&ctrl->ctrl); -- cgit v1.2.3 From 16772ae6d9221b065607335a9b279375d8d3e2fd Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 18 Oct 2017 22:56:09 +0900 Subject: nvme-pci: fix typos in comments fixed comment typos in adapter_alloc_cq() and adapter_alloc_sq(). 'the the' duplications are replaced with 'that the'. 
Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cb73bc8cad3bd..bafdc2ab5be3c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -930,7 +930,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; /* - * Note: we (ab)use the fact the the prp fields survive if no data + * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request. */ memset(&c, 0, sizeof(c)); @@ -951,7 +951,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, int flags = NVME_QUEUE_PHYS_CONTIG; /* - * Note: we (ab)use the fact the the prp fields survive if no data + * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request. */ memset(&c, 0, sizeof(c)); -- cgit v1.2.3 From 94f29d4f7808e3a495fda8c5d337fc4ac4037ce7 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 18 Oct 2017 12:38:24 +0000 Subject: nvme-rdma: Add BLK_MQ_F_NO_SCHED flag to admin tag set Since commit b86dd81 "block: get rid of blk-mq default scheduler choice Kconfig entries", when setting nr_hw_queues to 1 the admin tag set uses mq-deadline scheduler. This flag is useful for admin queues that aren't used for normal IO. Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 34d22ff42afba..a5577e06a6204 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -699,6 +699,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->driver_data = ctrl; set->nr_hw_queues = 1; set->timeout = ADMIN_TIMEOUT; + set->flags = BLK_MQ_F_NO_SCHED; } else { set = &ctrl->tag_set; memset(set, 0, sizeof(*set)); -- cgit v1.2.3 From 5a22e2bf44fb311e5a6bfec6ed5680199d6e162a Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 18 Oct 2017 12:38:25 +0000 Subject: nvme-fc: Add BLK_MQ_F_NO_SCHED flag to admin tag set Since commit b86dd81 "block: get rid of blk-mq default scheduler choice Kconfig entries", when setting nr_hw_queues to 1 the admin tag set uses mq-deadline scheduler. This flag is useful for admin queues that aren't used for normal IO. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 372a0685b12a6..0d0e898912add 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2853,6 +2853,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (ret) -- cgit v1.2.3 From 86f36b9c6c6cda28c86ead09323f55018e5e64df Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 18 Oct 2017 12:38:26 +0000 Subject: nvme-loop: Add BLK_MQ_F_NO_SCHED flag to admin tag set This flag is useful for admin queues that aren't used for normal IO. 
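The pattern is identical in all three transports; as a fragment for illustration only, the flag is set on the admin tag set before blk_mq_alloc_tag_set(), so the single admin hardware queue never gets a default I/O scheduler attached:

        ctrl->admin_tag_set.nr_hw_queues = 1;
        ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
        ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED;  /* keep mq-deadline off the admin queue */
        error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);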
Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 92628c4329262..c56354e1e4c61 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -365,6 +365,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ctrl->queues[0].ctrl = ctrl; error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); -- cgit v1.2.3 From ba2dec35e431e62e11116b8036dc372493169b39 Mon Sep 17 00:00:00 2001 From: Roy Shterman Date: Wed, 18 Oct 2017 13:46:07 +0300 Subject: nvmet: Change max_nsid in subsystem due to ns_disable if needed In case we disable namespaces which has the nsid like subsystem max_nsid we need to search for the next largest nsid in this subsystem. If the subsystem don't has more namespaces we set it to 0, else we take nsid from the last namespace in namespaces list because the list is sorted while inserting. Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Roy Shterman Reviewed-by: Johannes Thumshirn [hch: slight refactor] Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 1b208beeef509..da088293eafc1 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -57,6 +57,17 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) return 0; } +static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys) +{ + struct nvmet_ns *ns; + + if (list_empty(&subsys->namespaces)) + return 0; + + ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link); + return ns->nsid; +} + static u32 nvmet_async_event_result(struct nvmet_async_event *aen) { return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); @@ -334,6 +345,8 @@ void nvmet_ns_disable(struct nvmet_ns *ns) ns->enabled = false; list_del_rcu(&ns->dev_link); + if (ns->nsid == subsys->max_nsid) + subsys->max_nsid = nvmet_max_nsid(subsys); mutex_unlock(&subsys->lock); /* -- cgit v1.2.3 From 9843f685ae365d0c628e6a0d929772bca7274311 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:10:01 +0200 Subject: nvme: use ida_simple_{get,remove} for the controller instance Switch to the ida_simple_* helpers instead of opencoding them. 
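As a minimal sketch of the resulting pattern (the helpers take their own lock and retry -EAGAIN internally; an end argument of 0 means no upper bound):

        ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
        if (ret < 0)
                goto out;               /* real errno (-ENOMEM/-ENOSPC) instead of a blanket -ENODEV */
        ctrl->instance = ret;           /* the allocated id is the return value */

        /* ... and on teardown: */
        ida_simple_remove(&nvme_instance_ida, ctrl->instance);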
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 40 +++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c95155696741b..7fae42d595d5d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -74,6 +74,8 @@ EXPORT_SYMBOL_GPL(nvme_wq); static LIST_HEAD(nvme_ctrl_list); static DEFINE_SPINLOCK(dev_list_lock); +static DEFINE_IDA(nvme_instance_ida); + static struct class *nvme_class; static __le32 nvme_get_log_dw10(u8 lid, size_t size) @@ -2696,35 +2698,6 @@ void nvme_queue_async_events(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_queue_async_events); -static DEFINE_IDA(nvme_instance_ida); - -static int nvme_set_instance(struct nvme_ctrl *ctrl) -{ - int instance, error; - - do { - if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) - return -ENODEV; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_instance_ida, &instance); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - return -ENODEV; - - ctrl->instance = instance; - return 0; -} - -static void nvme_release_instance(struct nvme_ctrl *ctrl) -{ - spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, ctrl->instance); - spin_unlock(&dev_list_lock); -} - void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_stop_keep_alive(ctrl); @@ -2762,7 +2735,7 @@ static void nvme_free_ctrl(struct kref *kref) struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); put_device(ctrl->device); - nvme_release_instance(ctrl); + ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); ctrl->ops->free_ctrl(ctrl); @@ -2796,9 +2769,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); - ret = nvme_set_instance(ctrl); - if (ret) + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); + if (ret < 0) goto out; + ctrl->instance = ret; ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, MKDEV(nvme_char_major, ctrl->instance), @@ -2825,7 +2799,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, return 0; out_release_instance: - nvme_release_instance(ctrl); + ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: return ret; } -- cgit v1.2.3 From a7a7cbe353a52665b8463e1822ce6ba46b0609d6 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 16 Oct 2017 18:24:20 -0700 Subject: nvme-pci: add SGL support This adds SGL support for NVMe PCIe driver, based on an earlier patch from Rajiv Shanmugam Madeswaran . This patch refactors the original code and adds new module parameter sgl_threshold to determine whether to use SGL or PRP for IOs. The usage of SGLs is controlled by the sgl_threshold module parameter, which allows to conditionally use SGLs if average request segment size (avg_seg_size) is greater than sgl_threshold. In the original patch, the decision of using SGLs was dependent only on the IO size, with the new approach we consider not only IO size but also the number of physical segments present in the IO. We calculate avg_seg_size based on request payload bytes and number of physical segments present in the request. For e.g.:- 1. blk_rq_nr_phys_segments = 2 blk_rq_payload_bytes = 8k avg_seg_size = 4K use sgl if avg_seg_size >= sgl_threshold. 2. 
blk_rq_nr_phys_segments = 2 blk_rq_payload_bytes = 64k avg_seg_size = 32K use sgl if avg_seg_size >= sgl_threshold. 3. blk_rq_nr_phys_segments = 16 blk_rq_payload_bytes = 64k avg_seg_size = 4K use sgl if avg_seg_size >= sgl_threshold. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 214 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 187 insertions(+), 27 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bafdc2ab5be3c..11e6fd9d0ba49 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -45,6 +45,8 @@ */ #define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) +#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -57,6 +59,12 @@ module_param(max_host_mem_size_mb, uint, 0444); MODULE_PARM_DESC(max_host_mem_size_mb, "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); +static unsigned int sgl_threshold = SZ_32K; +module_param(sgl_threshold, uint, 0644); +MODULE_PARM_DESC(sgl_threshold, + "Use SGLs when average request segment size is larger or equal to " + "this size. Use 0 to disable SGLs."); + static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, @@ -178,6 +186,7 @@ struct nvme_queue { struct nvme_iod { struct nvme_request req; struct nvme_queue *nvmeq; + bool use_sgl; int aborted; int npages; /* In the PRP list. 0 means small pool in use */ int nents; /* Used in scatterlist */ @@ -331,17 +340,35 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev) return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } -static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev, - unsigned int size, unsigned int nseg) +/* + * Calculates the number of pages needed for the SGL segments. For example a 4k + * page can accommodate 256 SGL descriptors. 
+ */ +static int nvme_pci_npages_sgl(unsigned int num_seg) +{ + return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); +} + +static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, + unsigned int size, unsigned int nseg, bool use_sgl) { - return sizeof(__le64 *) * nvme_npages(size, dev) + - sizeof(struct scatterlist) * nseg; + size_t alloc_size; + + if (use_sgl) + alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); + else + alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); + + return alloc_size + sizeof(struct scatterlist) * nseg; } -static unsigned int nvme_cmd_size(struct nvme_dev *dev) +static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl) { - return sizeof(struct nvme_iod) + - nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES); + unsigned int alloc_size = nvme_pci_iod_alloc_size(dev, + NVME_INT_BYTES(dev), NVME_INT_PAGES, + use_sgl); + + return sizeof(struct nvme_iod) + alloc_size; } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -425,10 +452,10 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, nvmeq->sq_tail = tail; } -static __le64 **iod_list(struct request *req) +static void **nvme_pci_iod_list(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); + return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); } static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) @@ -438,7 +465,10 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) unsigned int size = blk_rq_payload_bytes(rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { - iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); + size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg, + iod->use_sgl); + + iod->sg = kmalloc(alloc_size, GFP_ATOMIC); if (!iod->sg) return BLK_STS_RESOURCE; } else { @@ -456,18 +486,31 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) static void nvme_free_iod(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - const int last_prp = dev->ctrl.page_size / 8 - 1; + const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; + dma_addr_t dma_addr = iod->first_dma, next_dma_addr; + int i; - __le64 **list = iod_list(req); - dma_addr_t prp_dma = iod->first_dma; if (iod->npages == 0) - dma_pool_free(dev->prp_small_pool, list[0], prp_dma); + dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], + dma_addr); + for (i = 0; i < iod->npages; i++) { - __le64 *prp_list = list[i]; - dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); - prp_dma = next_prp_dma; + void *addr = nvme_pci_iod_list(req)[i]; + + if (iod->use_sgl) { + struct nvme_sgl_desc *sg_list = addr; + + next_dma_addr = + le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr); + } else { + __le64 *prp_list = addr; + + next_dma_addr = le64_to_cpu(prp_list[last_prp]); + } + + dma_pool_free(dev->prp_page_pool, addr, dma_addr); + dma_addr = next_dma_addr; } if (iod->sg != iod->inline_sg) @@ -555,7 +598,8 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents) } } -static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) +static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct dma_pool *pool; @@ -566,14 +610,16 @@ static 
blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) u32 page_size = dev->ctrl.page_size; int offset = dma_addr & (page_size - 1); __le64 *prp_list; - __le64 **list = iod_list(req); + void **list = nvme_pci_iod_list(req); dma_addr_t prp_dma; int nprps, i; + iod->use_sgl = false; + length -= (page_size - offset); if (length <= 0) { iod->first_dma = 0; - return BLK_STS_OK; + goto done; } dma_len -= (page_size - offset); @@ -587,7 +633,7 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) if (length <= page_size) { iod->first_dma = dma_addr; - return BLK_STS_OK; + goto done; } nprps = DIV_ROUND_UP(length, page_size); @@ -634,6 +680,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) dma_len = sg_dma_len(sg); } +done: + cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); + return BLK_STS_OK; bad_sgl: @@ -643,6 +693,110 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) return BLK_STS_IOERR; } +static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, + struct scatterlist *sg) +{ + sge->addr = cpu_to_le64(sg_dma_address(sg)); + sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->type = NVME_SGL_FMT_DATA_DESC << 4; +} + +static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, + dma_addr_t dma_addr, int entries) +{ + sge->addr = cpu_to_le64(dma_addr); + if (entries < SGES_PER_PAGE) { + sge->length = cpu_to_le32(entries * sizeof(*sge)); + sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; + } else { + sge->length = cpu_to_le32(PAGE_SIZE); + sge->type = NVME_SGL_FMT_SEG_DESC << 4; + } +} + +static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + int length = blk_rq_payload_bytes(req); + struct dma_pool *pool; + struct nvme_sgl_desc *sg_list; + struct scatterlist *sg = iod->sg; + int entries = iod->nents, i = 0; + dma_addr_t sgl_dma; + + iod->use_sgl = true; + + /* setting the transfer type as SGL */ + cmd->flags = NVME_CMD_SGL_METABUF; + + if (length == sg_dma_len(sg)) { + nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); + return BLK_STS_OK; + } + + if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) { + iod->npages = -1; + return BLK_STS_RESOURCE; + } + + nvme_pci_iod_list(req)[0] = sg_list; + iod->first_dma = sgl_dma; + + nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); + + do { + if (i == SGES_PER_PAGE) { + struct nvme_sgl_desc *old_sg_desc = sg_list; + struct nvme_sgl_desc *link = &old_sg_desc[i - 1]; + + sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) + return BLK_STS_RESOURCE; + + i = 0; + nvme_pci_iod_list(req)[iod->npages++] = sg_list; + sg_list[i++] = *link; + nvme_pci_sgl_set_seg(link, sgl_dma, entries); + } + + nvme_pci_sgl_set_data(&sg_list[i++], sg); + + length -= sg_dma_len(sg); + sg = sg_next(sg); + entries--; + } while (length > 0); + + WARN_ON(entries > 0); + return BLK_STS_OK; +} + +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + unsigned int avg_seg_size; + + avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), + blk_rq_nr_phys_segments(req)); + + if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) + return false; + if 
(!iod->nvmeq->qid) + return false; + if (!sgl_threshold || avg_seg_size < sgl_threshold) + return false; + return true; +} + static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { @@ -662,7 +816,11 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, DMA_ATTR_NO_WARN)) goto out; - ret = nvme_setup_prps(dev, req); + if (nvme_pci_use_sgls(dev, req)) + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); + else + ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + if (ret != BLK_STS_OK) goto out_unmap; @@ -682,8 +840,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out_unmap; } - cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma); if (blk_integrity_rq(req)) cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); return BLK_STS_OK; @@ -1379,7 +1535,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); - dev->admin_tagset.cmd_size = nvme_cmd_size(dev); + dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; dev->admin_tagset.driver_data = dev; @@ -1906,7 +2062,11 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.numa_node = dev_to_node(dev->dev); dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = nvme_cmd_size(dev); + dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false); + if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) { + dev->tagset.cmd_size = max(dev->tagset.cmd_size, + nvme_pci_cmd_size(dev, true)); + } dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; -- cgit v1.2.3 From 0a02e39fd1eb2ceb3dd0dc765cb2de4d09697a14 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 19 Oct 2017 16:11:38 -0700 Subject: nvme-fc: correct io termination handling The io completion handling for i/o's that are failing due to to a transport error or association termination had issues, causing io failures (DNR set so retries didn't kick in) or long stalls. Change the io completion handler for the following items: When an io has been completed due to a transport abort (based on an exchange error) or when marked as aborted as part of an association termination (FCOP_FLAGS_TERMIO), set the NVME completion status to NVME_SC_ABORTED. By default, do not set DNR on the status so that a retry can be attempted after association recreate. In cases where an io is failed (non-successful nvme status including aborted), if the controller is being deleted (blk_queue_dying) or the io was part of the ios used for association creation (ctrl state is NEW or RECONNECTING), then additionally set the DNR bit so the io will not be retried. If the failed io was part of association creation, the failure will tear down the partially completioned association and typically restart a new reconnect attempt (another create association later). Rearranged code flow to remove a largely unneeded local variable. 
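Condensed, the new completion policy is roughly the following (illustrative only; the real code carries the status as a left-shifted little-endian word):

        if (atomic_read(&op->state) == FCPOP_STATE_ABORTED ||
            (op->flags & FCOP_FLAGS_TERMIO))
                status = NVME_SC_ABORT_REQ;     /* no DNR by default: retriable after reconnect */

        /* force DNR only where a retry can never help */
        if (status &&
            (blk_queue_dying(rq->q) ||
             ctrl->ctrl.state == NVME_CTRL_NEW ||
             ctrl->ctrl.state == NVME_CTRL_RECONNECTING))
                status |= NVME_SC_DNR;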
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 0d0e898912add..e1a7e5cdfcf4e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1387,7 +1387,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) struct nvme_command *sqe = &op->cmd_iu.sqe; __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); union nvme_result result; - bool complete_rq, terminate_assoc = true; + bool terminate_assoc = true; /* * WARNING: @@ -1429,8 +1429,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, sizeof(op->rsp_iu), DMA_FROM_DEVICE); - if (atomic_read(&op->state) == FCPOP_STATE_ABORTED) - status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); + if (atomic_read(&op->state) == FCPOP_STATE_ABORTED || + op->flags & FCOP_FLAGS_TERMIO) + status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); else if (freq->status) status = cpu_to_le16(NVME_SC_INTERNAL << 1); @@ -1494,23 +1495,27 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) done: if (op->flags & FCOP_FLAGS_AEN) { nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); - complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); + __nvme_fc_fcpop_chk_teardowns(ctrl, op); atomic_set(&op->state, FCPOP_STATE_IDLE); op->flags = FCOP_FLAGS_AEN; /* clear other flags */ nvme_fc_ctrl_put(ctrl); goto check_error; } - complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); - if (!complete_rq) { - if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { - status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - if (blk_queue_dying(rq->q)) - status |= cpu_to_le16(NVME_SC_DNR << 1); - } - nvme_end_request(rq, status, result); - } else + /* + * Force failures of commands if we're killing the controller + * or have an error on a command used to create an new association + */ + if (status && + (blk_queue_dying(rq->q) || + ctrl->ctrl.state == NVME_CTRL_NEW || + ctrl->ctrl.state == NVME_CTRL_RECONNECTING)) + status |= cpu_to_le16(NVME_SC_DNR << 1); + + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op)) __nvme_fc_final_op_cleanup(rq); + else + nvme_end_request(rq, status, result); check_error: if (terminate_assoc) -- cgit v1.2.3 From 134aedc9c157d49069e9a98636b0a917678586ee Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 19 Oct 2017 16:11:39 -0700 Subject: nvme-fc: correct io timeout behavior The transport io timeout behavior wasn't quite correct. It ignored that the io error handler is supposed to be synchronous so it possibly allowed the blk request to be restarted while the io associated was still aborting. Timeouts on reserved commands, those used for association create, were never timing out thus they hung out forever. To correct: If an io is times out while a remoteport is not connected, just restart the io timer. The lack of connectivity will simultaneously be resetting the controller, so the reset path will abort and terminate the io. If an io is times out while it was marked for transport abort, just reset the io timer. The abort process is underway and will complete the io. Otherwise, if an io times out, abort the io. If the abort was unsuccessful (unlikely) give up and return not handled. If the abort was successful, as the abort process is underway it will terminate the io, so rather than synchronously waiting, just restart the io timer. 
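The resulting decision flow in nvme_fc_timeout() can be summarised as (sketch of the logic, not the literal hunk):

        if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
            atomic_read(&op->state) == FCPOP_STATE_ABORTED)
                return BLK_EH_RESET_TIMER;      /* reset/teardown in progress will reap the io */

        if (__nvme_fc_abort_op(ctrl, op))
                return BLK_EH_NOT_HANDLED;      /* io was no longer active to abort */

        nvme_fc_error_recovery(ctrl, "io timeout error");
        return BLK_EH_RESET_TIMER;              /* the abort completion will finish the io shortly */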
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e1a7e5cdfcf4e..c6c903f1b172a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1903,13 +1903,14 @@ nvme_fc_timeout(struct request *rq, bool reserved) struct nvme_fc_ctrl *ctrl = op->ctrl; int ret; - if (reserved) + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || + atomic_read(&op->state) == FCPOP_STATE_ABORTED) return BLK_EH_RESET_TIMER; ret = __nvme_fc_abort_op(ctrl, op); if (ret) - /* io wasn't active to abort consider it done */ - return BLK_EH_HANDLED; + /* io wasn't active to abort */ + return BLK_EH_NOT_HANDLED; /* * we can't individually ABTS an io without affecting the queue, @@ -1920,7 +1921,12 @@ nvme_fc_timeout(struct request *rq, bool reserved) */ nvme_fc_error_recovery(ctrl, "io timeout error"); - return BLK_EH_HANDLED; + /* + * the io abort has been initiated. Have the reset timer + * restarted and the abort completion will complete the io + * shortly. Avoids a synchronous wait while the abort finishes. + */ + return BLK_EH_RESET_TIMER; } static int -- cgit v1.2.3 From f87c89ad93c95857c4d4b19ca580e27994254b7f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 23 Oct 2017 12:59:27 +0300 Subject: nvme-rdma: align nvme_rdma_device structure Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a5577e06a6204..ff4635d317b5b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -50,8 +50,8 @@ (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) struct nvme_rdma_device { - struct ib_device *dev; - struct ib_pd *pd; + struct ib_device *dev; + struct ib_pd *pd; struct kref ref; struct list_head entry; }; -- cgit v1.2.3 From e62a538da2e7401bad5293d210ce00583ac12f39 Mon Sep 17 00:00:00 2001 From: Nitzan Carmi Date: Sun, 22 Oct 2017 09:37:04 +0000 Subject: nvme-rdma: Add debug message when reaches timeout Signed-off-by: Nitzan Carmi Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ff4635d317b5b..4552744eff454 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1578,6 +1578,10 @@ nvme_rdma_timeout(struct request *rq, bool reserved) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + dev_warn(req->queue->ctrl->ctrl.device, + "I/O %d QID %d timeout, reset controller\n", + rq->tag, nvme_rdma_queue_idx(req->queue)); + /* queue error recovery */ nvme_rdma_error_recovery(req->queue->ctrl); -- cgit v1.2.3 From 75bc5f06617fe1bc9a79ba9e3baccdcae3743404 Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 24 Oct 2017 15:56:13 +0200 Subject: lightnvm: pblk: remove leftover testing function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A previous patch inadvertently left an unused test function in the header, kill it. 
Fixes: 8bd400204bd5 ("lightnvm: pblk: cleanup unused and static functions") Signed-off-by: Javier González Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 6b64288de6f76..90961033a79fc 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -860,11 +860,6 @@ int pblk_rl_is_limit(struct pblk_rl *rl); int pblk_sysfs_init(struct gendisk *tdisk); void pblk_sysfs_exit(struct gendisk *tdisk); -static inline void test(size_t a) -{ - a += 1; -} - static inline void *pblk_malloc(size_t size, int type, gfp_t flags) { if (type == PBLK_KMALLOC_META) -- cgit v1.2.3 From 351499a172c0c5fc52d65ee2c62b344f369ea02a Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 24 Oct 2017 18:44:57 -0600 Subject: block: Invalidate cache on discard v2 It is reasonable drop page cache on discard, otherwise that pages may be written by writeback second later, so thin provision devices will not be happy. This seems to be a security leak in case of secure discard case. Also add check for queue_discard flag on early stage. Signed-off-by: Dmitry Monakhov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 0de02ee67eed8..c0fc32bd8ed17 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -202,10 +202,16 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, { uint64_t range[2]; uint64_t start, len; + struct request_queue *q = bdev_get_queue(bdev); + struct address_space *mapping = bdev->bd_inode->i_mapping; + if (!(mode & FMODE_WRITE)) return -EBADF; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; @@ -216,12 +222,12 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, return -EINVAL; if (len & 511) return -EINVAL; - start >>= 9; - len >>= 9; - if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + if (start + len > i_size_read(bdev->bd_inode)) return -EINVAL; - return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); + truncate_inode_pages_range(mapping, start, start + len); + return blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, flags); } static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, -- cgit v1.2.3 From bb749b31c25e9b11f8f974baac8d507298ffbb70 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 18 Oct 2017 14:38:38 +0200 Subject: block: move CAP_SYS_ADMIN check in blkdev_roset() Check for CAP_SYS_ADMIN before calling into the driver, similar to blkdev_flushbuf(). This is safer and can spare a check in the driver. (Currently BLKROSET is overridden by md and rbd, rbd is missing the check. md has the check, but it covers a lot more than BLKROSET.) 
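In short (fragment for illustration), the privilege check now precedes the pass-through to the driver:

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;                 /* checked before any driver sees BLKROSET */

        ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
        if (!is_unrecognized_ioctl(ret))
                return ret;                     /* a driver (e.g. md, rbd) handled it itself */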
Acked-by: Al Viro Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index c0fc32bd8ed17..1668506d8ed80 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -443,11 +443,12 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, { int ret, n; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); if (!is_unrecognized_ioctl(ret)) return ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (get_user(n, (int __user *)arg)) return -EFAULT; set_device_ro(bdev, n); -- cgit v1.2.3 From 425a4dba7953e35ffd096771973add6d2f40d2ed Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Oct 2017 15:59:09 +0200 Subject: block: factor out __blkdev_issue_zero_pages() blkdev_issue_zeroout() will use this in !BLKDEV_ZERO_NOFALLBACK case. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-lib.c | 63 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 62240f8832ca6..9d2ab8bba52ac 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -274,6 +274,40 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) return min(pages, (sector_t)BIO_MAX_PAGES); } +static int __blkdev_issue_zero_pages(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + int bi_size = 0; + unsigned int sz; + + if (!q) + return -ENXIO; + + while (nr_sects != 0) { + bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), + gfp_mask); + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + while (nr_sects != 0) { + sz = min((sector_t) PAGE_SIZE, nr_sects << 9); + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); + nr_sects -= bi_size >> 9; + sector += bi_size >> 9; + if (bi_size < sz) + break; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -304,9 +338,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, unsigned flags) { int ret; - int bi_size = 0; - struct bio *bio = *biop; - unsigned int sz; sector_t bs_mask; bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; @@ -316,30 +347,10 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, biop, flags); if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) - goto out; - - ret = 0; - while (nr_sects != 0) { - bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), - gfp_mask); - bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - while (nr_sects != 0) { - sz = min((sector_t) PAGE_SIZE, nr_sects << 9); - bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); - nr_sects -= bi_size >> 9; - sector += bi_size >> 9; - if (bi_size < sz) - break; - } - cond_resched(); - } + return ret; - *biop = bio; -out: - return ret; + return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, + biop); } EXPORT_SYMBOL(__blkdev_issue_zeroout); -- cgit v1.2.3 From d5ce4c31d6df518dd8f63bbae20d7423c5018a6c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Oct 2017 
15:59:10 +0200 Subject: block: cope with WRITE ZEROES failing in blkdev_issue_zeroout() sd_config_write_same() ignores ->max_ws_blocks == 0 and resets it to permit trying WRITE SAME on older SCSI devices, unless ->no_write_same is set. Because REQ_OP_WRITE_ZEROES is implemented in terms of WRITE SAME, blkdev_issue_zeroout() may fail with -EREMOTEIO: $ fallocate -zn -l 1k /dev/sdg fallocate: fallocate failed: Remote I/O error $ fallocate -zn -l 1k /dev/sdg # OK $ fallocate -zn -l 1k /dev/sdg # OK The following calls succeed because sd_done() sets ->no_write_same in response to a sense that would become BLK_STS_TARGET/-EREMOTEIO, causing __blkdev_issue_zeroout() to fall back to generating ZERO_PAGE bios. This means blkdev_issue_zeroout() must cope with WRITE ZEROES failing and fall back to manually zeroing, unless BLKDEV_ZERO_NOFALLBACK is specified. For BLKDEV_ZERO_NOFALLBACK case, return -EOPNOTSUPP if sd_done() has just set ->no_write_same thus indicating lack of offload support. Fixes: c20cfc27a473 ("block: stop using blkdev_issue_write_same for zeroing") Cc: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-lib.c | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 9d2ab8bba52ac..f625fda5f0955 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -321,12 +321,6 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, * Zero-fill a block range, either using hardware offload or by explicitly * writing zeroes to the device. * - * Note that this function may fail with -EOPNOTSUPP if the driver signals - * zeroing offload support, but the device fails to process the command (for - * some devices there is no non-destructive way to verify whether this - * operation is actually supported). In this case the caller should call - * retry the call to blkdev_issue_zeroout() and the fallback path will be used. - * * If a device is using logical block provisioning, the underlying space will * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. * @@ -370,18 +364,49 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { - int ret; - struct bio *bio = NULL; + int ret = 0; + sector_t bs_mask; + struct bio *bio; struct blk_plug plug; + bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + +retry: + bio = NULL; blk_start_plug(&plug); - ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, - &bio, flags); + if (try_write_zeroes) { + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, &bio, flags); + } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, + gfp_mask, &bio); + } else { + /* No zeroing offload support */ + ret = -EOPNOTSUPP; + } if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); + if (ret && try_write_zeroes) { + if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + try_write_zeroes = false; + goto retry; + } + if (!bdev_write_zeroes_sectors(bdev)) { + /* + * Zeroing offload support was indicated, but the + * device reported ILLEGAL REQUEST (for some devices + * there is no non-destructive way to verify whether + * WRITE ZEROES is actually supported). 
+ */ + ret = -EOPNOTSUPP; + } + } return ret; } -- cgit v1.2.3 From 2527d99789e248576ac8081530cd4fd88730f8c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:33:42 -0600 Subject: elevator: lookup mq vs non-mq elevators If an IO scheduler is selected via elevator= and it doesn't match the driver in question wrt blk-mq support, then we fail to boot. The elevator= parameter is deprecated and only supported for non-mq devices. Augment the elevator lookup API so that we pass in if we're looking for an mq capable scheduler or not, so that we only ever return a valid type for the queue in question. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196695 Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/elevator.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index 7ae50eb2732bb..77856bf295688 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,12 +83,15 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static struct elevator_type *elevator_find(const char *name) +/* + * Return scheduler with name 'name' and with matching 'mq capability + */ +static struct elevator_type *elevator_find(const char *name, bool mq) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (!strcmp(e->elevator_name, name)) + if (!strcmp(e->elevator_name, name) && (mq == e->uses_mq)) return e; } @@ -100,25 +103,25 @@ static void elevator_put(struct elevator_type *e) module_put(e->elevator_owner); } -static struct elevator_type *elevator_get(const char *name, bool try_loading) +static struct elevator_type *elevator_get(struct request_queue *q, + const char *name, bool try_loading) { struct elevator_type *e; spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->mq_ops != NULL); if (!e && try_loading) { spin_unlock(&elv_list_lock); request_module("%s-iosched", name); spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->mq_ops != NULL); } if (e && !try_module_get(e->elevator_owner)) e = NULL; spin_unlock(&elv_list_lock); - return e; } @@ -144,8 +147,12 @@ void __init load_default_elevator_module(void) if (!chosen_elevator[0]) return; + /* + * Boot parameter is deprecated, we haven't supported that for MQ. + * Only look for non-mq schedulers from here. + */ spin_lock(&elv_list_lock); - e = elevator_find(chosen_elevator); + e = elevator_find(chosen_elevator, false); spin_unlock(&elv_list_lock); if (!e) @@ -202,7 +209,7 @@ int elevator_init(struct request_queue *q, char *name) q->boundary_rq = NULL; if (name) { - e = elevator_get(name, true); + e = elevator_get(q, name, true); if (!e) return -EINVAL; } @@ -214,7 +221,7 @@ int elevator_init(struct request_queue *q, char *name) * allowed from async. */ if (!e && !q->mq_ops && *chosen_elevator) { - e = elevator_get(chosen_elevator, false); + e = elevator_get(q, chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", chosen_elevator); @@ -229,17 +236,17 @@ int elevator_init(struct request_queue *q, char *name) */ if (q->mq_ops) { if (q->nr_hw_queues == 1) - e = elevator_get("mq-deadline", false); + e = elevator_get(q, "mq-deadline", false); if (!e) return 0; } else - e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); if (!e) { printk(KERN_ERR "Default I/O scheduler not found. 
" \ "Using noop.\n"); - e = elevator_get("noop", false); + e = elevator_get(q, "noop", false); } } @@ -905,7 +912,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name)) { + if (elevator_find(e->elevator_name, e->uses_mq)) { spin_unlock(&elv_list_lock); if (e->icq_cache) kmem_cache_destroy(e->icq_cache); @@ -1066,7 +1073,7 @@ static int __elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); - e = elevator_get(strstrip(elevator_name), true); + e = elevator_get(q, strstrip(elevator_name), true); if (!e) return -EINVAL; @@ -1076,15 +1083,6 @@ static int __elevator_change(struct request_queue *q, const char *name) return 0; } - if (!e->uses_mq && q->mq_ops) { - elevator_put(e); - return -EINVAL; - } - if (e->uses_mq && !q->mq_ops) { - elevator_put(e); - return -EINVAL; - } - return elevator_switch(q, e); } -- cgit v1.2.3 From 8ac0d9a81edf2ef4a2268b65b802a6b856dc77e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:35:02 -0600 Subject: elevator: allow name aliases Since we now lookup elevator types with the appropriate multiqueue capability, allow schedulers to register with an alias alongside the real name. This is in preparation for allowing 'mq-deadline' to register an alias of 'deadline' as well. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/elevator.c | 23 +++++++++++++++++------ include/linux/elevator.h | 1 + 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index 77856bf295688..7bda083d59684 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,6 +83,16 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); +static bool elevator_match(const struct elevator_type *e, const char *name) +{ + if (!strcmp(e->elevator_name, name)) + return true; + if (e->elevator_alias && !strcmp(e->elevator_alias, name)) + return true; + + return false; +} + /* * Return scheduler with name 'name' and with matching 'mq capability */ @@ -91,7 +101,7 @@ static struct elevator_type *elevator_find(const char *name, bool mq) struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (!strcmp(e->elevator_name, name) && (mq == e->uses_mq)) + if (elevator_match(e, name) && (mq == e->uses_mq)) return e; } @@ -922,9 +932,9 @@ int elv_register(struct elevator_type *e) spin_unlock(&elv_list_lock); /* print pretty message */ - if (!strcmp(e->elevator_name, chosen_elevator) || + if (elevator_match(e, chosen_elevator) || (!*chosen_elevator && - !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) + elevator_match(e, CONFIG_DEFAULT_IOSCHED))) def = " (default)"; printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, @@ -1077,8 +1087,7 @@ static int __elevator_change(struct request_queue *q, const char *name) if (!e) return -EINVAL; - if (q->elevator && - !strcmp(elevator_name, q->elevator->type->elevator_name)) { + if (q->elevator && elevator_match(q->elevator->type, elevator_name)) { elevator_put(e); return 0; } @@ -1114,6 +1123,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_queue *e = q->elevator; struct elevator_type *elv = NULL; struct elevator_type *__e; + bool uses_mq = q->mq_ops != NULL; int len = 0; if (!queue_is_rq_based(q)) @@ -1126,7 +1136,8 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); 
list_for_each_entry(__e, &elv_list, list) { - if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { + if (elv && elevator_match(elv, __e->elevator_name) && + (__e->uses_mq == uses_mq)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 5bc8f8682a3e1..6df8b14f1f6a0 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -144,6 +144,7 @@ struct elevator_type size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; + const char *elevator_alias; struct module *elevator_owner; bool uses_mq; #ifdef CONFIG_BLK_DEBUG_FS -- cgit v1.2.3 From 4d740bc9f0319229410d11e445017f47e425dbe0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 09:47:20 -0600 Subject: mq-deadline: add 'deadline' as a name alias The scheduler framework now supports looking up the appropriate scheduler with the {name,mq} tupple. We can register mq-deadline with the alias of 'deadline', so that switching to 'deadline' will do the right thing based on the type of driver attached to it. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/mq-deadline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index a1cad4331eddc..0179e484ec98c 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -657,6 +657,7 @@ static struct elevator_type mq_deadline = { #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", + .elevator_alias = "deadline", .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); -- cgit v1.2.3 From 2dd4122854f697afc777582d18548dded03ce5dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:20:01 +0200 Subject: nvme: use kref_get_unless_zero in nvme_find_get_ns For kref_get_unless_zero to protect against lookup vs free races we need to use it in all places where we aren't guaranteed to already hold a reference. There is no such guarantee in nvme_find_get_ns, so switch to kref_get_unless_zero in this function. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7fae42d595d5d..1d931deac83bc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2290,7 +2290,8 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { if (ns->ns_id == nsid) { - kref_get(&ns->kref); + if (!kref_get_unless_zero(&ns->kref)) + continue; ret = ns; break; } -- cgit v1.2.3 From c6424a90da446cff6c67be06767fc0d0be787110 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:22:00 +0200 Subject: nvme: simplify nvme_open Now that we are protected against lookup vs free races for the namespace by using kref_get_unless_zero we don't need the hack of NULLing out the disk private data during removal. 
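The safe-lookup idiom this relies on can be sketched generically (illustrative only; the "obj" type and obj_lookup() are made-up names, the real nvme code is in the diff below): a lookup path may only hand out an object if it can still take a non-zero reference, so it can never resurrect an object whose final kref_put() has already run.

struct obj {
	struct kref ref;
};

/* Return the object with an extra reference held, or NULL if it is dying. */
static struct obj *obj_lookup(struct obj *candidate)
{
	if (candidate && kref_get_unless_zero(&candidate->ref))
		return candidate;
	return NULL;
}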
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1d931deac83bc..9f8ae15c9fe8e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -253,12 +253,6 @@ static void nvme_free_ns(struct kref *kref) if (ns->ndev) nvme_nvm_unregister(ns); - if (ns->disk) { - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); - } - put_disk(ns->disk); ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); nvme_put_ctrl(ns->ctrl); @@ -270,29 +264,6 @@ static void nvme_put_ns(struct nvme_ns *ns) kref_put(&ns->kref, nvme_free_ns); } -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) -{ - struct nvme_ns *ns; - - spin_lock(&dev_list_lock); - ns = disk->private_data; - if (ns) { - if (!kref_get_unless_zero(&ns->kref)) - goto fail; - if (!try_module_get(ns->ctrl->ops->module)) - goto fail_put_ns; - } - spin_unlock(&dev_list_lock); - - return ns; - -fail_put_ns: - kref_put(&ns->kref, nvme_free_ns); -fail: - spin_unlock(&dev_list_lock); - return NULL; -} - struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags, int qid) { @@ -1056,7 +1027,16 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, static int nvme_open(struct block_device *bdev, fmode_t mode) { - return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO; + struct nvme_ns *ns = bdev->bd_disk->private_data; + + if (!kref_get_unless_zero(&ns->kref)) + return -ENXIO; + if (!try_module_get(ns->ctrl->ops->module)) { + kref_put(&ns->kref, nvme_free_ns); + return -ENXIO; + } + + return 0; } static void nvme_release(struct gendisk *disk, fmode_t mode) -- cgit v1.2.3 From d22524a4782a943bb02a9cf6885ac470210aabfc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:25:42 +0200 Subject: nvme: switch controller refcounting to use struct device Instead of allocating a separate struct device for the character device handle embedd it into struct nvme_ctrl and use it for the main controller refcounting. This removes double refcounting and gets us an automatic reference for the character device operations. We keep ctrl->device as a pointer for now to avoid chaning printks all over, but in the future we could look into message printing helpers that take a controller structure similar to what other subsystems do. Note the delete_ctrl operation always already has a reference (either through sysfs due this change, or because every open file on the /dev/nvme-fabrics node has a refernece) when it is entered now, so we don't need to do the unless_zero variant there. 
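The overall shape of the conversion, sketched with made-up foo_* names (the nvme-specific details follow in the diff): embed a struct device in the controller, free the container from the device's release callback, and use get_device()/put_device() as the reference operations.

struct foo_ctrl {
	struct device dev;	/* the refcount now lives in the embedded device */
};

static void foo_ctrl_release(struct device *dev)
{
	kfree(container_of(dev, struct foo_ctrl, dev));
}

/* after device_initialize(&ctrl->dev), set ctrl->dev.release = foo_ctrl_release */

static void foo_get_ctrl(struct foo_ctrl *ctrl)
{
	get_device(&ctrl->dev);
}

static void foo_put_ctrl(struct foo_ctrl *ctrl)
{
	put_device(&ctrl->dev);	/* the final put invokes foo_ctrl_release() */
}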
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke --- drivers/nvme/host/core.c | 43 ++++++++++++++++++++++--------------------- drivers/nvme/host/fc.c | 8 ++------ drivers/nvme/host/nvme.h | 12 +++++++++++- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 5 ++--- drivers/nvme/target/loop.c | 2 +- 6 files changed, 39 insertions(+), 33 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9f8ae15c9fe8e..3a97daa163f66 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1915,7 +1915,7 @@ static int nvme_dev_open(struct inode *inode, struct file *file) ret = -EWOULDBLOCK; break; } - if (!kref_get_unless_zero(&ctrl->kref)) + if (!kobject_get_unless_zero(&ctrl->device->kobj)) break; file->private_data = ctrl; ret = 0; @@ -2374,7 +2374,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) list_add_tail(&ns->list, &ctrl->namespaces); mutex_unlock(&ctrl->namespaces_mutex); - kref_get(&ctrl->kref); + nvme_get_ctrl(ctrl); kfree(id); @@ -2703,7 +2703,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { - device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); + device_del(ctrl->device); spin_lock(&dev_list_lock); list_del(&ctrl->node); @@ -2711,23 +2711,17 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); -static void nvme_free_ctrl(struct kref *kref) +static void nvme_free_ctrl(struct device *dev) { - struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); + struct nvme_ctrl *ctrl = + container_of(dev, struct nvme_ctrl, ctrl_device); - put_device(ctrl->device); ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); ctrl->ops->free_ctrl(ctrl); } -void nvme_put_ctrl(struct nvme_ctrl *ctrl) -{ - kref_put(&ctrl->kref, nvme_free_ctrl); -} -EXPORT_SYMBOL_GPL(nvme_put_ctrl); - /* * Initialize a NVMe controller structures. 
This needs to be called during * earliest initialization so that we have the initialized structured around @@ -2742,7 +2736,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); INIT_LIST_HEAD(&ctrl->namespaces); mutex_init(&ctrl->namespaces_mutex); - kref_init(&ctrl->kref); ctrl->dev = dev; ctrl->ops = ops; ctrl->quirks = quirks; @@ -2755,15 +2748,21 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, goto out; ctrl->instance = ret; - ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, - MKDEV(nvme_char_major, ctrl->instance), - ctrl, nvme_dev_attr_groups, - "nvme%d", ctrl->instance); - if (IS_ERR(ctrl->device)) { - ret = PTR_ERR(ctrl->device); + device_initialize(&ctrl->ctrl_device); + ctrl->device = &ctrl->ctrl_device; + ctrl->device->devt = MKDEV(nvme_char_major, ctrl->instance); + ctrl->device->class = nvme_class; + ctrl->device->parent = ctrl->dev; + ctrl->device->groups = nvme_dev_attr_groups; + ctrl->device->release = nvme_free_ctrl; + dev_set_drvdata(ctrl->device, ctrl); + ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); + if (ret) goto out_release_instance; - } - get_device(ctrl->device); + ret = device_add(ctrl->device); + if (ret) + goto out_free_name; + ida_init(&ctrl->ns_ida); spin_lock(&dev_list_lock); @@ -2779,6 +2778,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, min(default_ps_max_latency_us, (unsigned long)S32_MAX)); return 0; +out_free_name: + kfree_const(dev->kobj.name); out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index c6c903f1b172a..aa9aec6923bb7 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2692,14 +2692,10 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); int ret; - if (!kref_get_unless_zero(&ctrl->ctrl.kref)) - return -EBUSY; - + nvme_get_ctrl(&ctrl->ctrl); ret = __nvme_fc_del_ctrl(ctrl); - if (!ret) flush_workqueue(nvme_wq); - nvme_put_ctrl(&ctrl->ctrl); return ret; @@ -2918,7 +2914,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, return ERR_PTR(ret); } - kref_get(&ctrl->ctrl.kref); + nvme_get_ctrl(&ctrl->ctrl); dev_info(ctrl->ctrl.device, "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cb9d93048f3db..ae60d8342e601 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -127,12 +127,12 @@ struct nvme_ctrl { struct request_queue *admin_q; struct request_queue *connect_q; struct device *dev; - struct kref kref; int instance; struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; struct mutex namespaces_mutex; + struct device ctrl_device; struct device *device; /* char device */ struct list_head node; struct ida ns_ida; @@ -279,6 +279,16 @@ static inline void nvme_end_request(struct request *req, __le16 status, blk_mq_complete_request(req); } +static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl) +{ + get_device(ctrl->device); +} + +static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) +{ + put_device(ctrl->device); +} + void nvme_complete_rq(struct request *req); void nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 11e6fd9d0ba49..7735571ffc9ab 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ 
-2292,7 +2292,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) { dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); - kref_get(&dev->ctrl.kref); + nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); if (!schedule_work(&dev->remove_work)) nvme_put_ctrl(&dev->ctrl); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 4552744eff454..62b58f9b7d00e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1793,8 +1793,7 @@ static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) * Keep a reference until all work is flushed since * __nvme_rdma_del_ctrl can free the ctrl mem */ - if (!kref_get_unless_zero(&ctrl->ctrl.kref)) - return -EBUSY; + nvme_get_ctrl(&ctrl->ctrl); ret = __nvme_rdma_del_ctrl(ctrl); if (!ret) flush_work(&ctrl->delete_work); @@ -1955,7 +1954,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - kref_get(&ctrl->ctrl.kref); + nvme_get_ctrl(&ctrl->ctrl); mutex_lock(&nvme_rdma_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index c56354e1e4c61..f83e925fe64a6 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -642,7 +642,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); - kref_get(&ctrl->ctrl.kref); + nvme_get_ctrl(&ctrl->ctrl); changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); -- cgit v1.2.3 From a6a5149b10ec8ab8b4a9479a8230265c1b573be0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 16:59:25 +0200 Subject: nvme: get rid of nvme_ctrl_list Use the core chrdev code to set up the link between the character device and the nvme controller. This allows us to get rid of the global list of all controllers, and also ensures that we have both a reference to the controller and the transport module before the open method of the character device is called. 
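The chardev wiring follows the stock cdev-plus-device pattern, roughly as below (made-up foo_* names; simplified from the real change in the diff):

struct foo_ctrl {
	struct device dev;
	struct cdev cdev;
};

static int foo_add_chardev(struct foo_ctrl *ctrl,
			   const struct file_operations *fops,
			   struct module *owner)
{
	cdev_init(&ctrl->cdev, fops);
	ctrl->cdev.owner = owner;	/* pins the transport module across open() */
	/*
	 * Link the cdev and the device; ->open() can then recover the
	 * controller via container_of(inode->i_cdev, struct foo_ctrl, cdev)
	 * without any global list or lookup lock.
	 */
	return cdev_device_add(&ctrl->cdev, &ctrl->dev);
}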
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke --- drivers/nvme/host/core.c | 76 ++++++++++-------------------------------------- drivers/nvme/host/nvme.h | 3 +- 2 files changed, 18 insertions(+), 61 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3a97daa163f66..a56a1e0432e7b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -52,9 +52,6 @@ static u8 nvme_max_retries = 5; module_param_named(max_retries, nvme_max_retries, byte, 0644); MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); -static int nvme_char_major; -module_param(nvme_char_major, int, 0); - static unsigned long default_ps_max_latency_us = 100000; module_param(default_ps_max_latency_us, ulong, 0644); MODULE_PARM_DESC(default_ps_max_latency_us, @@ -71,11 +68,8 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); struct workqueue_struct *nvme_wq; EXPORT_SYMBOL_GPL(nvme_wq); -static LIST_HEAD(nvme_ctrl_list); -static DEFINE_SPINLOCK(dev_list_lock); - static DEFINE_IDA(nvme_instance_ida); - +static dev_t nvme_chr_devt; static struct class *nvme_class; static __le32 nvme_get_log_dw10(u8 lid, size_t size) @@ -1031,20 +1025,12 @@ static int nvme_open(struct block_device *bdev, fmode_t mode) if (!kref_get_unless_zero(&ns->kref)) return -ENXIO; - if (!try_module_get(ns->ctrl->ops->module)) { - kref_put(&ns->kref, nvme_free_ns); - return -ENXIO; - } - return 0; } static void nvme_release(struct gendisk *disk, fmode_t mode) { - struct nvme_ns *ns = disk->private_data; - - module_put(ns->ctrl->ops->module); - nvme_put_ns(ns); + nvme_put_ns(disk->private_data); } static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) @@ -1902,33 +1888,12 @@ EXPORT_SYMBOL_GPL(nvme_init_identify); static int nvme_dev_open(struct inode *inode, struct file *file) { - struct nvme_ctrl *ctrl; - int instance = iminor(inode); - int ret = -ENODEV; - - spin_lock(&dev_list_lock); - list_for_each_entry(ctrl, &nvme_ctrl_list, node) { - if (ctrl->instance != instance) - continue; - - if (!ctrl->admin_q) { - ret = -EWOULDBLOCK; - break; - } - if (!kobject_get_unless_zero(&ctrl->device->kobj)) - break; - file->private_data = ctrl; - ret = 0; - break; - } - spin_unlock(&dev_list_lock); - - return ret; -} + struct nvme_ctrl *ctrl = + container_of(inode->i_cdev, struct nvme_ctrl, cdev); -static int nvme_dev_release(struct inode *inode, struct file *file) -{ - nvme_put_ctrl(file->private_data); + if (!ctrl->admin_q) + return -EWOULDBLOCK; + file->private_data = ctrl; return 0; } @@ -1992,7 +1957,6 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, static const struct file_operations nvme_dev_fops = { .owner = THIS_MODULE, .open = nvme_dev_open, - .release = nvme_dev_release, .unlocked_ioctl = nvme_dev_ioctl, .compat_ioctl = nvme_dev_ioctl, }; @@ -2703,11 +2667,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { - device_del(ctrl->device); - - spin_lock(&dev_list_lock); - list_del(&ctrl->node); - spin_unlock(&dev_list_lock); + cdev_device_del(&ctrl->cdev, ctrl->device); } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); @@ -2750,7 +2710,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, device_initialize(&ctrl->ctrl_device); ctrl->device = &ctrl->ctrl_device; - ctrl->device->devt = MKDEV(nvme_char_major, ctrl->instance); + ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); ctrl->device->class = nvme_class; 
ctrl->device->parent = ctrl->dev; ctrl->device->groups = nvme_dev_attr_groups; @@ -2759,16 +2719,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); if (ret) goto out_release_instance; - ret = device_add(ctrl->device); + + cdev_init(&ctrl->cdev, &nvme_dev_fops); + ctrl->cdev.owner = ops->module; + ret = cdev_device_add(&ctrl->cdev, ctrl->device); if (ret) goto out_free_name; ida_init(&ctrl->ns_ida); - spin_lock(&dev_list_lock); - list_add_tail(&ctrl->node, &nvme_ctrl_list); - spin_unlock(&dev_list_lock); - /* * Initialize latency tolerance controls. The sysfs files won't * be visible to userspace unless the device actually supports APST. @@ -2909,12 +2868,9 @@ int __init nvme_core_init(void) if (!nvme_wq) return -ENOMEM; - result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", - &nvme_dev_fops); + result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); if (result < 0) goto destroy_wq; - else if (result > 0) - nvme_char_major = result; nvme_class = class_create(THIS_MODULE, "nvme"); if (IS_ERR(nvme_class)) { @@ -2925,7 +2881,7 @@ int __init nvme_core_init(void) return 0; unregister_chrdev: - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_wq: destroy_workqueue(nvme_wq); return result; @@ -2934,7 +2890,7 @@ destroy_wq: void nvme_core_exit(void) { class_destroy(nvme_class); - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_workqueue(nvme_wq); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ae60d8342e601..1bb2bc165e543 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -15,6 +15,7 @@ #define _NVME_H #include +#include #include #include #include @@ -134,7 +135,7 @@ struct nvme_ctrl { struct mutex namespaces_mutex; struct device ctrl_device; struct device *device; /* char device */ - struct list_head node; + struct cdev cdev; struct ida ns_ida; struct work_struct reset_work; -- cgit v1.2.3 From 999ada28713d7bcf4a2c70cbab47b08cd95f2cf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 17:09:31 +0200 Subject: nvme: check for a live controller in nvme_dev_open This is a much more sensible check than just the admin queue. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a56a1e0432e7b..df525ab42fcd1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1891,7 +1891,7 @@ static int nvme_dev_open(struct inode *inode, struct file *file) struct nvme_ctrl *ctrl = container_of(inode->i_cdev, struct nvme_ctrl, cdev); - if (!ctrl->admin_q) + if (ctrl->state != NVME_CTRL_LIVE) return -EWOULDBLOCK; file->private_data = ctrl; return 0; -- cgit v1.2.3 From 3b3387620780fc9699021c85bdce5cb45a763d41 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:06 -0700 Subject: nvme: add duplicate_connect option Add the "duplicate_connect" boolean option (presence means true). Default is false. When false, the transport should validate whether a new controller request is targeted for the same host transport addressing and target transport addressing as an existing controller. If so, the new controller request should be rejected. 
When true, the callee is explicitly requesting a duplicate controller connection to be made and the new request should be attempted. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 7 ++++++- drivers/nvme/host/fabrics.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 8bca36a469245..4a83137b02684 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -548,6 +548,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, { NVMF_OPT_HOST_ID, "hostid=%s" }, + { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, { NVMF_OPT_ERR, NULL } }; @@ -566,6 +567,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->nr_io_queues = num_online_cpus(); opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; opts->kato = NVME_DEFAULT_KATO; + opts->duplicate_connect = false; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -742,6 +744,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, goto out; } break; + case NVMF_OPT_DUP_CONNECT: + opts->duplicate_connect = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -823,7 +828,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ - NVMF_OPT_HOST_ID) + NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index bf33663218cd2..a7590cc59ba26 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -57,6 +57,7 @@ enum { NVMF_OPT_HOST_TRADDR = 1 << 10, NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, NVMF_OPT_HOST_ID = 1 << 12, + NVMF_OPT_DUP_CONNECT = 1 << 13, }; /** @@ -96,6 +97,7 @@ struct nvmf_ctrl_options { unsigned int nr_io_queues; unsigned int reconnect_delay; bool discovery_nqn; + bool duplicate_connect; unsigned int kato; struct nvmf_host *host; int max_reconnects; -- cgit v1.2.3 From 991231dc48ae9bcdd363ee231f10beb771d455c1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:07 -0700 Subject: nvme: add helper to compare options to controller Adds a helper function that compares the host and subsytem specified in a connect options list vs a controller. 
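A transport would typically use the helper along these lines before creating a new association (sketch only; the foo_* list, lock and type names are made up, and the rdma/fc patches that follow are the real users):

static bool foo_existing_controller(struct nvmf_ctrl_options *opts)
{
	struct foo_ctrl *ctrl;
	bool found = false;

	mutex_lock(&foo_ctrl_mutex);
	list_for_each_entry(ctrl, &foo_ctrl_list, list) {
		found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts);
		if (found)
			break;
	}
	mutex_unlock(&foo_ctrl_mutex);

	return found;
}

/* in the create path: */
	if (!opts->duplicate_connect && foo_existing_controller(opts))
		return ERR_PTR(-EALREADY);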
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a7590cc59ba26..42232e731f19f 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -133,6 +133,18 @@ struct nvmf_transport_ops { struct nvmf_ctrl_options *opts); }; +static inline bool +nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + if (strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || + strcmp(opts->host->nqn, ctrl->opts->host->nqn) || + memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t))) + return false; + + return true; +} + int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); -- cgit v1.2.3 From 36e835f2432d8f16845307b5fb7e88d1e5af041d Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:09 -0700 Subject: nvme-rdma: add support for duplicate_connect option Adds support for the duplicate_connect option. When set to true, checks whether there's an existing controller via the same target address (traddr), target port (trsvcid), and if specified, host address (host_traddr). Fails the connection request if there is an existing controller. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 62b58f9b7d00e..32b0a9ef26e67 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1851,6 +1851,83 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reinit_request = nvme_rdma_reinit_request, }; +static inline bool +__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + char *stdport = __stringify(NVME_RDMA_IP_PORT); + + + if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) || + strcmp(opts->traddr, ctrl->ctrl.opts->traddr)) + return false; + + if (opts->mask & NVMF_OPT_TRSVCID && + ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid)) + return false; + } else if (opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(opts->trsvcid, stdport)) + return false; + } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(stdport, ctrl->ctrl.opts->trsvcid)) + return false; + } + /* else, it's a match as both have stdport. Fall to next checks */ + + /* + * checking the local address is rough. In most cases, one + * is not specified and the host port is selected by the stack. + * + * Assume no match if: + * local address is specified and address is not the same + * local address is not specified but remote is, or vice versa + * (admin using specific host_traddr when it matters). + */ + if (opts->mask & NVMF_OPT_HOST_TRADDR && + ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr)) + return false; + } else if (opts->mask & NVMF_OPT_HOST_TRADDR || + ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) + return false; + /* + * if neither controller had an host port specified, assume it's + * a match as everything else matched. 
+ */ + + return true; +} + +/* + * Fails a connection request if it matches an existing controller + * (association) with the same tuple: + * + * + * if local address is not specified in the request, it will match an + * existing controller with all the other parameters the same and no + * local port address specified as well. + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. + */ +static bool +nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + bool found = false; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { + found = __nvme_rdma_options_match(ctrl, opts); + if (found) + break; + } + mutex_unlock(&nvme_rdma_ctrl_mutex); + + return found; +} + static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { @@ -1887,6 +1964,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, } } + if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) { + ret = -EALREADY; + goto out_free_ctrl; + } + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) -- cgit v1.2.3 From 56d5f4f108efd4e439d2320738e2b894af0920bb Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:08 -0700 Subject: nvme-fc: add support for duplicate_connect option Adds support for the duplicate_connect option. When set to true, checks whether there's an existing controller via the same host port and target port for the same host (hostnqn, hostid) to the same subsystem. Fails the connection request if an existing controller. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index aa9aec6923bb7..d12775d129835 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2792,6 +2792,33 @@ static const struct blk_mq_ops nvme_fc_admin_mq_ops = { }; +/* + * Fails a controller request if it matches an existing controller + * (association) with the same tuple: + * + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. 
+ */ +static bool +nvme_fc_existing_controller(struct nvme_fc_rport *rport, + struct nvmf_ctrl_options *opts) +{ + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + bool found = false; + + spin_lock_irqsave(&rport->lock, flags); + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts); + if (found) + break; + } + spin_unlock_irqrestore(&rport->lock, flags); + + return found; +} + static struct nvme_ctrl * nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, struct nvme_fc_lport *lport, struct nvme_fc_rport *rport) @@ -2806,6 +2833,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_fail; } + if (!opts->duplicate_connect && + nvme_fc_existing_controller(rport, opts)) { + ret = -EALREADY; + goto out_fail; + } + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); if (!ctrl) { ret = -ENOMEM; -- cgit v1.2.3 From ecad0d2cb8a7997afdc95031ee3328b997aba5c4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 23 Oct 2017 15:11:36 -0700 Subject: nvme-fc: remove NVME_FC_MAX_SEGMENTS The define is an arbitrary limit to the io size on the initiator, capping the io to 1MB-4KB. Remove the define from the transport. I/O size will solely be limited by the LLDD sg limits. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 6 ++---- include/linux/nvme-fc-driver.h | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index d12775d129835..b600c07803cf3 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2435,7 +2435,6 @@ static int nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - u32 segs; int ret; bool changed; @@ -2486,9 +2485,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_disconnect_admin_queue; - segs = min_t(u32, NVME_FC_MAX_SEGMENTS, - ctrl->lport->ops->max_sgl_segments); - ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9); + ctrl->ctrl.max_hw_sectors = + (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); ret = nvme_init_identify(&ctrl->ctrl); if (ret) diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 4ea03b9a5c8cb..2be4db3539379 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -102,8 +102,6 @@ enum nvmefc_fcp_datadir { }; -#define NVME_FC_MAX_SEGMENTS 256 - /** * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport * to LLDD in order to perform a NVME FCP IO operation. -- cgit v1.2.3 From 4e9b6f20828ac880dbc1fa2fdbafae779473d1af Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 19 Oct 2017 10:00:48 -0700 Subject: block: Fix a race between blk_cleanup_queue() and timeout handling Make sure that if the timeout timer fires after a queue has been marked "dying" that the affected requests are finished. 
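The ordering in blk_sync_queue() is what closes the race (the comments here are explanatory, the two calls are from the patch below):

	del_timer_sync(&q->timeout);		/* the timer can no longer queue new timeout work */
	cancel_work_sync(&q->timeout_work);	/* wait for work the timer queued before it was stopped */

With the early blk_queue_enter() bail-out removed, blk_timeout_work() then runs to completion even on a dying queue and finishes the affected requests.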
Reported-by: chenxiang (M) Fixes: commit 287922eb0b18 ("block: defer timeouts to a workqueue") Signed-off-by: Bart Van Assche Tested-by: chenxiang (M) Cc: Christoph Hellwig Cc: Keith Busch Cc: Hannes Reinecke Cc: Ming Lei Cc: Johannes Thumshirn Cc: Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk-timeout.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index e8e149ccc86b7..bb4fce694a60c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -333,6 +333,7 @@ EXPORT_SYMBOL(blk_stop_queue); void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); + cancel_work_sync(&q->timeout_work); if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; @@ -844,6 +845,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, NULL); INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index e3e9c9771d361..764ecf9aeb305 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -134,8 +134,6 @@ void blk_timeout_work(struct work_struct *work) struct request *rq, *tmp; int next_set = 0; - if (blk_queue_enter(q, true)) - return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -145,7 +143,6 @@ void blk_timeout_work(struct work_struct *work) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); - blk_queue_exit(q); } /** -- cgit v1.2.3 From d59b23795933678c9638fd20c942d2b4f3cd6185 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 30 Oct 2017 14:46:31 -0700 Subject: bcache: only permit to recovery read error when cache device is clean When bcache does read I/Os, for example in writeback or writethrough mode, if a read request on the cache device fails, bcache will try to recover the request by reading from the cached device. If the data on the cached device is not synced with the cache device, the requester will get stale data. For a critical storage system like a database, providing stale data from recovery may result in application-level data corruption, which is unacceptable. With this patch, for a failed read request in writeback or writethrough mode, recovery of a recoverable read request only happens when the cache device is clean. That is to say, all data on the cached device is up to date. For other cache modes in bcache, read requests will never hit cached_dev_read_error(), so they don't need this patch. Please note, because the cache mode can be switched arbitrarily at run time, a writethrough mode might have been switched from a writeback mode. Therefore checking dc->has_dirty in writethrough mode still makes sense. Changelog: V4: Fix parens error pointed out by Michael Lyle. v3: By response from Kent Overstreet, who thinks recovering stale data is a bug to fix and an option to permit it is unnecessary, the sysfs file is removed in this version. v2: rename sysfs entry from allow_stale_data_on_failure to allow_stale_data_on_failure, and fix the confusing commit log. v1: initial patch posted.
[small change to patch comment spelling by mlyle] Signed-off-by: Coly Li Signed-off-by: Michael Lyle Reported-by: Arne Wolf Reviewed-by: Michael Lyle Cc: Kent Overstreet Cc: Nix Cc: Kai Krakow Cc: Eric Wheeler Cc: Junhui Tang Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 163a17a808741..886e4b6643f1b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -705,8 +705,16 @@ static void cached_dev_read_error(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - if (s->recoverable) { + /* + * If cache device is dirty (dc->has_dirty is non-zero), then + * recovery a failed read request from cached device may get a + * stale data back. So read failure recovery is only permitted + * when cache device is clean. + */ + if (s->recoverable && + (dc && !atomic_read(&dc->has_dirty))) { /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); -- cgit v1.2.3 From 3b304d24a718ae779ee9c7f2014dd3b2d0893b70 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 30 Oct 2017 14:46:32 -0700 Subject: bcache: convert cached_dev.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable cached_dev.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Reviewed-by: Michael Lyle Signed-off-by: Elena Reshetova Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 7 ++++--- drivers/md/bcache/super.c | 6 +++--- drivers/md/bcache/writeback.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d77c4829c4972..363ea6256b393 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -184,6 +184,7 @@ #include #include #include +#include #include #include @@ -296,7 +297,7 @@ struct cached_dev { struct semaphore sb_write_mutex; /* Refcount on the cache set. Always nonzero when we're caching. 
*/ - atomic_t count; + refcount_t count; struct work_struct detach; /* @@ -805,13 +806,13 @@ do { \ static inline void cached_dev_put(struct cached_dev *dc) { - if (atomic_dec_and_test(&dc->count)) + if (refcount_dec_and_test(&dc->count)) schedule_work(&dc->detach); } static inline bool cached_dev_get(struct cached_dev *dc) { - if (!atomic_inc_not_zero(&dc->count)) + if (!refcount_inc_not_zero(&dc->count)) return false; /* Paired with the mb in cached_dev_attach */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 72c3b2929ef02..46134c45c6f69 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -902,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w) closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); - BUG_ON(atomic_read(&dc->count)); + BUG_ON(refcount_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -1029,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) * dc->c must be set before dc->count != 0 - paired with the mb in * cached_dev_get() */ - atomic_set(&dc->count, 1); + refcount_set(&dc->count, 1); /* Block writeback thread, but spawn it */ down_write(&dc->writeback_lock); @@ -1041,7 +1041,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); - atomic_inc(&dc->count); + refcount_inc(&dc->count); bch_writeback_queue(dc); } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 34bcf49d737bd..7d25bff37a9bf 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -91,7 +91,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) { if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); + refcount_inc(&dc->count); if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); -- cgit v1.2.3 From d44c2f9e7cc0041f0cd88df1fe7a1fceb713ab14 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Mon, 30 Oct 2017 14:46:33 -0700 Subject: bcache: update bucket_in_use in real time bucket_in_use is updated in gc thread which triggered by invalidating or writing sectors_to_gc dirty data, It's a long interval. Therefore, when we use it to compare with the threshold, it is often not timely, which leads to inaccurate judgment and often results in bucket depletion. We have send a patch before, by the means of updating bucket_in_use periodically In gc thread, which Coly thought that would lead high latency, In this patch, we add avail_nbuckets to record the count of available buckets, and we calculate bucket_in_use when alloc or free bucket in real time. 
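The bookkeeping then reduces to a single expression that is re-evaluated on every bucket allocation and free; as a worked example with made-up numbers, nbuckets = 1000 and avail_nbuckets = 250 gives in_use = 75 (percent):

	stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;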
[edited by ML: eliminated some whitespace errors] Signed-off-by: Tang Junhui Signed-off-by: Michael Lyle Reviewed-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 10 ++++++++++ drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 17 ++++++++++------- drivers/md/bcache/btree.h | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 4c40870e99f5a..8c5a626343d4d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -442,6 +442,11 @@ out: b->prio = INITIAL_PRIO; } + if (ca->set->avail_nbuckets > 0) { + ca->set->avail_nbuckets--; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } + return r; } @@ -449,6 +454,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); + + if (ca->set->avail_nbuckets < ca->set->nbuckets) { + ca->set->avail_nbuckets++; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } } void bch_bucket_free(struct cache_set *c, struct bkey *k) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 363ea6256b393..e274082330dcd 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -581,6 +581,7 @@ struct cache_set { uint8_t need_gc; struct gc_stat gc_stats; size_t nbuckets; + size_t avail_nbuckets; struct task_struct *gc_thread; /* Where in the btree gc currently is */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 866dcf78ff8e6..d8865e6ead370 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1240,6 +1240,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) __bch_btree_mark_key(c, level, k); } +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) +{ + stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; +} + static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; @@ -1651,9 +1656,8 @@ static void btree_gc_start(struct cache_set *c) mutex_unlock(&c->bucket_lock); } -static size_t bch_btree_gc_finish(struct cache_set *c) +static void bch_btree_gc_finish(struct cache_set *c) { - size_t available = 0; struct bucket *b; struct cache *ca; unsigned i; @@ -1690,6 +1694,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c) } rcu_read_unlock(); + c->avail_nbuckets = 0; for_each_cache(ca, c, i) { uint64_t *i; @@ -1711,18 +1716,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c) BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) - available++; + c->avail_nbuckets++; } } mutex_unlock(&c->bucket_lock); - return available; } static void bch_btree_gc(struct cache_set *c) { int ret; - unsigned long available; struct gc_stat stats; struct closure writes; struct btree_op op; @@ -1745,14 +1748,14 @@ static void bch_btree_gc(struct cache_set *c) pr_warn("gc failed!"); } while (ret); - available = bch_btree_gc_finish(c); + bch_btree_gc_finish(c); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); stats.data <<= 9; - stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; + bch_update_bucket_in_use(c, &stats); memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); trace_bcache_gc_end(c); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 73da1f5626cb8..4073aca09a498 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -305,5 
+305,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *); struct keybuf_key *bch_keybuf_next(struct keybuf *); struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, struct bkey *, keybuf_pred_fn *); - +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); #endif -- cgit v1.2.3 From c157313791a999646901b3e3c6888514ebc36d62 Mon Sep 17 00:00:00 2001 From: "tang.junhui" Date: Mon, 30 Oct 2017 14:46:34 -0700 Subject: bcache: fix wrong cache_misses statistics Currently, Cache missed IOs are identified by s->cache_miss, but actually, there are many situations that missed IOs are not assigned a value for s->cache_miss in cached_dev_cache_miss(), for example, a bypassed IO (s->iop.bypass = 1), or the cache_bio allocate failed. In these situations, it will go to out_put or out_submit, and s->cache_miss is null, which leads bch_mark_cache_accounting() to treat this IO as a hit IO. [ML: applied by 3-way merge] Signed-off-by: tang.junhui Reviewed-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 886e4b6643f1b..597dd1e87beab 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -470,6 +470,7 @@ struct search { unsigned recoverable:1; unsigned write:1; unsigned read_dirty_data:1; + unsigned cache_missed:1; unsigned long start_time; @@ -656,6 +657,7 @@ static inline struct search *search_alloc(struct bio *bio, s->orig_bio = bio; s->cache_miss = NULL; + s->cache_missed = 0; s->d = d; s->recoverable = 1; s->write = op_is_write(bio_op(bio)); @@ -775,7 +777,7 @@ static void cached_dev_read_done_bh(struct closure *cl) struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s->iop.c, s->d, - !s->cache_miss, s->iop.bypass); + !s->cache_missed, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); if (s->iop.status) @@ -794,6 +796,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; + s->cache_missed = 1; + if (s->cache_miss || s->iop.bypass) { miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; -- cgit v1.2.3 From 330a4db89d39a6b43f36da16824eaa7a7509d34d Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Mon, 30 Oct 2017 14:46:35 -0700 Subject: bcache: explicitly destroy mutex while exiting mutex_destroy does nothing most of time, but it's better to call it to make the code future proof and it also has some meaning for like mutex debug. As Coly pointed out in a previous review, bcache_exit() may not be able to handle all the references properly if userspace registers cache and backing devices right before bch_debug_init runs and bch_debug_init failes later. So not exposing userspace interface until everything is ready to avoid that issue. 
Signed-off-by: Liang Chen Reviewed-by: Michael Lyle Reviewed-by: Coly Li Reviewed-by: Eric Wheeler Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 46134c45c6f69..b4d28928dec59 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2095,6 +2095,7 @@ static void bcache_exit(void) if (bcache_major) unregister_blkdev(bcache_major, "bcache"); unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); } static int __init bcache_init(void) @@ -2113,14 +2114,15 @@ static int __init bcache_init(void) bcache_major = register_blkdev(0, "bcache"); if (bcache_major < 0) { unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); return bcache_major; } if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || - sysfs_create_files(bcache_kobj, files) || bch_request_init() || - bch_debug_init(bcache_kobj)) + bch_debug_init(bcache_kobj) || + sysfs_create_files(bcache_kobj, files)) goto err; return 0; -- cgit v1.2.3 From 0e7d3a8d4ec3f662faa7ef7f2957459b33f2481d Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Tue, 31 Oct 2017 14:15:02 -0600 Subject: MAINTAINERS: Remove Rafael from Opal maintainers. He is no longer working on storage. Signed-off-by: Scott Bauer Signed-off-by: Jens Axboe --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c0bbcf1c81884..6e9343af6bbfa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12041,7 +12041,6 @@ F: drivers/mmc/host/sdhci-spear.c SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER M: Scott Bauer M: Jonathan Derrick -M: Rafael Antognolli L: linux-block@vger.kernel.org S: Supported F: block/sed* -- cgit v1.2.3 From 5e3d02bbafad38975099b5848f5ebadedcf7bb7e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:25 +0800 Subject: blk-mq-sched: dispatch from scheduler IFF progress is made in ->dispatch When the hw queue is busy, we shouldn't take requests from the scheduler queue any more, otherwise it is difficult to do IO merge. This patch fixes the awful IO performance on some SCSI devices(lpfc, qla2xxx, ...) when mq-deadline/kyber is used by not taking requests if hw queue is busy. 
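In effect the dispatch path becomes the following loop (simplified sketch; the exact change is in the diff below):

	/*
	 * Keep pulling requests from the scheduler only while the driver
	 * accepts everything that has already been handed to it.
	 */
	do {
		rq = e->type->ops.mq.dispatch_request(hctx);
		if (!rq)
			break;
		list_add(&rq->queuelist, &rq_list);
	} while (blk_mq_dispatch_rq_list(q, &rq_list));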
Reviewed-by: Omar Sandoval Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4ab69435708c2..eca011fdfa0ed 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -94,7 +94,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool did_work = false; + bool do_sched_dispatch = true; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -125,18 +125,18 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - did_work = blk_mq_dispatch_rq_list(q, &rq_list); + do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list); } else if (!has_sched_dispatch) { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list); } /* - * We want to dispatch from the scheduler if we had no work left - * on the dispatch list, OR if we did have work but weren't able - * to make progress. + * We want to dispatch from the scheduler if there was nothing + * on the dispatch list or we were able to dispatch from the + * dispatch list. */ - if (!did_work && has_sched_dispatch) { + if (do_sched_dispatch && has_sched_dispatch) { do { struct request *rq; -- cgit v1.2.3 From caf8eb0d604a0eaeb8111eb4d36853a6d08eebe7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:26 +0800 Subject: blk-mq-sched: move actual dispatching into one helper So that it becomes easy to support to dispatch from sw queue in the following patch. No functional change. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Suggested-by: Christoph Hellwig # for simplifying dispatch logic Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index eca011fdfa0ed..be29ba8494085 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,12 +89,26 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } +static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct elevator_queue *e = q->elevator; + LIST_HEAD(rq_list); + + do { + struct request *rq = e->type->ops.mq.dispatch_request(hctx); + + if (!rq) + break; + list_add(&rq->queuelist, &rq_list); + } while (blk_mq_dispatch_rq_list(q, &rq_list)); +} + void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool do_sched_dispatch = true; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -122,30 +136,21 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. + * + * We want to dispatch from the scheduler if there was nothing + * on the dispatch list or we were able to dispatch from the + * dispatch list. 
*/ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list); - } else if (!has_sched_dispatch) { + if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch) + blk_mq_do_dispatch_sched(hctx); + } else if (has_sched_dispatch) { + blk_mq_do_dispatch_sched(hctx); + } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list); } - - /* - * We want to dispatch from the scheduler if there was nothing - * on the dispatch list or we were able to dispatch from the - * dispatch list. - */ - if (do_sched_dispatch && has_sched_dispatch) { - do { - struct request *rq; - - rq = e->type->ops.mq.dispatch_request(hctx); - if (!rq) - break; - list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list)); - } } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, -- cgit v1.2.3 From 7930d0a00ff5dbcc80f793d1a7a6b8de4e591f1a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:27 +0800 Subject: sbitmap: introduce __sbitmap_for_each_set() For blk-mq, we need to be able to iterate software queues starting from any queue in a round robin fashion, so introduce this helper. Reviewed-by: Omar Sandoval Cc: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 64 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index a1904aadbc450..0dcc60e820de7 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb); */ bool sbitmap_any_bit_clear(const struct sbitmap *sb); +#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) +#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) + typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** - * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. @@ -222,35 +226,61 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. */ -static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, - void *data) +static inline void __sbitmap_for_each_set(struct sbitmap *sb, + unsigned int start, + sb_for_each_fn fn, void *data) { - unsigned int i; + unsigned int index; + unsigned int nr; + unsigned int scanned = 0; - for (i = 0; i < sb->map_nr; i++) { - struct sbitmap_word *word = &sb->map[i]; - unsigned int off, nr; + if (start >= sb->depth) + start = 0; + index = SB_NR_TO_INDEX(sb, start); + nr = SB_NR_TO_BIT(sb, start); - if (!word->word) - continue; + while (scanned < sb->depth) { + struct sbitmap_word *word = &sb->map[index]; + unsigned int depth = min_t(unsigned int, word->depth - nr, + sb->depth - scanned); - nr = 0; - off = i << sb->shift; + scanned += depth; + if (!word->word) + goto next; + + /* + * On the first iteration of the outer loop, we need to add the + * bit offset back to the size of the word for find_next_bit(). + * On all other iterations, nr is zero, so this is a noop. 
+ */ + depth += nr; while (1) { - nr = find_next_bit(&word->word, word->depth, nr); - if (nr >= word->depth) + nr = find_next_bit(&word->word, depth, nr); + if (nr >= depth) break; - - if (!fn(sb, off + nr, data)) + if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } +next: + nr = 0; + if (++index >= sb->map_nr) + index = 0; } } -#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) -#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) +/** + * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @sb: Bitmap to iterate over. + * @fn: Callback. Should return true to continue or false to break early. + * @data: Pointer to pass to callback. + */ +static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, + void *data) +{ + __sbitmap_for_each_set(sb, 0, fn, data); +} static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) -- cgit v1.2.3 From 63ba8e31c3ac6393b07c6e18538814a730478766 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:28 +0800 Subject: block: kyber: check if there are requests in ctx in kyber_has_work() There may be request in sw queue, and not fetched to domain queue yet, so check it in kyber_has_work(). Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index db5bfc6342d34..b4df317c29169 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -649,7 +649,7 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) if (!list_empty_careful(&khd->rqs[i])) return true; } - return false; + return sbitmap_any_bit_set(&hctx->ctx_map); } #define KYBER_LAT_SHOW_STORE(op) \ -- cgit v1.2.3 From de1482974080ec9ef414bf048b2646b246b63f6e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:29 +0800 Subject: blk-mq: introduce .get_budget and .put_budget in blk_mq_ops For SCSI devices, there is often a per-request-queue depth, which needs to be respected before queuing one request. Currently blk-mq always dequeues the request first, then calls .queue_rq() to dispatch the request to lld. One obvious issue with this approach is that I/O merging may not be successful, because when the per-request-queue depth can't be respected, .queue_rq() has to return BLK_STS_RESOURCE, and then this request has to stay in hctx->dispatch list. This means it never gets a chance to be merged with other IO. This patch introduces .get_budget and .put_budget callback in blk_mq_ops, then we can try to get reserved budget first before dequeuing request. If the budget for queueing I/O can't be satisfied, we don't need to dequeue request at all. Hence the request can be left in the IO scheduler queue, for more merging opportunities. 
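For illustration, a driver with a simple per-device queue depth could wire up the new callbacks roughly as in the sketch below. The foo_* names, struct foo_device and its in_flight counter are hypothetical and not part of this series; SCSI's actual implementation follows in a later patch:

	#include <linux/blk-mq.h>
	#include <linux/atomic.h>

	struct foo_device {
		atomic_t	in_flight;	/* requests currently owned by the driver */
		int		queue_depth;	/* per-device limit to respect */
	};

	/* reserve one unit of budget before blk-mq dequeues a request for us */
	static blk_status_t foo_get_budget(struct blk_mq_hw_ctx *hctx)
	{
		struct foo_device *foo = hctx->queue->queuedata;

		if (atomic_inc_return(&foo->in_flight) > foo->queue_depth) {
			atomic_dec(&foo->in_flight);
			/* the request stays in the scheduler, so it can still be merged */
			return BLK_STS_RESOURCE;
		}
		return BLK_STS_OK;
	}

	/* release budget that was reserved but not consumed by ->queue_rq() */
	static void foo_put_budget(struct blk_mq_hw_ctx *hctx)
	{
		struct foo_device *foo = hctx->queue->queuedata;

		atomic_dec(&foo->in_flight);
	}

	static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
					 const struct blk_mq_queue_data *bd)
	{
		/*
		 * Hypothetical: hand bd->rq to the hardware. From this point the
		 * budget belongs to the request and is dropped on its completion.
		 */
		return BLK_STS_OK;
	}

	static const struct blk_mq_ops foo_mq_ops = {
		.get_budget	= foo_get_budget,
		.put_budget	= foo_put_budget,
		.queue_rq	= foo_queue_rq,
	};

As the new comment in blk-mq.h notes, once ->queue_rq() has run it is the driver's responsibility to release the reserved budget.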
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 55 +++++++++++++++++++++++++++++++++++++++++--------- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 43 ++++++++++++++++++++++++++++++++++----- block/blk-mq.h | 20 +++++++++++++++++- include/linux/blk-mq.h | 11 ++++++++++ 5 files changed, 114 insertions(+), 17 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index be29ba8494085..8e525e66a0d97 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,31 +89,57 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +/* return true if hctx need to run again */ +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; LIST_HEAD(rq_list); do { - struct request *rq = e->type->ops.mq.dispatch_request(hctx); + struct request *rq; + blk_status_t ret; - if (!rq) + if (e->type->ops.mq.has_work && + !e->type->ops.mq.has_work(hctx)) break; + + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) + return true; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } else if (ret != BLK_STS_OK) { + blk_mq_end_request(rq, ret); + continue; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list)); + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + return false; } -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +/* return true if hw queue need to be run again */ +bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; LIST_HEAD(rq_list); + bool run_queue = false; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) - return; + return false; hctx->run++; @@ -143,14 +169,23 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch) - blk_mq_do_dispatch_sched(hctx); + if (blk_mq_dispatch_rq_list(q, &rq_list, false) && + has_sched_dispatch) + run_queue = blk_mq_do_dispatch_sched(hctx); } else if (has_sched_dispatch) { - blk_mq_do_dispatch_sched(hctx); + run_queue = blk_mq_do_dispatch_sched(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(q, &rq_list); + blk_mq_dispatch_rq_list(q, &rq_list, false); } + + if (run_queue && !blk_mq_sched_needs_restart(hctx) && + !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) { + blk_mq_sched_mark_restart_hctx(hctx); + return true; + } + + return false; } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 9267d0b7c1978..2434061cc5b75 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -22,7 +22,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int 
blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); diff --git a/block/blk-mq.c b/block/blk-mq.c index 40cba1b1978f2..dcb467369999e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1048,7 +1048,8 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) return true; } -bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) +bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq; @@ -1057,6 +1058,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) if (list_empty(list)) return false; + WARN_ON(!list_is_singular(list) && got_budget); + /* * Now process all the entries, sending them to the driver. */ @@ -1074,16 +1077,30 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. */ - if (!blk_mq_dispatch_wait_add(hctx)) + if (!blk_mq_dispatch_wait_add(hctx)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); break; + } /* * It's possible that a tag was freed in the window * between the allocation failure and adding the * hardware queue to the wait queue. */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) + if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); + break; + } + } + + if (!got_budget) { + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) break; + if (ret != BLK_STS_OK) + goto fail_rq; } list_del_init(&rq->queuelist); @@ -1111,6 +1128,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) break; } + fail_rq: if (unlikely(ret != BLK_STS_OK)) { errors++; blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1169,6 +1187,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; + bool run_queue; /* * We should be running this queue from one of the CPUs that @@ -1185,15 +1204,18 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_sched_dispatch_requests(hctx); + run_queue = blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { might_sleep(); srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - blk_mq_sched_dispatch_requests(hctx); + run_queue = blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } + + if (run_queue) + blk_mq_run_hw_queue(hctx, true); } /* @@ -1582,6 +1604,13 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) { + blk_mq_put_driver_tag(rq); + goto insert; + } else if (ret != BLK_STS_OK) + goto fail_rq; + new_cookie = request_to_qc_t(hctx, rq); /* @@ -1598,6 +1627,7 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, __blk_mq_requeue_request(rq); goto insert; default: + fail_rq: *cookie = BLK_QC_T_NONE; blk_mq_end_request(rq, ret); return; @@ -2582,6 +2612,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->ops->queue_rq) return -EINVAL; + if (!set->ops->get_budget ^ !set->ops->put_budget) + return -EINVAL; + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth to %u\n", 
BLK_MQ_MAX_DEPTH); diff --git a/block/blk-mq.h b/block/blk-mq.h index ef15b3414da54..e413b732374e6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -30,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *); +bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, @@ -137,4 +137,22 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->put_budget) + q->mq_ops->put_budget(hctx); +} + +static inline blk_status_t blk_mq_get_dispatch_budget( + struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->get_budget) + return q->mq_ops->get_budget(hctx); + return BLK_STS_OK; +} + #endif diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f5..901457df3d64d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -90,6 +90,8 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -111,6 +113,15 @@ struct blk_mq_ops { */ queue_rq_fn *queue_rq; + /* + * Reserve budget before queue request, once .queue_rq is + * run, it is driver's responsibility to release the + * reserved budget. Also we have to handle failure case + * of .get_budget for avoiding I/O deadlock. + */ + get_budget_fn *get_budget; + put_budget_fn *put_budget; + /* * Called on request timeout */ -- cgit v1.2.3 From b347689ffbca745ac457ee27400ce1affd571c6f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:30 +0800 Subject: blk-mq-sched: improve dispatching from sw queue SCSI devices use host-wide tagset, and the shared driver tag space is often quite big. However, there is also a queue depth for each lun( .cmd_per_lun), which is often small, for example, on both lpfc and qla2xxx, .cmd_per_lun is just 3. So lots of requests may stay in sw queue, and we always flush all belonging to same hw queue and dispatch them all to driver. Unfortunately it is easy to cause queue busy because of the small .cmd_per_lun. Once these requests are flushed out, they have to stay in hctx->dispatch, and no bio merge can happen on these requests, and sequential IO performance is harmed. This patch introduces blk_mq_dequeue_from_ctx for dequeuing a request from a sw queue, so that we can dispatch them in scheduler's way. We can then avoid dequeueing too many requests from sw queue, since we don't flush ->dispatch completely. This patch improves dispatching from sw queue by using the .get_budget and .put_budget callbacks. 
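For reference, the round-robin walk over the software queues relies on the __sbitmap_for_each_set() helper introduced earlier in this series. A minimal, hypothetical usage sketch (find_next_pending() and its callback are illustrative only, not kernel code) could look like this:

	#include <linux/sbitmap.h>

	struct first_set {
		unsigned int	bitnr;
		bool		found;
	};

	static bool record_first_set(struct sbitmap *sb, unsigned int bitnr, void *data)
	{
		struct first_set *fs = data;

		fs->bitnr = bitnr;
		fs->found = true;
		return false;	/* stop the iteration at the first set bit */
	}

	/*
	 * Find the first set bit at or after 'prev + 1', wrapping around, so that
	 * successive callers service the bits (i.e. the sw queues) round-robin.
	 */
	static bool find_next_pending(struct sbitmap *sb, unsigned int prev,
				      unsigned int *bitnr)
	{
		struct first_set fs = { .found = false };

		__sbitmap_for_each_set(sb, prev + 1, record_first_set, &fs);
		if (fs.found)
			*bitnr = fs.bitnr;
		return fs.found;
	}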
Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-- block/blk-mq.c | 39 ++++++++++++++++++++++++++ block/blk-mq.h | 2 ++ include/linux/blk-mq.h | 2 ++ 4 files changed, 114 insertions(+), 3 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 8e525e66a0d97..df8581bb0a370 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -128,6 +128,61 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) return false; } +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + unsigned idx = ctx->index_hw; + + if (++idx == hctx->nr_ctx) + idx = 0; + + return hctx->ctxs[idx]; +} + +/* return true if hctx need to run again */ +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + LIST_HEAD(rq_list); + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + + do { + struct request *rq; + blk_status_t ret; + + if (!sbitmap_any_bit_set(&hctx->ctx_map)) + break; + + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) + return true; + + rq = blk_mq_dequeue_from_ctx(hctx, ctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } else if (ret != BLK_STS_OK) { + blk_mq_end_request(rq, ret); + continue; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ + list_add(&rq->queuelist, &rq_list); + + /* round robin for fair dispatch */ + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); + + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + WRITE_ONCE(hctx->dispatch_from, ctx); + + return false; +} + /* return true if hw queue need to be run again */ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { @@ -169,11 +224,24 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list, false) && - has_sched_dispatch) - run_queue = blk_mq_do_dispatch_sched(hctx); + if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { + if (has_sched_dispatch) + run_queue = blk_mq_do_dispatch_sched(hctx); + else + run_queue = blk_mq_do_dispatch_ctx(hctx); + } } else if (has_sched_dispatch) { run_queue = blk_mq_do_dispatch_sched(hctx); + } else if (q->mq_ops->get_budget) { + /* + * If we need to get budget before queuing request, we + * dequeue request one by one from sw queue for avoiding + * to mess up I/O merge when dispatch runs out of resource. + * + * TODO: get more budgets, and dequeue more requests in + * one time. 
+ */ + run_queue = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); diff --git a/block/blk-mq.c b/block/blk-mq.c index dcb467369999e..097ca3ece716e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -914,6 +914,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) } EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); +struct dispatch_rq_data { + struct blk_mq_hw_ctx *hctx; + struct request *rq; +}; + +static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, + void *data) +{ + struct dispatch_rq_data *dispatch_data = data; + struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + spin_lock(&ctx->lock); + if (unlikely(!list_empty(&ctx->rq_list))) { + dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + list_del_init(&dispatch_data->rq->queuelist); + if (list_empty(&ctx->rq_list)) + sbitmap_clear_bit(sb, bitnr); + } + spin_unlock(&ctx->lock); + + return !dispatch_data->rq; +} + +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start) +{ + unsigned off = start ? start->index_hw : 0; + struct dispatch_rq_data data = { + .hctx = hctx, + .rq = NULL, + }; + + __sbitmap_for_each_set(&hctx->ctx_map, off, + dispatch_rq_from_ctx, &data); + + return data.rq; +} + static inline unsigned int queued_to_index(unsigned int queued) { if (!queued) diff --git a/block/blk-mq.h b/block/blk-mq.h index e413b732374e6..522b420dedc07 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start); /* * Internal helpers for allocating/freeing the request map diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 901457df3d64d..e5e6becd57d3c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -30,6 +30,8 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; + struct blk_mq_ctx *dispatch_from; + struct blk_mq_ctx **ctxs; unsigned int nr_ctx; -- cgit v1.2.3 From aeec77629a4ac6f8c248f3a82e80d4170a881f22 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:31 +0800 Subject: scsi: allow passing in null rq to scsi_prep_state_check() In the following patch, we will implement scsi_get_budget() which need to call scsi_prep_state_check() when rq isn't dequeued yet. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9cf6a80fe2975..d159bb0857144 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1301,7 +1301,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) /* * If the devices is blocked we defer normal commands. */ - if (!(req->rq_flags & RQF_PREEMPT)) + if (req && !(req->rq_flags & RQF_PREEMPT)) ret = BLKPREP_DEFER; break; default: @@ -1310,7 +1310,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) * special commands. In particular any user initiated * command is not allowed. 
*/ - if (!(req->rq_flags & RQF_PREEMPT)) + if (req && !(req->rq_flags & RQF_PREEMPT)) ret = BLKPREP_KILL; break; } -- cgit v1.2.3 From 0df21c86bdbfd17dec9ab898312af9bfb74d5d86 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:32 +0800 Subject: scsi: implement .get_budget and .put_budget for blk-mq We need to tell blk-mq to reserve resources before queuing one request, so implement these two callbacks. Then blk-mq can avoid to dequeue request too early, and IO merging can be improved a lot. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 75 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d159bb0857144..6f10afaca25b2 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1946,25 +1946,32 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) { - struct request *req = bd->rq; - struct request_queue *q = req->q; + struct request_queue *q = hctx->queue; + struct scsi_device *sdev = q->queuedata; + struct Scsi_Host *shost = sdev->host; + + atomic_dec(&shost->host_busy); + if (scsi_target(sdev)->can_queue > 0) + atomic_dec(&scsi_target(sdev)->target_busy); + atomic_dec(&sdev->device_busy); + put_device(&sdev->sdev_gendev); +} + +static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); blk_status_t ret; - int reason; - ret = prep_to_mq(scsi_prep_state_check(sdev, req)); - if (ret != BLK_STS_OK) - goto out; + ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); + if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK) + return ret; - ret = BLK_STS_RESOURCE; if (!get_device(&sdev->sdev_gendev)) goto out; - if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; if (!scsi_target_queue_ready(shost, sdev)) @@ -1972,10 +1979,38 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (!scsi_host_queue_ready(q, shost, sdev)) goto out_dec_target_busy; + return BLK_STS_OK; + +out_dec_target_busy: + if (scsi_target(sdev)->can_queue > 0) + atomic_dec(&scsi_target(sdev)->target_busy); +out_dec_device_busy: + atomic_dec(&sdev->device_busy); +out_put_device: + put_device(&sdev->sdev_gendev); +out: + return BLK_STS_RESOURCE; +} + +static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *req = bd->rq; + struct request_queue *q = req->q; + struct scsi_device *sdev = q->queuedata; + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); + blk_status_t ret; + int reason; + + ret = prep_to_mq(scsi_prep_state_check(sdev, req)); + if (ret != BLK_STS_OK) + goto out_put_budget; + + ret = BLK_STS_RESOURCE; if (!(req->rq_flags & RQF_DONTPREP)) { ret = prep_to_mq(scsi_mq_prep_fn(req)); if (ret != BLK_STS_OK) - goto out_dec_host_busy; + goto out_put_budget; req->rq_flags |= RQF_DONTPREP; } else { blk_mq_start_request(req); @@ -1993,21 +2028,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (reason) { scsi_set_blocked(cmd, reason); ret = BLK_STS_RESOURCE; - goto out_dec_host_busy; + goto out_put_budget; } return BLK_STS_OK; -out_dec_host_busy: - atomic_dec(&shost->host_busy); 
-out_dec_target_busy: - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); -out_dec_device_busy: - atomic_dec(&sdev->device_busy); -out_put_device: - put_device(&sdev->sdev_gendev); -out: +out_put_budget: + scsi_mq_put_budget(hctx); switch (ret) { case BLK_STS_OK: break; @@ -2211,6 +2238,8 @@ struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev) } static const struct blk_mq_ops scsi_mq_ops = { + .get_budget = scsi_mq_get_budget, + .put_budget = scsi_mq_put_budget, .queue_rq = scsi_queue_rq, .complete = scsi_softirq_done, .timeout = scsi_timeout, -- cgit v1.2.3 From 358a3a6bccb74da9d63a26b2dd5f09f1e9970e0b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Oct 2017 12:43:29 +0800 Subject: blk-mq: don't handle TAG_SHARED in restart Now restart is used in the following cases, and TAG_SHARED is for SCSI only. 1) .get_budget() returns BLK_STS_RESOURCE - if a target/host level resource isn't satisfied, this SCSI device is added to shost->starved_list, and the whole queue is rerun (via SCSI's built-in RESTART) in scsi_end_request() after any request initiated from this host/target completes. Note that host level resources can't be an issue for blk-mq at all. - the same is true if a queue level resource isn't satisfied. - if there is no outstanding request on this queue, then SCSI's RESTART can't work (and neither can blk-mq's), so the queue is run after SCSI_QUEUE_DELAY, and finally all starved sdevs are handled by SCSI's RESTART when the request finishes 2) scsi_dispatch_cmd() returns BLK_STS_RESOURCE - if there is no in-progress request on this queue, the queue is run after SCSI_QUEUE_DELAY - otherwise, SCSI's RESTART covers the rerun. 3) blk_mq_get_driver_tag() failed - BLK_MQ_S_TAG_WAITING covers the cross-queue RESTART for driver tag allocation. In short, SCSI's built-in RESTART is enough to cover the queue rerun, and we don't need to pay special attention to TAG_SHARED wrt. restart. In my test on scsi_debug (8 LUNs), this patch improves IOPS by 20% ~ 30% when running I/O on these 8 LUNs concurrently. Also, Roman Pen reported that the current RESTART is very expensive, especially when there are lots of LUNs attached to one host; in his test, RESTART cut IOPS in half.
Fixes: https://marc.info/?l=linux-kernel&m=150832216727524&w=2 Fixes: 6d8c6c0f97ad ("blk-mq: Restart a single queue if tag sets are shared") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 78 +++------------------------------------------------- 1 file changed, 4 insertions(+), 74 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index df8581bb0a370..daab27feb6539 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,25 +68,17 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return false; - - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; + return; - if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); - } else - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); - return true; + return; } - - return false; } /* return true if hctx need to run again */ @@ -385,68 +377,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } -/** - * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list - * @pos: loop cursor. - * @skip: the list element that will not be examined. Iteration starts at - * @skip->next. - * @head: head of the list to examine. This list must have at least one - * element, namely @skip. - * @member: name of the list_head structure within typeof(*pos). - */ -#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ - for ((pos) = (skip); \ - (pos = (pos)->member.next != (head) ? list_entry_rcu( \ - (pos)->member.next, typeof(*pos), member) : \ - list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ - (pos) != (skip); ) - -/* - * Called after a driver tag has been freed to check whether a hctx needs to - * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware - * queues in a round-robin fashion if the tag set of @hctx is shared with other - * hardware queues. - */ -void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) -{ - struct blk_mq_tags *const tags = hctx->tags; - struct blk_mq_tag_set *const set = hctx->queue->tag_set; - struct request_queue *const queue = hctx->queue, *q; - struct blk_mq_hw_ctx *hctx2; - unsigned int i, j; - - if (set->flags & BLK_MQ_F_TAG_SHARED) { - /* - * If this is 0, then we know that no hardware queues - * have RESTART marked. We're done. - */ - if (!atomic_read(&queue->shared_hctx_restart)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu_rr(q, queue, &set->tag_list, - tag_set_list) { - queue_for_each_hw_ctx(q, hctx2, i) - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - goto done; - } - j = hctx->queue_num + 1; - for (i = 0; i < queue->nr_hw_queues; i++, j++) { - if (j == queue->nr_hw_queues) - j = 0; - hctx2 = queue->queue_hw_ctx[j]; - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - break; - } -done: - rcu_read_unlock(); - } else { - blk_mq_sched_restart_hctx(hctx); - } -} - /* * Add flush/fua to the queue. If we fail getting a driver tag, then * punt to the requeue list. 
Requeue will re-invoke us from a context -- cgit v1.2.3 From 1f460b63d4b37f504d8d0affc2cd492eb005ea97 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Oct 2017 12:43:30 +0800 Subject: blk-mq: don't restart queue when .get_budget returns BLK_STS_RESOURCE SCSI restarts its queue in scsi_end_request() automatically, so we don't need to handle this case in blk-mq. In particular, no request will have been dequeued in this case, so we needn't worry about an IO hang caused by restart vs. dispatch. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 45 ++++++++++++++++++++------------------------- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 8 ++------ 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index daab27feb6539..7775f6b12fa94 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -81,8 +81,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) } } -/* return true if hctx need to run again */ -static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + */ +static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; @@ -98,7 +102,7 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) ret = blk_mq_get_dispatch_budget(hctx); if (ret == BLK_STS_RESOURCE) - return true; + break; rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) { @@ -116,8 +120,6 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) */ list_add(&rq->queuelist, &rq_list); } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); - - return false; } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, @@ -131,8 +133,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, return hctx->ctxs[idx]; } -/* return true if hctx need to run again */ -static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ */ +static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); @@ -147,7 +153,7 @@ static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) ret = blk_mq_get_dispatch_budget(hctx); if (ret == BLK_STS_RESOURCE) - return true; + break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { @@ -171,22 +177,19 @@ static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); WRITE_ONCE(hctx->dispatch_from, ctx); - - return false; } /* return true if hw queue need to be run again */ -bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; LIST_HEAD(rq_list); - bool run_queue = false; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) - return false; + return; hctx->run++; @@ -218,12 +221,12 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) blk_mq_sched_mark_restart_hctx(hctx); if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { if (has_sched_dispatch) - run_queue = blk_mq_do_dispatch_sched(hctx); + blk_mq_do_dispatch_sched(hctx); else - run_queue = blk_mq_do_dispatch_ctx(hctx); + blk_mq_do_dispatch_ctx(hctx); } } else if (has_sched_dispatch) { - run_queue = blk_mq_do_dispatch_sched(hctx); + blk_mq_do_dispatch_sched(hctx); } else if (q->mq_ops->get_budget) { /* * If we need to get budget before queuing request, we @@ -233,19 +236,11 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * TODO: get more budgets, and dequeue more requests in * one time. 
*/ - run_queue = blk_mq_do_dispatch_ctx(hctx); + blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); } - - if (run_queue && !blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) { - blk_mq_sched_mark_restart_hctx(hctx); - return true; - } - - return false; } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 2434061cc5b75..9267d0b7c1978 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -22,7 +22,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); -bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); diff --git a/block/blk-mq.c b/block/blk-mq.c index 097ca3ece716e..e4d2490f4e7ee 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1226,7 +1226,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; - bool run_queue; /* * We should be running this queue from one of the CPUs that @@ -1243,18 +1242,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - run_queue = blk_mq_sched_dispatch_requests(hctx); + blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { might_sleep(); srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - run_queue = blk_mq_sched_dispatch_requests(hctx); + blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } - - if (run_queue) - blk_mq_run_hw_queue(hctx, true); } /* -- cgit v1.2.3 From 2a750166a5bee7fda186c12269e37ba52b26c017 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Oct 2017 09:02:19 -0700 Subject: block: Rework drivers/cdrom/Makefile Instead of referring from inside drivers/cdrom/Makefile to all the drivers that use this driver, let these drivers select the cdrom driver. This change makes the cdrom build code follow the approach that is used for most other drivers, namely refer from the higher layers to the lower layer instead of from the lower layer to the higher layers. Reviewed-by: Hannes Reinecke Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 12 ++++++++++++ drivers/block/paride/Kconfig | 1 + drivers/cdrom/Makefile | 15 ++------------- drivers/ide/Kconfig | 1 + drivers/scsi/Kconfig | 1 + 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 4a438b8abe27a..b998b3c9ea860 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -67,9 +67,20 @@ config AMIGA_Z2RAM To compile this driver as a module, choose M here: the module will be called z2ram. +config CDROM + tristate "CD-ROM driver" + help + A CD-ROM is a pre-pressed optical compact disc which contains + data. The name is an acronym which stands for "Compact Disc + Read-Only Memory". This driver implements functionality that is + shared by all CD-ROM drivers, e.g. the IDE CD-ROM driver and the + SCSI CD-ROM driver. For a list of the ioctls implemented by this + driver, see also Documentation/ioctl/cdrom.txt. 
+ config GDROM tristate "SEGA Dreamcast GD-ROM drive" depends on SH_DREAMCAST + select CDROM select BLK_SCSI_REQUEST # only for the generic cdrom code help A standard SEGA Dreamcast comes with a modified CD ROM drive called a @@ -347,6 +358,7 @@ config BLK_DEV_RAM_DAX config CDROM_PKTCDVD tristate "Packet writing on CD/DVD media (DEPRECATED)" depends on !UML + select CDROM select BLK_SCSI_REQUEST help Note: This driver is deprecated and will be removed from the diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig index 3a15247942e41..1d5057f5080ba 100644 --- a/drivers/block/paride/Kconfig +++ b/drivers/block/paride/Kconfig @@ -25,6 +25,7 @@ config PARIDE_PD config PARIDE_PCD tristate "Parallel port ATAPI CD-ROMs" depends on PARIDE + select CDROM select BLK_SCSI_REQUEST # only for the generic cdrom code ---help--- This option enables the high-level driver for ATAPI CD-ROM devices diff --git a/drivers/cdrom/Makefile b/drivers/cdrom/Makefile index 8ffde4f8ab9ae..7f3f43cc22574 100644 --- a/drivers/cdrom/Makefile +++ b/drivers/cdrom/Makefile @@ -1,13 +1,2 @@ -# Makefile for the kernel cdrom device drivers. -# -# 30 Jan 1998, Michael Elizabeth Chastain, -# Rewritten to use lists instead of if-statements. - -# Each configuration option enables a list of files. - -obj-$(CONFIG_BLK_DEV_IDECD) += cdrom.o -obj-$(CONFIG_BLK_DEV_SR) += cdrom.o -obj-$(CONFIG_PARIDE_PCD) += cdrom.o -obj-$(CONFIG_CDROM_PKTCDVD) += cdrom.o - -obj-$(CONFIG_GDROM) += gdrom.o cdrom.o +obj-$(CONFIG_CDROM) += cdrom.o +obj-$(CONFIG_GDROM) += gdrom.o diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index c99a25c075bcb..7b92c591e4c11 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -118,6 +118,7 @@ config BLK_DEV_DELKIN config BLK_DEV_IDECD tristate "Include IDE/ATAPI CDROM support" select IDE_ATAPI + select CDROM ---help--- If you have a CD-ROM drive using the ATAPI protocol, say Y. ATAPI is a newer protocol used by IDE CD-ROM and TAPE drives, similar to the diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 41366339b9501..cca0b03c2eadf 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -131,6 +131,7 @@ config CHR_DEV_OSST config BLK_DEV_SR tristate "SCSI CDROM support" depends on SCSI + select CDROM ---help--- If you want to use a CD or DVD drive attached to your computer by SCSI, FireWire, USB or ATAPI, say Y and read the SCSI-HOWTO -- cgit v1.2.3 From 71c691fd06cc2625966620c93ce21bdcce32ed95 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 28 Sep 2017 09:56:31 -0700 Subject: nvme-fc: avoid workqueue flush stalls There's no need to wait for the full nvme_wq, which is now shared, to flush. flush only the delete_work item. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b600c07803cf3..50cc17e99475e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2693,7 +2693,7 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) nvme_get_ctrl(&ctrl->ctrl); ret = __nvme_fc_del_ctrl(ctrl); if (!ret) - flush_workqueue(nvme_wq); + flush_work(&ctrl->delete_work); nvme_put_ctrl(&ctrl->ctrl); return ret; -- cgit v1.2.3 From 29c09648734b94e42802f021658a47361169403b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:28 +0200 Subject: nvme-fc: merge __nvme_fc_schedule_delete_work into __nvme_fc_del_ctrl No need to have two functions doing the same thing. 
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 50cc17e99475e..827e9efbe5564 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2663,22 +2663,14 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) nvme_put_ctrl(&ctrl->ctrl); } -static bool -__nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return true; - - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return true; - - return false; -} - static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl) { - return __nvme_fc_schedule_delete_work(ctrl) ? -EBUSY : 0; + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + if (!queue_work(nvme_wq, &ctrl->delete_work)) + return -EBUSY; + return 0; } /* @@ -2724,7 +2716,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.nr_reconnects); - WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); + WARN_ON(__nvme_fc_del_ctrl(ctrl)); } } -- cgit v1.2.3 From c5017e85705bfea721732e153305d1988ff965c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:29 +0200 Subject: nvme: move controller deletion to common code Move the ->delete_work and the associated helpers to common code instead of duplicating them in every driver. This also adds the missing reference get/put for the loop driver. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 38 ++++++++++++++++++++++++++++++++++- drivers/nvme/host/fabrics.c | 2 +- drivers/nvme/host/fc.c | 42 +++++---------------------------------- drivers/nvme/host/nvme.h | 5 ++++- drivers/nvme/host/rdma.c | 45 ++++++------------------------------------ drivers/nvme/target/loop.c | 48 +++++++++------------------------------------ 6 files changed, 62 insertions(+), 118 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index df525ab42fcd1..d835ac05bbf7b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -97,6 +97,41 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) return ret; } +static void nvme_delete_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, delete_work); + + ctrl->ops->delete_ctrl(ctrl); +} + +int nvme_delete_ctrl(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + if (!queue_work(nvme_wq, &ctrl->delete_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_delete_ctrl); + +int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +{ + int ret = 0; + + /* + * Keep a reference until the work is flushed since ->delete_ctrl + * can free the controller. 
+ */ + nvme_get_ctrl(ctrl); + ret = nvme_delete_ctrl(ctrl); + if (!ret) + flush_work(&ctrl->delete_work); + nvme_put_ctrl(ctrl); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); + static blk_status_t nvme_error_status(struct request *req) { switch (nvme_req(req)->status & 0x7ff) { @@ -2122,7 +2157,7 @@ static ssize_t nvme_sysfs_delete(struct device *dev, struct nvme_ctrl *ctrl = dev_get_drvdata(dev); if (device_remove_file_self(dev, attr)) - ctrl->ops->delete_ctrl(ctrl); + nvme_delete_ctrl_sync(ctrl); return count; } static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); @@ -2702,6 +2737,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->scan_work, nvme_scan_work); INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); + INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); if (ret < 0) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 4a83137b02684..a4967de5bb255 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -887,7 +887,7 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) "controller returned incorrect NQN: \"%s\".\n", ctrl->subnqn); up_read(&nvmf_transports_rwsem); - ctrl->ops->delete_ctrl(ctrl); + nvme_delete_ctrl_sync(ctrl); return ERR_PTR(-EINVAL); } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 827e9efbe5564..a7bdb17de29d4 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -157,7 +157,6 @@ struct nvme_fc_ctrl { struct blk_mq_tag_set admin_tag_set; struct blk_mq_tag_set tag_set; - struct work_struct delete_work; struct delayed_work connect_work; struct kref ref; @@ -223,7 +222,6 @@ static struct device *fc_udev_device; /* *********************** FC-NVME Port Management ************************ */ -static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *); static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, struct nvme_fc_queue *, unsigned int); @@ -662,7 +660,7 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) /* tear down all associations to the remote port */ list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) - __nvme_fc_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); spin_unlock_irqrestore(&rport->lock, flags); @@ -2636,10 +2634,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) } static void -nvme_fc_delete_ctrl_work(struct work_struct *work) +nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { - struct nvme_fc_ctrl *ctrl = - container_of(work, struct nvme_fc_ctrl, delete_work); + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); @@ -2663,34 +2660,6 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) nvme_put_ctrl(&ctrl->ctrl); } -static int -__nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return -EBUSY; - return 0; -} - -/* - * Request from nvme core layer to delete the controller - */ -static int -nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - int ret; - - nvme_get_ctrl(&ctrl->ctrl); - ret = __nvme_fc_del_ctrl(ctrl); - if (!ret) - flush_work(&ctrl->delete_work); - nvme_put_ctrl(&ctrl->ctrl); - - return ret; -} - static void nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) 
{ @@ -2716,7 +2685,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.nr_reconnects); - WARN_ON(__nvme_fc_del_ctrl(ctrl)); + WARN_ON(nvme_delete_ctrl(&ctrl->ctrl)); } } @@ -2748,7 +2717,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .reg_write32 = nvmf_reg_write32, .free_ctrl = nvme_fc_nvme_ctrl_freed, .submit_async_event = nvme_fc_submit_async_event, - .delete_ctrl = nvme_fc_del_nvme_ctrl, + .delete_ctrl = nvme_fc_delete_ctrl, .get_address = nvmf_get_address, .reinit_request = nvme_fc_reinit_request, }; @@ -2851,7 +2820,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, get_device(ctrl->dev); kref_init(&ctrl->ref); - INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); spin_lock_init(&ctrl->lock); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1bb2bc165e543..35d9cee515f1b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -138,6 +138,7 @@ struct nvme_ctrl { struct cdev cdev; struct ida ns_ida; struct work_struct reset_work; + struct work_struct delete_work; struct opal_dev *opal_dev; @@ -236,7 +237,7 @@ struct nvme_ctrl_ops { int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); - int (*delete_ctrl)(struct nvme_ctrl *ctrl); + void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); int (*reinit_request)(void *data, struct request *rq); }; @@ -339,6 +340,8 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_delete_ctrl(struct nvme_ctrl *ctrl); +int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 32b0a9ef26e67..5175b465997d0 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -105,7 +105,6 @@ struct nvme_rdma_ctrl { /* other member variables */ struct blk_mq_tag_set tag_set; - struct work_struct delete_work; struct work_struct err_work; struct nvme_rdma_qe async_event_sqe; @@ -913,7 +912,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_info(ctrl->ctrl.device, "Removing controller...\n"); - queue_work(nvme_wq, &ctrl->delete_work); + queue_work(nvme_wq, &ctrl->ctrl.delete_work); } } @@ -1764,41 +1763,10 @@ static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) nvme_put_ctrl(&ctrl->ctrl); } -static void nvme_rdma_del_ctrl_work(struct work_struct *work) +static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { - struct nvme_rdma_ctrl *ctrl = container_of(work, - struct nvme_rdma_ctrl, delete_work); - - nvme_stop_ctrl(&ctrl->ctrl); - nvme_rdma_remove_ctrl(ctrl); -} - -static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return -EBUSY; - - return 0; -} - -static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) -{ - struct 
nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - int ret = 0; - - /* - * Keep a reference until all work is flushed since - * __nvme_rdma_del_ctrl can free the ctrl mem - */ - nvme_get_ctrl(&ctrl->ctrl); - ret = __nvme_rdma_del_ctrl(ctrl); - if (!ret) - flush_work(&ctrl->delete_work); - nvme_put_ctrl(&ctrl->ctrl); - return ret; + nvme_stop_ctrl(ctrl); + nvme_rdma_remove_ctrl(to_rdma_ctrl(ctrl)); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) @@ -1846,7 +1814,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reg_write32 = nvmf_reg_write32, .free_ctrl = nvme_rdma_free_ctrl, .submit_async_event = nvme_rdma_submit_async_event, - .delete_ctrl = nvme_rdma_del_ctrl, + .delete_ctrl = nvme_rdma_delete_ctrl, .get_address = nvmf_get_address, .reinit_request = nvme_rdma_reinit_request, }; @@ -1977,7 +1945,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, INIT_DELAYED_WORK(&ctrl->reconnect_work, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); - INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ @@ -2081,7 +2048,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) dev_info(ctrl->ctrl.device, "Removing ctrl: NQN \"%s\", addr %pISp\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - __nvme_rdma_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); } mutex_unlock(&nvme_rdma_ctrl_mutex); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f83e925fe64a6..7f9f3fc3fb2a8 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -53,7 +53,6 @@ struct nvme_loop_ctrl { struct nvme_ctrl ctrl; struct nvmet_ctrl *target_ctrl; - struct work_struct delete_work; }; static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) @@ -439,41 +438,13 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_loop_destroy_admin_queue(ctrl); } -static void nvme_loop_del_ctrl_work(struct work_struct *work) +static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl) { - struct nvme_loop_ctrl *ctrl = container_of(work, - struct nvme_loop_ctrl, delete_work); - - nvme_stop_ctrl(&ctrl->ctrl); - nvme_remove_namespaces(&ctrl->ctrl); - nvme_loop_shutdown_ctrl(ctrl); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); -} - -static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return -EBUSY; - - return 0; -} - -static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); - int ret; - - ret = __nvme_loop_del_ctrl(ctrl); - if (ret) - return ret; - - flush_work(&ctrl->delete_work); - - return 0; + nvme_stop_ctrl(ctrl); + nvme_remove_namespaces(ctrl); + nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl)); + nvme_uninit_ctrl(ctrl); + nvme_put_ctrl(ctrl); } static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) @@ -483,7 +454,7 @@ static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) mutex_lock(&nvme_loop_ctrl_mutex); list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { if (ctrl->ctrl.cntlid == nctrl->cntlid) - __nvme_loop_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); } mutex_unlock(&nvme_loop_ctrl_mutex); } @@ -539,7 +510,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .reg_write32 = 
nvmf_reg_write32, .free_ctrl = nvme_loop_free_ctrl, .submit_async_event = nvme_loop_submit_async_event, - .delete_ctrl = nvme_loop_del_ctrl, + .delete_ctrl = nvme_loop_delete_ctrl_host, }; static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) @@ -601,7 +572,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, ctrl->ctrl.opts = opts; INIT_LIST_HEAD(&ctrl->list); - INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, @@ -731,7 +701,7 @@ static void __exit nvme_loop_cleanup_module(void) mutex_lock(&nvme_loop_ctrl_mutex); list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) - __nvme_loop_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); mutex_unlock(&nvme_loop_ctrl_mutex); flush_workqueue(nvme_wq); -- cgit v1.2.3 From e9bc25874c0bde47b65c58ccd01e339a603a7f40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:30 +0200 Subject: nvme-rdma: remove nvme_rdma_remove_ctrl It is only used in two places, and some of the work done by it will be taken into common code soon. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5175b465997d0..a3521b852ea85 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1755,18 +1755,13 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_rdma_destroy_admin_queue(ctrl, shutdown); } -static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) -{ - nvme_remove_namespaces(&ctrl->ctrl); - nvme_rdma_shutdown_ctrl(ctrl, true); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); -} - static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { nvme_stop_ctrl(ctrl); - nvme_rdma_remove_ctrl(to_rdma_ctrl(ctrl)); + nvme_remove_namespaces(ctrl); + nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); + nvme_uninit_ctrl(ctrl); + nvme_put_ctrl(ctrl); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) @@ -1802,7 +1797,10 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) out_fail: dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - nvme_rdma_remove_ctrl(ctrl); + nvme_remove_namespaces(&ctrl->ctrl); + nvme_rdma_shutdown_ctrl(ctrl, true); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { -- cgit v1.2.3 From 6cd53d14aaa006b5543f06fbf5e1680ce61c6c6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:31 +0200 Subject: nvme: consolidate common code from ->reset_work No change in behavior except that the FC code cancels two work items a little later now. 
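For reference, the consolidated delete path in core.c ends up looking roughly like this (a sketch assembled from the hunks below, not a verbatim copy of the file):

static void nvme_delete_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, delete_work);

	nvme_stop_ctrl(ctrl);
	nvme_remove_namespaces(ctrl);
	ctrl->ops->delete_ctrl(ctrl);	/* transport-specific teardown only */
	nvme_uninit_ctrl(ctrl);
	nvme_put_ctrl(ctrl);
}

The per-transport ->delete_ctrl callbacks are left with only the pieces that genuinely differ per transport.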
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 ++++ drivers/nvme/host/fc.c | 13 ------------- drivers/nvme/host/rdma.c | 4 ---- drivers/nvme/target/loop.c | 4 ---- 4 files changed, 4 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d835ac05bbf7b..4fa748c9a3f6d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,7 +102,11 @@ static void nvme_delete_ctrl_work(struct work_struct *work) struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, delete_work); + nvme_stop_ctrl(ctrl); + nvme_remove_namespaces(ctrl); ctrl->ops->delete_ctrl(ctrl); + nvme_uninit_ctrl(ctrl); + nvme_put_ctrl(ctrl); } int nvme_delete_ctrl(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index a7bdb17de29d4..e447b532b9ee8 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2640,24 +2640,11 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); - nvme_stop_ctrl(&ctrl->ctrl); - nvme_remove_namespaces(&ctrl->ctrl); /* * kill the association on the link side. this will block * waiting for io to terminate */ nvme_fc_delete_association(ctrl); - - /* - * tear down the controller - * After the last reference on the nvme ctrl is removed, - * the transport nvme_fc_nvme_ctrl_freed() callback will be - * invoked. From there, the transport will tear down it's - * logical queues and association. - */ - nvme_uninit_ctrl(&ctrl->ctrl); - - nvme_put_ctrl(&ctrl->ctrl); } static void diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a3521b852ea85..ed6e05018a922 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1757,11 +1757,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { - nvme_stop_ctrl(ctrl); - nvme_remove_namespaces(ctrl); nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); - nvme_uninit_ctrl(ctrl); - nvme_put_ctrl(ctrl); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 7f9f3fc3fb2a8..bc95c6ed531af 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -440,11 +440,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl) { - nvme_stop_ctrl(ctrl); - nvme_remove_namespaces(ctrl); nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl)); - nvme_uninit_ctrl(ctrl); - nvme_put_ctrl(ctrl); } static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) -- cgit v1.2.3 From 12fa1304badde26c5cf495d5af48df22a31c3a52 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 29 Oct 2017 14:21:01 +0200 Subject: nvme-rdma: reuse nvme_delete_ctrl when reconnect attempts expire instead of just queueing delete work Reported-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ed6e05018a922..03644ecf68d29 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -912,7 +912,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_info(ctrl->ctrl.device, "Removing 
controller...\n"); - queue_work(nvme_wq, &ctrl->ctrl.delete_work); + nvme_delete_ctrl(&ctrl->ctrl); } } -- cgit v1.2.3 From 4054637c9b4fbe9feef0cf6f2516ef00d8053560 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 29 Oct 2017 14:21:02 +0200 Subject: nvme: flush reset_work before safely continuing with delete operation Prevent racing controller reset and delete flows. reset_work must not ever self-requeue so flushing it suffices. Reported-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/fc.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4fa748c9a3f6d..003314eb6341a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,6 +102,7 @@ static void nvme_delete_ctrl_work(struct work_struct *work) struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, delete_work); + flush_work(&ctrl->reset_work); nvme_stop_ctrl(ctrl); nvme_remove_namespaces(ctrl); ctrl->ops->delete_ctrl(ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e447b532b9ee8..6a025a8d8c32c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2638,7 +2638,6 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); /* * kill the association on the link side. this will block -- cgit v1.2.3 From 44c6ec77e12c387aaba420b30a54b94966f0d9e8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:14 -0700 Subject: nvme-fc: change ctlr state assignments during reset/reconnect Clean up some of the controller state checks and add the RESETTING->RECONNECTING state transition. Specifically: - the movement of the RESETTING state change and schedule of reset_work to core doesn't work with nvme_fc_error_recovery setting state to RECONNECTING before attempting to reset. Remove the state change as the reset request does it. - In the rare cases where an error occurs right as we're transitioning to LIVE, defer the controller start actions. - In error handling on teardown of associations while performing initial controller creation - avoid quiesce calls on the admin_q. They are unneeded. - Add the RESETTING->RECONNECTING transition in the reset handler.
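As an illustration of the deferred start, the tail of nvme_fc_create_association() reduces to roughly the following after this patch (a sketch derived from the hunks below):

	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);

	ctrl->ctrl.nr_reconnects = 0;

	if (changed)
		nvme_start_ctrl(&ctrl->ctrl);	/* start only if we really made it to LIVE */

	return 0;	/* Success */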
Signed-off-by: James Smart Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6a025a8d8c32c..e37c69f7921d3 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1884,13 +1884,6 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: resetting controller\n", ctrl->cnum); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { - dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: error_recovery: Couldn't change state " - "to RECONNECTING\n", ctrl->cnum); - return; - } - nvme_reset_ctrl(&ctrl->ctrl); } @@ -2528,11 +2521,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); ctrl->ctrl.nr_reconnects = 0; - nvme_start_ctrl(&ctrl->ctrl); + if (changed) + nvme_start_ctrl(&ctrl->ctrl); return 0; /* Success */ @@ -2600,7 +2593,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) * use blk_mq_tagset_busy_itr() and the transport routine to * terminate the exchanges. */ - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + if (ctrl->ctrl.state != NVME_CTRL_NEW) + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -2649,12 +2643,8 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) static void nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) { - /* If we are resetting/deleting then do nothing */ - if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { - WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || - ctrl->ctrl.state == NVME_CTRL_LIVE); + if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) return; - } dev_info(ctrl->ctrl.device, "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", @@ -2683,9 +2673,17 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) int ret; nvme_stop_ctrl(&ctrl->ctrl); + /* will block will waiting for io to terminate */ nvme_fc_delete_association(ctrl); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: error_recovery: Couldn't change state " + "to RECONNECTING\n", ctrl->cnum); + return; + } + ret = nvme_fc_create_association(ctrl); if (ret) nvme_fc_reconnect_or_delete(ctrl, ret); -- cgit v1.2.3 From ac7fe82b6fcf77e757e88005c33b8147c1b7b73f Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:15 -0700 Subject: nvme-fc: add a dev_loss_tmo field to the remoteport Add a dev_loss_tmo value, paralleling the SCSI FC transport, for device connectivity loss. The transport initializes the value in the nvme_fc_register_remoteport() call. If the value is not set, a default of 60s is set. Add a new routine to the api, nvme_fc_set_remoteport_devloss() routine, which allows the lldd to dynamically update the value on an existing remoteport. 
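A minimal sketch of how an LLDD would use the new field and routine follows; the wwnn/wwpn/nport_id variables and the 30 and 120 second values are made-up placeholders, not part of this patch:

	struct nvme_fc_port_info pinfo = {
		.node_name	= wwnn,
		.port_name	= wwpn,
		.port_id	= nport_id,
		.dev_loss_tmo	= 30,	/* 0 here selects the 60s default */
	};

	ret = nvme_fc_register_remoteport(localport, &pinfo, &remoteport);
	...
	/* later, adjust the timeout on the existing remoteport */
	ret = nvme_fc_set_remoteport_devloss(remoteport, 120);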
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 31 +++++++++++++++++++++++++++++++ include/linux/nvme-fc-driver.h | 11 +++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e37c69f7921d3..25479d3031fad 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -45,6 +45,8 @@ enum nvme_fc_queue_flags { #define NVMEFC_QUEUE_DELAY 3 /* ms units */ +#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */ + struct nvme_fc_queue { struct nvme_fc_ctrl *ctrl; struct device *dev; @@ -585,6 +587,11 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, newrec->remoteport.port_id = pinfo->port_id; newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; newrec->remoteport.port_num = idx; + /* a registration value of dev_loss_tmo=0 results in the default */ + if (pinfo->dev_loss_tmo) + newrec->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; + else + newrec->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; spin_lock_irqsave(&nvme_fc_lock, flags); list_add_tail(&newrec->endp_list, &lport->endp_list); @@ -688,6 +695,30 @@ nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) } EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); +int +nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr, + u32 dev_loss_tmo) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + + if (portptr->port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + return -EINVAL; + } + + /* a dev_loss_tmo of 0 (immediate) is allowed to be set */ + rport->remoteport.dev_loss_tmo = dev_loss_tmo; + + spin_unlock_irqrestore(&rport->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss); + /* *********************** FC-NVME DMA Handling **************************** */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 2be4db3539379..496ff759f84c6 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -40,6 +40,8 @@ * @node_name: FC WWNN for the port * @port_name: FC WWPN for the port * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. Used only on a remoteport. * * Initialization values for dynamic port fields: * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must @@ -50,6 +52,7 @@ struct nvme_fc_port_info { u64 port_name; u32 port_role; u32 port_id; + u32 dev_loss_tmo; }; @@ -200,6 +203,9 @@ enum nvme_fc_obj_state { * The length of the buffer corresponds to the local_priv_sz * value specified in the nvme_fc_port_template supplied by * the LLDD. + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. To modify, lldd must call + * nvme_fc_set_remoteport_devloss(). * * Fields with dynamic values. Values may change base on link state. LLDD * may reference fields directly to change them. 
Initialized by the @@ -257,10 +263,9 @@ struct nvme_fc_remote_port { u32 port_role; u64 node_name; u64 port_name; - struct nvme_fc_local_port *localport; - void *private; + u32 dev_loss_tmo; /* dynamic fields */ u32 port_id; @@ -446,6 +451,8 @@ int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); +int nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *remoteport, + u32 dev_loss_tmo); /* -- cgit v1.2.3 From 96e24801056467c6483f68ef71d642fb925c3100 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:16 -0700 Subject: nvme-fc: check connectivity before initiating reconnects Check remoteport connectivity before initiating reconnects Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 25479d3031fad..b64cc10024520 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -808,7 +808,6 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, dma_unmap_sg(dev, sg, nents, dir); } - /* *********************** FC-NVME LS Handling **************************** */ static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *); @@ -2462,6 +2461,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ++ctrl->ctrl.nr_reconnects; + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return -ENODEV; + /* * Create the admin queue */ @@ -2682,6 +2684,10 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) ctrl->cnum, status); if (nvmf_should_reconnect(&ctrl->ctrl)) { + /* Only schedule the reconnect if the remote port is online */ + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return; + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); @@ -2715,12 +2721,15 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) return; } - ret = nvme_fc_create_association(ctrl); - if (ret) - nvme_fc_reconnect_or_delete(ctrl, ret); - else - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { + ret = nvme_fc_create_association(ctrl); + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller reset complete\n", + ctrl->cnum); + } } static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { -- cgit v1.2.3 From 3cec7f9de448131778a1428100621fd371db75f4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:13 -0700 Subject: nvme: allow controller RESETTING to RECONNECTING transition Transport will typically transition from LIVE to RESETTING when initially performing a reset or recovering from an error. Adding this transition allows a transport to transition to RECONNECTING when it checks/waits for connectivity then creates new transport connections and reinits the controller. 
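In nvme_change_ctrl_state() this amounts to accepting one more old state when the requested new state is RECONNECTING; the resulting switch arm reads roughly as follows (sketch, see the hunk below):

	case NVME_CTRL_RECONNECTING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:	/* newly allowed transition */
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;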
Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 003314eb6341a..07b9f4e1c283d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -241,6 +241,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_RECONNECTING: switch (old_state) { case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: changed = true; /* FALLTHRU */ default: -- cgit v1.2.3 From 2b632970da4f288e6dbdc826c34fbf74f40ec94c Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:17 -0700 Subject: nvme-fc: add dev_loss_tmo timeout and remoteport resume support When a remoteport is unregistered (connectivity lost), the following actions are taken: - the remoteport is marked DELETED - the time when dev_loss_tmo would expire is set in the remoteport - all controllers on the remoteport are reset. After a controller resets, it will stall in a RECONNECTING state waiting for one of the following: - the controller will continue to attempt reconnect per max_retries and reconnect_delay. As there is no remoteport connectivity, the reconnect attempt will immediately fail. If max reconnects has not been reached, a new reconnect_delay timer will be scheduled. If the current time plus another reconnect_delay exceeds when dev_loss_tmo expires on the remote port, then the reconnect_delay will be shortened to schedule no later than when dev_loss_tmo expires. If max reconnect attempts are reached (e.g. ctrl_loss_tmo reached) or dev_loss_tmo is exceeded without connectivity, the controller is deleted. - the remoteport is re-registered prior to dev_loss_tmo expiring. The resume of the remoteport will immediately attempt to reconnect each of its suspended controllers. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke [hch: updated to use nvme_delete_ctrl] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 278 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 239 insertions(+), 39 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b64cc10024520..113c30be7276b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -138,6 +138,7 @@ struct nvme_fc_rport { struct nvme_fc_lport *lport; spinlock_t lock; struct kref ref; + unsigned long dev_loss_end; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ enum nvme_fcctrl_flags { @@ -528,6 +529,102 @@ nvme_fc_rport_get(struct nvme_fc_rport *rport) return kref_get_unless_zero(&rport->ref); } +static void +nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl) +{ + switch (ctrl->ctrl.state) { + case NVME_CTRL_NEW: + case NVME_CTRL_RECONNECTING: + /* + * As all reconnects were suppressed, schedule a + * connect. + */ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: connectivity re-established. " + "Attempting reconnect\n", ctrl->cnum); + + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0); + break; + + case NVME_CTRL_RESETTING: + /* + * Controller is already in the process of terminating the + * association. No need to do anything further. The reconnect + * step will naturally occur after the reset completes.
+ */ + break; + + default: + /* no action to take - let it delete */ + break; + } +} + +static struct nvme_fc_rport * +nvme_fc_attach_to_suspended_rport(struct nvme_fc_lport *lport, + struct nvme_fc_port_info *pinfo) +{ + struct nvme_fc_rport *rport; + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + list_for_each_entry(rport, &lport->endp_list, endp_list) { + if (rport->remoteport.node_name != pinfo->node_name || + rport->remoteport.port_name != pinfo->port_name) + continue; + + if (!nvme_fc_rport_get(rport)) { + rport = ERR_PTR(-ENOLCK); + goto out_done; + } + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + spin_lock_irqsave(&rport->lock, flags); + + /* has it been unregistered */ + if (rport->remoteport.port_state != FC_OBJSTATE_DELETED) { + /* means lldd called us twice */ + spin_unlock_irqrestore(&rport->lock, flags); + nvme_fc_rport_put(rport); + return ERR_PTR(-ESTALE); + } + + rport->remoteport.port_state = FC_OBJSTATE_ONLINE; + rport->dev_loss_end = 0; + + /* + * kick off a reconnect attempt on all associations to the + * remote port. A successful reconnects will resume i/o. + */ + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) + nvme_fc_resume_controller(ctrl); + + spin_unlock_irqrestore(&rport->lock, flags); + + return rport; + } + + rport = NULL; + +out_done: + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return rport; +} + +static inline void +__nvme_fc_set_dev_loss_tmo(struct nvme_fc_rport *rport, + struct nvme_fc_port_info *pinfo) +{ + if (pinfo->dev_loss_tmo) + rport->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; + else + rport->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; +} + /** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME @@ -554,22 +651,45 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, unsigned long flags; int ret, idx; + if (!nvme_fc_lport_get(lport)) { + ret = -ESHUTDOWN; + goto out_reghost_failed; + } + + /* + * look to see if there is already a remoteport that is waiting + * for a reconnect (within dev_loss_tmo) with the same WWN's. + * If so, transition to it and reconnect. 
+ */ + newrec = nvme_fc_attach_to_suspended_rport(lport, pinfo); + + /* found an rport, but something about its state is bad */ + if (IS_ERR(newrec)) { + ret = PTR_ERR(newrec); + goto out_lport_put; + + /* found existing rport, which was resumed */ + } else if (newrec) { + nvme_fc_lport_put(lport); + __nvme_fc_set_dev_loss_tmo(newrec, pinfo); + nvme_fc_signal_discovery_scan(lport, newrec); + *portptr = &newrec->remoteport; + return 0; + } + + /* nothing found - allocate a new remoteport struct */ + newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz), GFP_KERNEL); if (!newrec) { ret = -ENOMEM; - goto out_reghost_failed; - } - - if (!nvme_fc_lport_get(lport)) { - ret = -ESHUTDOWN; - goto out_kfree_rport; + goto out_lport_put; } idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; - goto out_lport_put; + goto out_kfree_rport; } INIT_LIST_HEAD(&newrec->endp_list); @@ -587,11 +707,7 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, newrec->remoteport.port_id = pinfo->port_id; newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; newrec->remoteport.port_num = idx; - /* a registration value of dev_loss_tmo=0 results in the default */ - if (pinfo->dev_loss_tmo) - newrec->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; - else - newrec->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; + __nvme_fc_set_dev_loss_tmo(newrec, pinfo); spin_lock_irqsave(&nvme_fc_lock, flags); list_add_tail(&newrec->endp_list, &lport->endp_list); @@ -602,10 +718,10 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, *portptr = &newrec->remoteport; return 0; -out_lport_put: - nvme_fc_lport_put(lport); out_kfree_rport: kfree(newrec); +out_lport_put: + nvme_fc_lport_put(lport); out_reghost_failed: *portptr = NULL; return ret; @@ -636,6 +752,58 @@ restart: return 0; } +static void +nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) +{ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller connectivity lost. Awaiting " + "Reconnect", ctrl->cnum); + + switch (ctrl->ctrl.state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + /* + * Schedule a controller reset. The reset will terminate the + * association and schedule the reconnect timer. Reconnects + * will be attempted until either the ctlr_loss_tmo + * (max_retries * connect_delay) expires or the remoteport's + * dev_loss_tmo expires. + */ + if (nvme_reset_ctrl(&ctrl->ctrl)) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: Couldn't schedule reset. " + "Deleting controller.\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } + break; + + case NVME_CTRL_RECONNECTING: + /* + * The association has already been terminated and the + * controller is attempting reconnects. No need to do anything + * futher. Reconnects will be attempted until either the + * ctlr_loss_tmo (max_retries * connect_delay) expires or the + * remoteport's dev_loss_tmo expires. + */ + break; + + case NVME_CTRL_RESETTING: + /* + * Controller is already in the process of terminating the + * association. No need to do anything further. The reconnect + * step will kick in naturally after the association is + * terminated. 
+ */ + break; + + case NVME_CTRL_DELETING: + default: + /* no action to take - let it delete */ + break; + } +} + /** * nvme_fc_unregister_remoteport - transport entry point called by an * LLDD to deregister/remove a previously @@ -665,15 +833,31 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) } portptr->port_state = FC_OBJSTATE_DELETED; - /* tear down all associations to the remote port */ - list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) - nvme_delete_ctrl(&ctrl->ctrl); + rport->dev_loss_end = jiffies + (portptr->dev_loss_tmo * HZ); + + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + /* if dev_loss_tmo==0, dev loss is immediate */ + if (!portptr->dev_loss_tmo) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: controller connectivity lost. " + "Deleting controller.\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } else + nvme_fc_ctrl_connectivity_loss(ctrl); + } spin_unlock_irqrestore(&rport->lock, flags); nvme_fc_abort_lsops(rport); + /* + * release the reference, which will allow, if all controllers + * go away, which should only occur after dev_loss_tmo occurs, + * for the rport to be torn down. + */ nvme_fc_rport_put(rport); + return 0; } EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); @@ -700,7 +884,6 @@ nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr, u32 dev_loss_tmo) { struct nvme_fc_rport *rport = remoteport_to_rport(portptr); - struct nvme_fc_ctrl *ctrl; unsigned long flags; spin_lock_irqsave(&rport->lock, flags); @@ -2676,28 +2859,43 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) static void nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) { + struct nvme_fc_rport *rport = ctrl->rport; + struct nvme_fc_remote_port *portptr = &rport->remoteport; + unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ; + bool recon = true; + if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) return; - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", - ctrl->cnum, status); + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", + ctrl->cnum, status); + else if (time_after_eq(jiffies, rport->dev_loss_end)) + recon = false; - if (nvmf_should_reconnect(&ctrl->ctrl)) { - /* Only schedule the reconnect if the remote port is online */ - if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) - return; + if (recon && nvmf_should_reconnect(&ctrl->ctrl)) { + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: Reconnect attempt in %ld " + "seconds\n", + ctrl->cnum, recon_delay / HZ); + else if (time_after(jiffies + recon_delay, rport->dev_loss_end)) + recon_delay = rport->dev_loss_end - jiffies; - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_wq, &ctrl->connect_work, - ctrl->ctrl.opts->reconnect_delay * HZ); + queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay); } else { - dev_warn(ctrl->ctrl.device, + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.nr_reconnects); + else + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: dev_loss_tmo (%d) expired " + "while waiting for remoteport connectivity. 
" + "Removing controller\n", ctrl->cnum, + portptr->dev_loss_tmo); WARN_ON(nvme_delete_ctrl(&ctrl->ctrl)); } } @@ -2721,15 +2919,17 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) return; } - if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) ret = nvme_fc_create_association(ctrl); - if (ret) - nvme_fc_reconnect_or_delete(ctrl, ret); - else - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: controller reset complete\n", - ctrl->cnum); - } + else + ret = -ENOTCONN; + + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller reset complete\n", + ctrl->cnum); } static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { -- cgit v1.2.3 From a96d4bd867129e6124e3cd1006fcbcfea3341179 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 27 Oct 2017 13:12:49 -0700 Subject: nvmet: fix fatal_err_work deadlock Below is a stack trace for an issue that was reported. What's happening is that the nvmet layer had it's controller kato timeout fire, which causes it to schedule its fatal error handler via the fatal_err_work element. The error handler is invoked, which calls the transport delete_ctrl() entry point, and as the transport tears down the controller, nvmet_sq_destroy ends up doing the final put on the ctlr causing it to enter its free routine. The ctlr free routine does a cancel_work_sync() on fatal_err_work element, which then does a flush_work and wait_for_completion. But, as the wait is in the context of the work element being flushed, its in a catch-22 and the thread hangs. [ 326.903131] nvmet: ctrl 1 keep-alive timer (15 seconds) expired! [ 326.909832] nvmet: ctrl 1 fatal error occurred! [ 327.643100] lpfc 0000:04:00.0: 0:6313 NVMET Defer ctx release xri x114 flg x2 [ 494.582064] INFO: task kworker/0:2:243 blocked for more than 120 seconds. [ 494.589638] Not tainted 4.14.0-rc1.James+ #1 [ 494.594986] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 494.603718] kworker/0:2 D 0 243 2 0x80000000 [ 494.609839] Workqueue: events nvmet_fatal_error_handler [nvmet] [ 494.616447] Call Trace: [ 494.619177] __schedule+0x28d/0x890 [ 494.623070] schedule+0x36/0x80 [ 494.626571] schedule_timeout+0x1dd/0x300 [ 494.631044] ? dequeue_task_fair+0x592/0x840 [ 494.635810] ? pick_next_task_fair+0x23b/0x5c0 [ 494.640756] wait_for_completion+0x121/0x180 [ 494.645521] ? wake_up_q+0x80/0x80 [ 494.649315] flush_work+0x11d/0x1a0 [ 494.653206] ? wake_up_worker+0x30/0x30 [ 494.657484] __cancel_work_timer+0x10b/0x190 [ 494.662249] cancel_work_sync+0x10/0x20 [ 494.666525] nvmet_ctrl_put+0xa3/0x100 [nvmet] [ 494.671482] nvmet_sq_:q+0x64/0xd0 [nvmet] [ 494.676540] nvmet_fc_delete_target_queue+0x202/0x220 [nvmet_fc] [ 494.683245] nvmet_fc_delete_target_assoc+0x6d/0xc0 [nvmet_fc] [ 494.689743] nvmet_fc_delete_ctrl+0x137/0x1a0 [nvmet_fc] [ 494.695673] nvmet_fatal_error_handler+0x30/0x40 [nvmet] [ 494.701589] process_one_work+0x149/0x360 [ 494.706064] worker_thread+0x4d/0x3c0 [ 494.710148] kthread+0x109/0x140 [ 494.713751] ? rescuer_thread+0x380/0x380 [ 494.718214] ? kthread_park+0x60/0x60 Correct by having the fc transport convert to a different workq context for the actual controller teardown which may call the cancel_work_sync. 
Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 58e010bdda3ea..5f2570f5dfa90 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -150,6 +150,7 @@ struct nvmet_fc_tgt_assoc { struct list_head a_list; struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; struct kref ref; + struct work_struct del_work; }; @@ -232,6 +233,7 @@ static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport); static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_fcp_iod *fod); +static void nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc); /* *********************** FC-NVME DMA Handling **************************** */ @@ -802,6 +804,16 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport, return NULL; } +static void +nvmet_fc_delete_assoc(struct work_struct *work) +{ + struct nvmet_fc_tgt_assoc *assoc = + container_of(work, struct nvmet_fc_tgt_assoc, del_work); + + nvmet_fc_delete_target_assoc(assoc); + nvmet_fc_tgt_a_put(assoc); +} + static struct nvmet_fc_tgt_assoc * nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) { @@ -826,6 +838,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) assoc->a_id = idx; INIT_LIST_HEAD(&assoc->a_list); kref_init(&assoc->ref); + INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc); while (needrandom) { get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID); @@ -1118,8 +1131,7 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) nvmet_fc_tgtport_put(tgtport); if (found_ctrl) { - nvmet_fc_delete_target_assoc(assoc); - nvmet_fc_tgt_a_put(assoc); + schedule_work(&assoc->del_work); return; } -- cgit v1.2.3 From 3639efef8fb128f99ae38bf603189639efc3850d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 20 Oct 2017 16:18:03 -0600 Subject: nvme: Remove unused headers Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7735571ffc9ab..32c413ec818c9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -13,7 +13,6 @@ */ #include -#include #include #include #include @@ -26,12 +25,9 @@ #include #include #include -#include #include -#include #include #include -#include #include #include "nvme.h" -- cgit v1.2.3 From c091fbe9a26ee116d99e2c0ed010afb957a10365 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Nov 2017 12:19:32 +0100 Subject: block: fix CDROM dependency on BLK_DEV After the cdrom cleanup, I get randconfig warnings for some configurations: warning: (BLK_DEV_IDECD && BLK_DEV_SR) selects CDROM which has unmet direct dependencies (BLK_DEV) This adds an explicit BLK_DEV dependency for both drivers. The other drivers that select 'CDROM' already have this and don't need a change. 
Fixes: 2a750166a5be ("block: Rework drivers/cdrom/Makefile") Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/ide/Kconfig | 1 + drivers/scsi/Kconfig | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 7b92c591e4c11..cf1fb3fb5d26f 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -117,6 +117,7 @@ config BLK_DEV_DELKIN config BLK_DEV_IDECD tristate "Include IDE/ATAPI CDROM support" + depends on BLK_DEV select IDE_ATAPI select CDROM ---help--- diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index cca0b03c2eadf..7669553180058 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -130,7 +130,7 @@ config CHR_DEV_OSST config BLK_DEV_SR tristate "SCSI CDROM support" - depends on SCSI + depends on SCSI && BLK_DEV select CDROM ---help--- If you want to use a CD or DVD drive attached to your computer -- cgit v1.2.3 From 474f5da2354e9fd5376b968e4c5a0f1807dc19e8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Nov 2017 12:42:00 +0100 Subject: skd: use ktime_get_real_seconds() Like many storage drivers, skd uses an unsigned 32-bit number for interchanging the current time with the firmware. This will overflow in y2106 and is otherwise safe. However, the get_seconds() function is generally considered deprecated since the behavior is different between 32-bit and 64-bit architectures, and using it may indicate a bigger problem. To annotate that we've thought about this, let's add a comment here and migrate to the ktime_get_real_seconds() function that consistently returns a 64-bit number. Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 7cedb4295e9d3..802ab9f7a8c11 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -1967,7 +1967,8 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev) break; case FIT_MTD_CMD_LOG_HOST_ID: - skdev->connect_time_stamp = get_seconds(); + /* hardware interface overflows in y2106 */ + skdev->connect_time_stamp = (u32)ktime_get_real_seconds(); data = skdev->connect_time_stamp & 0xFFFF; mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data); SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); -- cgit v1.2.3 From a806c6c81e6c0d07c8a8b2643bad4a37a196d681 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Thu, 2 Nov 2017 18:07:44 +0900 Subject: nvme: comment typo fixed in clearing AER a tiny typo fixed in a comment. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 07b9f4e1c283d..64744355aa88e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2637,7 +2637,7 @@ static void nvme_fw_act_work(struct work_struct *work) return; nvme_start_queues(ctrl); - /* read FW slot informationi to clear the AER*/ + /* read FW slot information to clear the AER */ nvme_get_fw_slot_info(ctrl); } -- cgit v1.2.3 From 8977f563845bdd61fc61c4dfb399270b7d5667c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:48 +0300 Subject: block: move REQ_NOWAIT This flag should be before the operation-specific REQ_NOUNMAP bit. 
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3385c89f402e2..c4c6c8bced2e1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -224,11 +224,11 @@ enum req_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ + __REQ_NOWAIT, /* Don't wait if request will block */ /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ - __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_NR_BITS, /* stops here */ }; @@ -245,9 +245,9 @@ enum req_flag_bits { #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) +#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) -#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From 96222bcc732d0504363dc772637c50e53b4bd41e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:49 +0300 Subject: block: add REQ_DRV bit Set aside a bit in the request/bio flags for driver use. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c4c6c8bced2e1..1b04085255fb3 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -229,6 +229,9 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ + /* for driver use */ + __REQ_DRV, + __REQ_NR_BITS, /* stops here */ }; @@ -249,6 +252,8 @@ enum req_flag_bits { #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) +#define REQ_DRV (1ULL << __REQ_DRV) + #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From f421e1d9ade4e1b88183e54425cf50e390d16a7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:50 +0300 Subject: block: provide a direct_make_request helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper allows reinserting a bio into a new queue without much overhead, but requires all queue limits to be the same for the upper and lower queues, and it does not provide any recursion preventions. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 35 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index bb4fce694a60c..96aaa3a419a37 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2243,6 +2243,40 @@ out: } EXPORT_SYMBOL(generic_make_request); +/** + * direct_make_request - hand a buffer directly to its device driver for I/O + * @bio: The bio describing the location in memory and on the device. + * + * This function behaves like generic_make_request(), but does not protect + * against recursion. 
Must only be used if the called driver is known + * to not call generic_make_request (or direct_make_request) again from + * its make_request function. (Calling direct_make_request again from + * a workqueue is perfectly fine as that doesn't recurse). + */ +blk_qc_t direct_make_request(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + bool nowait = bio->bi_opf & REQ_NOWAIT; + blk_qc_t ret; + + if (!generic_make_request_checks(bio)) + return BLK_QC_T_NONE; + + if (unlikely(blk_queue_enter(q, nowait))) { + if (nowait && !blk_queue_dying(q)) + bio->bi_status = BLK_STS_AGAIN; + else + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + + ret = q->make_request_fn(q, bio); + blk_queue_exit(q); + return ret; +} +EXPORT_SYMBOL_GPL(direct_make_request); + /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72637028f3c9f..eda3d25c0f689 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -920,6 +920,7 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); +extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); -- cgit v1.2.3 From ef71de8b15d891b27b8c983a9a8972b11cb4576a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:51 +0300 Subject: block: add a blk_steal_bios helper This helper allows stealing the uncompleted bios from a request so that they can be reissued on another path. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 21 +++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 23 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 96aaa3a419a37..68cfe6780a9b7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2769,6 +2769,27 @@ struct request *blk_fetch_request(struct request_queue *q) } EXPORT_SYMBOL(blk_fetch_request); +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. + */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); + /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eda3d25c0f689..fddda6a1f9b58 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1094,6 +1094,8 @@ extern struct request *blk_peek_request(struct request_queue *q); extern void blk_start_request(struct request *rq); extern struct request *blk_fetch_request(struct request_queue *q); +void blk_steal_bios(struct bio_list *list, struct request *rq); + /* * Request completion related functions.
* -- cgit v1.2.3 From 517bf3c306bad4fe0da631f90ae2ec40924dee2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:52 +0300 Subject: block: don't look at the struct device dev_t in disk_devt The hidden gendisks introduced in the next patch need to keep the dev field in their struct device empty so that udev won't try to create block device nodes for them. To support that rewrite disk_devt to look at the major and first_minor fields in the gendisk itself instead of looking into the struct device. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 4 ---- include/linux/genhd.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index dd305c65ffb05..1174d24e405ee 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -649,10 +649,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) return; } disk_to_dev(disk)->devt = devt; - - /* ->major and ->first_minor aren't supposed to be - * dereferenced from here on, but set them just in case. - */ disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ea652bfcd6756..5c0ed5db33c2d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -234,7 +234,7 @@ static inline bool disk_part_scan_enabled(struct gendisk *disk) static inline dev_t disk_devt(struct gendisk *disk) { - return disk_to_dev(disk)->devt; + return MKDEV(disk->major, disk->first_minor); } static inline dev_t part_devt(struct hd_struct *part) -- cgit v1.2.3 From 8ddcd653257c18a669fcb75ee42c37054908e0d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:53 +0300 Subject: block: introduce GENHD_FL_HIDDEN With this flag a driver can create a gendisk that can be used for I/O submission inside the kernel, but which is not registered as user facing block device. This will be useful for the NVMe multipath implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 68 +++++++++++++++++++++++++++++++++++++-------------- include/linux/genhd.h | 1 + 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 1174d24e405ee..835e907e6e015 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,6 +585,11 @@ static void register_disk(struct device *parent, struct gendisk *disk) */ pm_runtime_set_memalloc_noio(ddev, true); + if (disk->flags & GENHD_FL_HIDDEN) { + dev_set_uevent_suppress(ddev, 0); + return; + } + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); @@ -616,6 +621,11 @@ exit: while ((part = disk_part_iter_next(&piter))) kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); disk_part_iter_exit(&piter); + + err = sysfs_create_link(&ddev->kobj, + &disk->queue->backing_dev_info->dev->kobj, + "bdi"); + WARN_ON(err); } /** @@ -630,7 +640,6 @@ exit: */ void device_add_disk(struct device *parent, struct gendisk *disk) { - struct backing_dev_info *bdi; dev_t devt; int retval; @@ -639,7 +648,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) * parameters make sense. 
*/ WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); + WARN_ON(!disk->minors && + !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); disk->flags |= GENHD_FL_UP; @@ -648,18 +658,26 @@ void device_add_disk(struct device *parent, struct gendisk *disk) WARN_ON(1); return; } - disk_to_dev(disk)->devt = devt; disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); disk_alloc_events(disk); - /* Register BDI before referencing it from bdev */ - bdi = disk->queue->backing_dev_info; - bdi_register_owner(bdi, disk_to_dev(disk)); - - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Don't let hidden disks show up in /proc/partitions, + * and don't bother scanning for partitions either. + */ + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_NO_PART_SCAN; + } else { + /* Register BDI before referencing it from bdev */ + disk_to_dev(disk)->devt = devt; + bdi_register_owner(disk->queue->backing_dev_info, + disk_to_dev(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, + exact_match, exact_lock, disk); + } register_disk(parent, disk); blk_register_queue(disk); @@ -669,10 +687,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) */ WARN_ON_ONCE(!blk_get_queue(disk->queue)); - retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, - "bdi"); - WARN_ON(retval); - disk_add_events(disk); blk_integrity_add(disk); } @@ -701,7 +715,8 @@ void del_gendisk(struct gendisk *disk) set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; - sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + if (!(disk->flags & GENHD_FL_HIDDEN)) + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); if (disk->queue) { /* * Unregister bdi before releasing device numbers (as they can @@ -712,13 +727,15 @@ void del_gendisk(struct gendisk *disk) } else { WARN_ON(1); } - blk_unregister_region(disk_devt(disk), disk->minors); + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + blk_unregister_region(disk_devt(disk), disk->minors); + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + } part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; - - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -781,6 +798,10 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } + if (unlikely(disk->flags & GENHD_FL_HIDDEN)) { + put_disk(disk); + disk = NULL; + } return disk; } EXPORT_SYMBOL(get_gendisk); @@ -1024,6 +1045,15 @@ static ssize_t disk_removable_show(struct device *dev, (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } +static ssize_t disk_hidden_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_HIDDEN ? 
1 : 0)); +} + static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1061,6 +1091,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); +static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); @@ -1085,6 +1116,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, + &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5c0ed5db33c2d..93aae3476f58b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -140,6 +140,7 @@ struct hd_struct { #define GENHD_FL_NATIVE_CAPACITY 128 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 #define GENHD_FL_NO_PART_SCAN 512 +#define GENHD_FL_HIDDEN 1024 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ -- cgit v1.2.3 From ea435e1b9392a33deceaea2a16ebaa3397bead93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:54 +0300 Subject: block: add a poll_fn callback to struct request_queue That we we can also poll non blk-mq queues. Mostly needed for the NVMe multipath code, but could also be useful elsewhere. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 11 +++++++++++ block/blk-mq.c | 14 +++++--------- drivers/nvme/target/io-cmd.c | 2 +- fs/block_dev.c | 4 ++-- fs/direct-io.c | 2 +- fs/iomap.c | 2 +- include/linux/blkdev.h | 4 +++- mm/page_io.c | 2 +- 8 files changed, 25 insertions(+), 16 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 68cfe6780a9b7..395bfb10d6581 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2321,6 +2321,17 @@ blk_qc_t submit_bio(struct bio *bio) } EXPORT_SYMBOL(submit_bio); +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + if (!q->poll_fn || !blk_qc_t_valid(cookie)) + return false; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + return q->poll_fn(q, cookie); +} +EXPORT_SYMBOL_GPL(blk_poll); + /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for new the queue limits diff --git a/block/blk-mq.c b/block/blk-mq.c index e4d2490f4e7ee..95ea5889b8252 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -37,6 +37,7 @@ #include "blk-wbt.h" #include "blk-mq-sched.h" +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -2499,6 +2500,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); blk_queue_make_request(q, blk_mq_make_request); + if (q->mq_ops->poll) + q->poll_fn = blk_mq_poll; /* * Do this after blk_queue_make_request() overrides it... 
@@ -2961,20 +2964,14 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) return false; } -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_mq_hw_ctx *hctx; - struct blk_plug *plug; struct request *rq; - if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; - plug = current->plug; - if (plug) - blk_flush_plug_list(plug, false); - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; if (!blk_qc_t_is_internal(cookie)) rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); @@ -2992,7 +2989,6 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) return __blk_mq_poll(hctx, rq); } -EXPORT_SYMBOL_GPL(blk_mq_poll); static int __init blk_mq_init(void) { diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 0d4c23dc45324..db632818777dd 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -94,7 +94,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) cookie = submit_bio(bio); - blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie); + blk_poll(bdev_get_queue(req->ns->bdev), cookie); } static void nvmet_execute_flush(struct nvmet_req *req) diff --git a/fs/block_dev.c b/fs/block_dev.c index 07ddccd178017..4afa4d5ff969e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -237,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -402,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/direct-io.c b/fs/direct-io.c index 62cf812ed0e58..d2bc339cb1e98 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -486,7 +486,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/iomap.c b/fs/iomap.c index 8194d30bdca08..4241bac905b19 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1049,7 +1049,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || - !blk_mq_poll(dio->submit.last_queue, + !blk_poll(dio->submit.last_queue, dio->submit.cookie)) io_schedule(); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fddda6a1f9b58..225617dd0a3f4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -266,6 +266,7 @@ struct blk_queue_ctx; typedef void (request_fn_proc) (struct request_queue *q); typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); +typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); @@ -408,6 +409,7 @@ struct request_queue { request_fn_proc *request_fn; 
make_request_fn *make_request_fn; + poll_q_fn *poll_fn; prep_rq_fn *prep_rq_fn; unprep_rq_fn *unprep_rq_fn; softirq_done_fn *softirq_done_fn; @@ -975,7 +977,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { diff --git a/mm/page_io.c b/mm/page_io.c index 21502d341a67c..ff04de630c465 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -407,7 +407,7 @@ int swap_readpage(struct page *page, bool do_poll) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_mq_poll(disk->queue, qc)) + if (!blk_poll(disk->queue, qc)) break; } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From a116895fc7b61de4f26548c9b635a06515ec7563 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Nov 2017 11:00:03 -0600 Subject: cdrom: hide CONFIG_CDROM menu selection We don't need to expose this. The point is that drivers select the uniform CDROM layer, if they need it, the user should not have to make a conscious decision on whether to include this separately or not. Fixes: 2a750166a5be ("block: Rework drivers/cdrom/Makefile") Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index b998b3c9ea860..95e678716b5e9 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -68,14 +68,7 @@ config AMIGA_Z2RAM module will be called z2ram. config CDROM - tristate "CD-ROM driver" - help - A CD-ROM is a pre-pressed optical compact disc which contains - data. The name is an acronym which stands for "Compact Disc - Read-Only Memory". This driver implements functionality that is - shared by all CD-ROM drivers, e.g. the IDE CD-ROM driver and the - SCSI CD-ROM driver. For a list of the ioctls implemented by this - driver, see also Documentation/ioctl/cdrom.txt. + tristate config GDROM tristate "SEGA Dreamcast GD-ROM drive" -- cgit v1.2.3 From c2e82a234873d905260a3bd0aca9577e85619bda Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Fri, 22 Sep 2017 23:36:28 +0800 Subject: blk-mq: fix nr_requests wrong value when modify it from sysfs if blk-mq use "none" io scheduler, nr_request get a wrong value when input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get the smaller one min(nr, set->queue_depth), and then q->nr_request get a wrong value. Reproduce: echo none > /sys/block/nvme0n1/queue/scheduler echo 1000000 > /sys/block/nvme0n1/queue/nr_requests cat /sys/block/nvme0n1/queue/nr_requests 1000000 Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 95ea5889b8252..9eea67ce82d90 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2743,8 +2743,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * queue depth. This is similar to what the old code would do. 
*/ if (!hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, - min(nr, set->queue_depth), + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, -- cgit v1.2.3 From 21e768b442bb587123ea924620e74d6e5655d717 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 16 Oct 2017 16:32:26 -0700 Subject: blk-mq: Make blk_mq_get_request() error path less confusing blk_mq_get_tag() can modify data->ctx. This means that in the error path of blk_mq_get_request() data->ctx should be passed to blk_mq_put_ctx() instead of local_ctx. Note: since blk_mq_put_ctx() ignores its argument, this patch does not change any functionality. References: commit 1ad43c0078b7 ("blk-mq: don't leak preempt counter/q_usage_counter when allocating rq failed") Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Bart Van Assche Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9eea67ce82d90..13cdccef543ce 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -337,12 +337,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - struct blk_mq_ctx *local_ctx = NULL; + bool put_ctx_on_error = false; blk_queue_enter_live(q); data->q = q; - if (likely(!data->ctx)) - data->ctx = local_ctx = blk_mq_get_ctx(q); + if (likely(!data->ctx)) { + data->ctx = blk_mq_get_ctx(q); + put_ctx_on_error = true; + } if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->ctx->cpu); if (op & REQ_NOWAIT) @@ -361,8 +363,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, tag = blk_mq_get_tag(data); if (tag == BLK_MQ_TAG_FAIL) { - if (local_ctx) { - blk_mq_put_ctx(local_ctx); + if (put_ctx_on_error) { + blk_mq_put_ctx(data->ctx); data->ctx = NULL; } blk_queue_exit(q); -- cgit v1.2.3 From e4f36b249b4d4e7599f1cf0c8fb50f196e52677e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Oct 2017 16:45:23 +0200 Subject: block: fix peeking requests during PM We need to look for an active PM request until the next softbarrier instead of looking for the first non-PM request. Otherwise any cause of request reordering might starve the PM request(s). 
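Illustrative sketch (not part of the patch): a minimal user-space model of the new lookup order. Type and function names are invented for the example; the real code walks q->queue_head and checks RQF_PM / RQF_SOFTBARRIER on struct request. The walk returns the first request the PM state allows and stops at a softbarrier, so a PM request queued behind normal requests is still found while resuming.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    enum rpm_status { RPM_ACTIVE, RPM_RESUMING, RPM_SUSPENDING, RPM_SUSPENDED };

    struct rq {
        bool pm;           /* models RQF_PM */
        bool softbarrier;  /* models RQF_SOFTBARRIER */
        struct rq *next;
    };

    static bool pm_allow(enum rpm_status st, const struct rq *rq)
    {
        switch (st) {
        case RPM_RESUMING:
        case RPM_SUSPENDING:
            return rq->pm;      /* only PM requests may run */
        case RPM_SUSPENDED:
            return false;       /* nothing runs while suspended */
        default:
            return true;        /* RPM_ACTIVE: everything runs */
        }
    }

    /* Walk up to the first softbarrier instead of only peeking at the head. */
    static struct rq *peek(enum rpm_status st, struct rq *head)
    {
        for (struct rq *rq = head; rq; rq = rq->next) {
            if (pm_allow(st, rq))
                return rq;
            if (rq->softbarrier)
                break;
        }
        return NULL;
    }

    int main(void)
    {
        struct rq pm_rq = { .pm = true };
        struct rq normal = { .next = &pm_rq };

        /* While resuming, the PM request behind a normal one is still found. */
        printf("%s\n", peek(RPM_RESUMING, &normal) == &pm_rq ? "PM rq found" : "starved");
        return 0;
    }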
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 395bfb10d6581..223f32826e62d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2511,20 +2511,22 @@ void blk_account_io_done(struct request *req) * Don't process normal requests when queue is suspended * or in the process of suspending/resuming */ -static struct request *blk_pm_peek_request(struct request_queue *q, - struct request *rq) +static bool blk_pm_allow_request(struct request *rq) { - if (q->dev && (q->rpm_status == RPM_SUSPENDED || - (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) - return NULL; - else - return rq; + switch (rq->q->rpm_status) { + case RPM_RESUMING: + case RPM_SUSPENDING: + return rq->rq_flags & RQF_PM; + case RPM_SUSPENDED: + return false; + } + + return true; } #else -static inline struct request *blk_pm_peek_request(struct request_queue *q, - struct request *rq) +static bool blk_pm_allow_request(struct request *rq) { - return rq; + return true; } #endif @@ -2572,9 +2574,12 @@ static struct request *elv_next_request(struct request_queue *q) WARN_ON_ONCE(q->mq_ops); while (1) { - if (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - return rq; + list_for_each_entry(rq, &q->queue_head, queuelist) { + if (blk_pm_allow_request(rq)) + return rq; + + if (rq->rq_flags & RQF_SOFTBARRIER) + break; } /* @@ -2625,10 +2630,6 @@ struct request *blk_peek_request(struct request_queue *q) WARN_ON_ONCE(q->mq_ops); while ((rq = elv_next_request(q)) != NULL) { - rq = blk_pm_peek_request(q, rq); - if (!rq) - break; - if (!(rq->rq_flags & RQF_STARTED)) { /* * This is the first time the device driver -- cgit v1.2.3 From 826a70a08b1210bbfdbda812ab43eb986e25b5c2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 4 Nov 2017 09:55:34 +0800 Subject: SCSI: don't get target/host busy_count in scsi_mq_get_budget() It is very expensive to atomic_inc/atomic_dec the host wide counter of host->busy_count, and it should have been avoided via blk-mq's mechanism of getting driver tag, which uses the more efficient way of sbitmap queue. Also we don't check atomic_read(&sdev->device_busy) in scsi_mq_get_budget() and don't run queue if the counter becomes zero, so IO hang may be caused if all requests are completed just before the current SCSI device is added to shost->starved_list. 
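Illustrative sketch (not part of the patch): the reshuffled path keeps only the per-device count in the budget callback and takes the target/host counts from ->queue_rq(), releasing whatever was actually taken, in reverse order, on failure. A toy model with plain integers standing in for the SCSI busy counters:

    #include <stdbool.h>
    #include <stdio.h>

    static int device_busy, target_busy, host_busy;

    static bool take(int *busy, int limit)
    {
        if (*busy >= limit)
            return false;
        (*busy)++;
        return true;
    }

    /* get_budget(): device-level check only, no host-wide atomics. */
    static bool get_budget(int dev_limit)
    {
        return take(&device_busy, dev_limit);
    }

    /* queue_rq(): target and host checks, unwound in reverse on failure. */
    static bool queue_rq(int tgt_limit, int host_limit)
    {
        if (!take(&target_busy, tgt_limit))
            goto out_put_budget;
        if (!take(&host_busy, host_limit))
            goto out_dec_target;
        return true;

    out_dec_target:
        target_busy--;
    out_put_budget:
        device_busy--;      /* put_budget() */
        return false;
    }

    int main(void)
    {
        if (get_budget(1) && !queue_rq(1, 0))   /* host full: dispatch fails */
            printf("unwound: %d %d %d\n", device_busy, target_busy, host_busy);
        return 0;
    }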
Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Reported-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 6f10afaca25b2..22a7e4c47207a 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1950,11 +1950,7 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - struct Scsi_Host *shost = sdev->host; - atomic_dec(&shost->host_busy); - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); atomic_dec(&sdev->device_busy); put_device(&sdev->sdev_gendev); } @@ -1963,7 +1959,6 @@ static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - struct Scsi_Host *shost = sdev->host; blk_status_t ret; ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); @@ -1974,18 +1969,9 @@ static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) goto out; if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; - if (!scsi_target_queue_ready(shost, sdev)) - goto out_dec_device_busy; - if (!scsi_host_queue_ready(q, shost, sdev)) - goto out_dec_target_busy; return BLK_STS_OK; -out_dec_target_busy: - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); -out_dec_device_busy: - atomic_dec(&sdev->device_busy); out_put_device: put_device(&sdev->sdev_gendev); out: @@ -1998,6 +1984,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct request_queue *q = req->q; struct scsi_device *sdev = q->queuedata; + struct Scsi_Host *shost = sdev->host; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); blk_status_t ret; int reason; @@ -2007,10 +1994,15 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_put_budget; ret = BLK_STS_RESOURCE; + if (!scsi_target_queue_ready(shost, sdev)) + goto out_put_budget; + if (!scsi_host_queue_ready(q, shost, sdev)) + goto out_dec_target_busy; + if (!(req->rq_flags & RQF_DONTPREP)) { ret = prep_to_mq(scsi_mq_prep_fn(req)); if (ret != BLK_STS_OK) - goto out_put_budget; + goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; } else { blk_mq_start_request(req); @@ -2028,11 +2020,16 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (reason) { scsi_set_blocked(cmd, reason); ret = BLK_STS_RESOURCE; - goto out_put_budget; + goto out_dec_host_busy; } return BLK_STS_OK; +out_dec_host_busy: + atomic_dec(&shost->host_busy); +out_dec_target_busy: + if (scsi_target(sdev)->can_queue > 0) + atomic_dec(&scsi_target(sdev)->target_busy); out_put_budget: scsi_mq_put_budget(hctx); switch (ret) { -- cgit v1.2.3 From 88022d7201e96b43f1754b0358fc6bcd8dbdcde1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 5 Nov 2017 02:21:12 +0800 Subject: blk-mq: don't handle failure in .get_budget It is enough to just check if we can get the budget via .get_budget(). And we don't need to deal with device state change in .get_budget(). For SCSI, one issue to be fixed is that we have to call scsi_mq_uninit_cmd() to free allocated ressources if SCSI device fails to handle the request. And it isn't enough to simply call blk_mq_end_request() to do that if this request is marked as RQF_DONTPREP. 
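Illustrative sketch (not part of the patch): after this change the budget callback has a plain yes/no contract, so the dispatcher no longer has to map a status code onto "stop dispatching" versus "fail the request". Hypothetical names, modelled in user space:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for struct blk_mq_ops: only the budget hooks. */
    struct ops {
        bool (*get_budget)(void *hctx);
        void (*put_budget)(void *hctx);
    };

    /* Mirrors the new blk_mq_get_dispatch_budget(): true when no callback
     * is provided, otherwise whatever the driver answers. */
    static bool get_dispatch_budget(const struct ops *ops, void *hctx)
    {
        return ops->get_budget ? ops->get_budget(hctx) : true;
    }

    static bool busy(void *hctx) { (void)hctx; return false; }

    int main(void)
    {
        struct ops none = { 0 };
        struct ops scsi_like = { .get_budget = busy };

        printf("no callback: %d\n", get_dispatch_budget(&none, NULL));
        printf("device busy: %d\n", get_dispatch_budget(&scsi_like, NULL));
        return 0;
    }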
Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 14 ++------------ block/blk-mq.c | 17 ++++------------- block/blk-mq.h | 5 ++--- drivers/scsi/scsi_lib.c | 11 +++-------- include/linux/blk-mq.h | 2 +- 5 files changed, 12 insertions(+), 37 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 7775f6b12fa94..13a27d4d1671d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -94,23 +94,18 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - blk_status_t ret; if (e->type->ops.mq.has_work && !e->type->ops.mq.has_work(hctx)) break; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; - } else if (ret != BLK_STS_OK) { - blk_mq_end_request(rq, ret); - continue; } /* @@ -146,22 +141,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - blk_status_t ret; if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; - } else if (ret != BLK_STS_OK) { - blk_mq_end_request(rq, ret); - continue; } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 13cdccef543ce..c9fa4b294664a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1137,13 +1137,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } } - if (!got_budget) { - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) - break; - if (ret != BLK_STS_OK) - goto fail_rq; - } + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + break; list_del_init(&rq->queuelist); @@ -1170,7 +1165,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, break; } - fail_rq: if (unlikely(ret != BLK_STS_OK)) { errors++; blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1642,12 +1636,10 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) { + if (!blk_mq_get_dispatch_budget(hctx)) { blk_mq_put_driver_tag(rq); goto insert; - } else if (ret != BLK_STS_OK) - goto fail_rq; + } new_cookie = request_to_qc_t(hctx, rq); @@ -1665,7 +1657,6 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, __blk_mq_requeue_request(rq); goto insert; default: - fail_rq: *cookie = BLK_QC_T_NONE; blk_mq_end_request(rq, ret); return; diff --git a/block/blk-mq.h b/block/blk-mq.h index 522b420dedc07..f97aceff76e91 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -147,14 +147,13 @@ static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) q->mq_ops->put_budget(hctx); } -static inline blk_status_t blk_mq_get_dispatch_budget( - struct blk_mq_hw_ctx *hctx) +static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; if (q->mq_ops->get_budget) return q->mq_ops->get_budget(hctx); - return BLK_STS_OK; + return true; } #endif diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 22a7e4c47207a..286ea983c9e35 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1955,27 +1955,22 @@ static void 
scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) put_device(&sdev->sdev_gendev); } -static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - blk_status_t ret; - - ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); - if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK) - return ret; if (!get_device(&sdev->sdev_gendev)) goto out; if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; - return BLK_STS_OK; + return true; out_put_device: put_device(&sdev->sdev_gendev); out: - return BLK_STS_RESOURCE; + return false; } static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f2e3079eecdd8..674641527da79 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -92,7 +92,7 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); -typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); -- cgit v1.2.3 From e84010732225c4c7c3464ee1169d395751c3adfa Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Tue, 17 Oct 2017 23:56:21 +0800 Subject: blkcg: add sanity check for blkcg policy operations blkcg policy should keep cpd/pd's alloc_fn and free_fn in pairs, otherwise policy would register fail. Reviewed-by: Johannes Thumshirn Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e7ec676043b1c..4117524ca45bc 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1419,6 +1419,11 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (i >= BLKCG_MAX_POLS) goto err_unlock; + /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + goto err_unlock; + /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; -- cgit v1.2.3 From 6d6f167ce74158903e7fc20dfbecf89c71aa1c00 Mon Sep 17 00:00:00 2001 From: Jianchao Wang Date: Thu, 2 Nov 2017 23:24:32 +0800 Subject: blk-mq: put the driver tag of nxt rq before first one is requeued When freeing the driver tag of the next rq with an I/O scheduler configured, we get the first entry of the list. However, this can race with requeue of a request, and we end up getting the wrong request from the head of the list. Free the driver tag of next rq before the failed one is requeued in the failure branch of queue_rq callback. 
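Illustrative sketch (not part of the patch): a simplified, serialized toy model of why the ordering matters. Requeueing puts the failed request back at the head of the dispatch list, so the "free the next request's borrowed tag" step has to look at the head before that requeue happens; names below are invented.

    #include <stdio.h>
    #include <string.h>

    #define MAX 4
    static const char *list[MAX];
    static int count;

    static void add_head(const char *rq)
    {
        memmove(&list[1], &list[0], count * sizeof(list[0]));
        list[0] = rq;
        count++;
    }

    static const char *first_entry(void)
    {
        return count ? list[0] : "(empty)";
    }

    int main(void)
    {
        add_head("nxt");   /* next request, already holds a driver tag */

        /* Old order: requeue the failed request first ... */
        add_head("cur");
        /* ... then free "the first entry's" tag: that is now cur, not nxt. */
        printf("old order frees tag of %s (wrong)\n", first_entry());

        /* New order: note nxt while it is still the head, free its tag,
         * and only then requeue cur. */
        printf("new order frees tag of nxt (correct)\n");
        return 0;
    }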
Signed-off-by: Jianchao Wang Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c9fa4b294664a..6eacc1dea8b74 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1094,7 +1094,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool got_budget) { struct blk_mq_hw_ctx *hctx; - struct request *rq; + struct request *rq, *nxt; int errors, queued; if (list_empty(list)) @@ -1151,14 +1151,20 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (list_empty(list)) bd.last = true; else { - struct request *nxt; - nxt = list_first_entry(list, struct request, queuelist); bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); } ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_STS_RESOURCE) { + /* + * If an I/O scheduler has been configured and we got a + * driver tag for the next request already, free it again. + */ + if (!list_empty(list)) { + nxt = list_first_entry(list, struct request, queuelist); + blk_mq_put_driver_tag(nxt); + } blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); @@ -1181,13 +1187,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * that is where we will continue on next queue run. */ if (!list_empty(list)) { - /* - * If an I/O scheduler has been configured and we got a driver - * tag for the next request already, free it again. - */ - rq = list_first_entry(list, struct request, queuelist); - blk_mq_put_driver_tag(rq); - spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); -- cgit v1.2.3 From 9c71c83c857e7a84a5be5a56ea88da7098f51db8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:33 +0800 Subject: blk-flush: don't run queue for requests bypassing flush blk_insert_flush() should only insert request since run queue always follows it. In case of bypassing flush, we don't need to run queue because every blk_insert_flush() follows one run queue. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 4938bec8cfef9..81bd1a8430431 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -463,7 +463,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) - blk_mq_sched_insert_request(rq, false, true, false, false); + blk_mq_sched_insert_request(rq, false, false, false, false); else list_add_tail(&rq->queuelist, &q->queue_head); return; -- cgit v1.2.3 From b0850297c749ea79a5717d597931366b3d7f4b09 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:34 +0800 Subject: block: pass 'run_queue' to blk_mq_request_bypass_insert Block flush need this function without running the queue, so add a parameter controlling whether we run it or not. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 5 +++-- block/blk-mq.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 223f32826e62d..b8d1aa2d1008f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2397,7 +2397,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. 
*/ - blk_mq_request_bypass_insert(rq); + blk_mq_request_bypass_insert(rq, true); return BLK_STS_OK; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 6eacc1dea8b74..021562bd5d2c3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1492,7 +1492,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ -void blk_mq_request_bypass_insert(struct request *rq) +void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) { struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); @@ -1501,7 +1501,8 @@ void blk_mq_request_bypass_insert(struct request *rq) list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); - blk_mq_run_hw_queue(hctx, false); + if (run_queue) + blk_mq_run_hw_queue(hctx, false); } void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, diff --git a/block/blk-mq.h b/block/blk-mq.h index f97aceff76e91..9fffec0ad8e9d 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -56,7 +56,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, */ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head); -void blk_mq_request_bypass_insert(struct request *rq); +void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); -- cgit v1.2.3 From 598906f814280762157629ba8833bf5cb11def74 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:35 +0800 Subject: blk-flush: use blk_mq_request_bypass_insert() In the following patch, we will use RQF_FLUSH_SEQ to decide: 1) if the flag isn't set, the flush rq need to be inserted via blk_insert_flush() 2) otherwise, the flush rq need to be dispatched directly since it is in flush machinery now. So we use blk_mq_request_bypass_insert() for requests of bypassing flush machinery, just like the legacy path did. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 81bd1a8430431..a9773d2075ac6 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -463,7 +463,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) - blk_mq_sched_insert_request(rq, false, false, false, false); + blk_mq_request_bypass_insert(rq, false); else list_add_tail(&rq->queuelist, &q->queue_head); return; -- cgit v1.2.3 From a6a252e6491443c1c18eab7e254daee63d4a7a04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:36 +0800 Subject: blk-mq-sched: decide how to handle flush rq via RQF_FLUSH_SEQ In case of IO scheduler we always pre-allocate one driver tag before calling blk_insert_flush(), and flush request will be marked as RQF_FLUSH_SEQ once it is in flush machinery. So if RQF_FLUSH_SEQ isn't set, we call blk_insert_flush() to handle the request, otherwise the flush request is dispatched to ->dispatch list directly. This is a preparation patch for not preallocating a driver tag for flush requests, and for not treating flush requests as a special case. This is similar to what the legacy path does. 
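Illustrative sketch (not part of the patch): the routing decision in one place. The flag values below are placeholders, not the kernel's; only the branch structure mirrors the new blk_mq_sched_insert_request()/bypass logic.

    #include <stdbool.h>
    #include <stdio.h>

    #define RQF_FLUSH_SEQ  0x1   /* placeholder values */
    #define REQ_PREFLUSH   0x2
    #define REQ_FUA        0x4

    static const char *route(unsigned rq_flags, unsigned cmd_flags)
    {
        bool op_is_flush = cmd_flags & (REQ_PREFLUSH | REQ_FUA);

        /* Not yet in the flush machinery: hand it to blk_insert_flush(). */
        if (!(rq_flags & RQF_FLUSH_SEQ) && op_is_flush)
            return "blk_insert_flush()";
        /* Already sequenced by the flush machinery: bypass the scheduler. */
        if (rq_flags & RQF_FLUSH_SEQ)
            return "hctx->dispatch list";
        return "I/O scheduler / sw queue";
    }

    int main(void)
    {
        printf("new PREFLUSH rq : %s\n", route(0, REQ_PREFLUSH));
        printf("flush-seq rq    : %s\n", route(RQF_FLUSH_SEQ, 0));
        printf("plain write     : %s\n", route(0, 0));
        return 0;
    }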
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 13a27d4d1671d..e7094f44afafc 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -345,21 +345,23 @@ void blk_mq_sched_request_inserted(struct request *rq) EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, + bool has_sched, struct request *rq) { - if (rq->tag == -1) { + /* dispatch flush rq directly */ + if (rq->rq_flags & RQF_FLUSH_SEQ) { + spin_lock(&hctx->lock); + list_add(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + return true; + } + + if (has_sched) { rq->rq_flags |= RQF_SORTED; - return false; + WARN_ON(rq->tag != -1); } - /* - * If we already have a real request tag, send directly to - * the dispatch list. - */ - spin_lock(&hctx->lock); - list_add(&rq->queuelist, &hctx->dispatch); - spin_unlock(&hctx->lock); - return true; + return false; } /* @@ -385,12 +387,13 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { + /* flush rq in flush machinery need to be dispatched directly */ + if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { blk_mq_sched_insert_flush(hctx, rq, can_block); return; } - if (e && blk_mq_sched_bypass_insert(hctx, rq)) + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; if (e && e->type->ops.mq.insert_requests) { @@ -428,7 +431,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, list_for_each_entry_safe(rq, next, list, queuelist) { if (WARN_ON_ONCE(rq->tag != -1)) { list_del_init(&rq->queuelist); - blk_mq_sched_bypass_insert(hctx, rq); + blk_mq_sched_bypass_insert(hctx, true, rq); } } } -- cgit v1.2.3 From 244c65a3ccaa06fd15cc940315606674d3108b2f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 4 Nov 2017 12:39:57 -0600 Subject: blk-mq: move blk_mq_put_driver_tag*() into blk-mq.h We need this helper to put the driver tag for flush rq, since we will not share tag in the flush request sequence in the following patch in case that I/O scheduler is applied. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 32 -------------------------------- block/blk-mq.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 021562bd5d2c3..14f6886fbec80 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -996,38 +996,6 @@ done: return rq->tag != -1; } -static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); - rq->tag = -1; - - if (rq->rq_flags & RQF_MQ_INFLIGHT) { - rq->rq_flags &= ~RQF_MQ_INFLIGHT; - atomic_dec(&hctx->nr_active); - } -} - -static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - if (rq->tag == -1 || rq->internal_tag == -1) - return; - - __blk_mq_put_driver_tag(hctx, rq); -} - -static void blk_mq_put_driver_tag(struct request *rq) -{ - struct blk_mq_hw_ctx *hctx; - - if (rq->tag == -1 || rq->internal_tag == -1) - return; - - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); - __blk_mq_put_driver_tag(hctx, rq); -} - /* * If we fail getting a driver tag because all the driver tags are already * assigned and on the dispatch list, BUT the first entry does not have a diff --git a/block/blk-mq.h b/block/blk-mq.h index 9fffec0ad8e9d..2502f40ccdc07 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -2,6 +2,7 @@ #define INT_BLK_MQ_H #include "blk-stat.h" +#include "blk-mq-tag.h" struct blk_mq_tag_set; @@ -156,4 +157,36 @@ static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) return true; } +static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = -1; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} + +static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + __blk_mq_put_driver_tag(hctx, rq); +} + +static inline void blk_mq_put_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + __blk_mq_put_driver_tag(hctx, rq); +} + #endif -- cgit v1.2.3 From 923218f6166a84688973acdc39094f3bee1e9ad4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:38 +0800 Subject: blk-mq: don't allocate driver tag upfront for flush rq The idea behind it is simple: 1) for none scheduler, driver tag has to be borrowed for flush rq, otherwise we may run out of tag, and that causes an IO hang. And get/put driver tag is actually noop for none, so reordering tags isn't necessary at all. 2) for a real I/O scheduler, we need not allocate a driver tag upfront for flush rq. It works just fine to follow the same approach as normal requests: allocate driver tag for each rq just before calling ->queue_rq(). One driver visible change is that the driver tag isn't shared in the flush request sequence. That won't be a problem, since we always do that in legacy path. Then flush rq need not be treated specially wrt. get/put driver tag. This cleans up the code - for instance, reorder_tags_to_front() can be removed, and we needn't worry about request ordering in dispatch list for avoiding I/O deadlock. Also we have to put the driver tag before requeueing. 
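Illustrative sketch (not part of the patch): how the flush request ends up tagged after this change. With no scheduler it still borrows the driver tag of the request that triggered the flush; with a scheduler it only inherits the internal (scheduler) tag and gets a driver tag just before ->queue_rq(), like any other request. Field names below are simplified.

    #include <stdbool.h>
    #include <stdio.h>

    struct rq { int tag; int internal_tag; };

    static void setup_flush_rq(struct rq *flush_rq, const struct rq *first_rq,
                               bool has_elevator)
    {
        if (!has_elevator) {
            /* none: borrow the driver tag; the two can't be in flight at once */
            flush_rq->tag = first_rq->tag;
            flush_rq->internal_tag = -1;
        } else {
            /* scheduler: carry the scheduler tag only; the driver tag is
             * allocated at dispatch time, as for normal requests */
            flush_rq->tag = -1;
            flush_rq->internal_tag = first_rq->internal_tag;
        }
    }

    int main(void)
    {
        struct rq first = { .tag = 12, .internal_tag = 34 };
        struct rq flush;

        setup_flush_rq(&flush, &first, false);
        printf("none scheduler: tag=%d internal=%d\n", flush.tag, flush.internal_tag);
        setup_flush_rq(&flush, &first, true);
        printf("with scheduler: tag=%d internal=%d\n", flush.tag, flush.internal_tag);
        return 0;
    }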
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 35 ++++++++++++++++++++++++++--------- block/blk-mq-sched.c | 42 +++++------------------------------------- block/blk-mq.c | 41 ++++++----------------------------------- 3 files changed, 37 insertions(+), 81 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index a9773d2075ac6..f17170675917b 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); + flush_rq->tag = -1; + } else { + blk_mq_put_driver_tag_hctx(hctx, flush_rq); + flush_rq->internal_tag = -1; + } } running = &fq->flush_queue[fq->flush_running_idx]; @@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) blk_rq_init(q, flush_rq); /* - * Borrow tag from the first request since they can't - * be in flight at the same time. And acquire the tag's - * ownership for flush req. + * In case of none scheduler, borrow tag from the first request + * since they can't be in flight at the same time. And acquire + * the tag's ownership for flush req. + * + * In case of IO scheduler, flush rq need to borrow scheduler tag + * just for cheating put/get driver tag. */ if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; flush_rq->mq_ctx = first_rq->mq_ctx; - flush_rq->tag = first_rq->tag; - fq->orig_rq = first_rq; - hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + if (!q->elevator) { + fq->orig_rq = first_rq; + flush_rq->tag = first_rq->tag; + hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); + blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + } else { + flush_rq->internal_tag = first_rq->internal_tag; + } } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; @@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) hctx = blk_mq_map_queue(q, ctx->cpu); + if (q->elevator) { + WARN_ON(rq->tag < 0); + blk_mq_put_driver_tag_hctx(hctx, rq); + } + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e7094f44afafc..01a43fed6b8ce 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -356,29 +356,12 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } - if (has_sched) { + if (has_sched) rq->rq_flags |= RQF_SORTED; - WARN_ON(rq->tag != -1); - } return false; } -/* - * Add flush/fua to the queue. If we fail getting a driver tag, then - * punt to the requeue list. Requeue will re-invoke us from a context - * that's safe to block from. 
- */ -static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool can_block) -{ - if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { - blk_insert_flush(rq); - blk_mq_run_hw_queue(hctx, true); - } else - blk_mq_add_to_requeue_list(rq, false, true); -} - void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { @@ -389,10 +372,12 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, /* flush rq in flush machinery need to be dispatched directly */ if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { - blk_mq_sched_insert_flush(hctx, rq, can_block); - return; + blk_insert_flush(rq); + goto run; } + WARN_ON(e && (rq->tag != -1)); + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; @@ -419,23 +404,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); struct elevator_queue *e = hctx->queue->elevator; - if (e) { - struct request *rq, *next; - - /* - * We bypass requests that already have a driver tag assigned, - * which should only be flushes. Flushes are only ever inserted - * as single requests, so we shouldn't ever hit the - * WARN_ON_ONCE() below (but let's handle it just in case). - */ - list_for_each_entry_safe(rq, next, list, queuelist) { - if (WARN_ON_ONCE(rq->tag != -1)) { - list_del_init(&rq->queuelist); - blk_mq_sched_bypass_insert(hctx, true, rq); - } - } - } - if (e && e->type->ops.mq.insert_requests) e->type->ops.mq.insert_requests(hctx, list, false); else diff --git a/block/blk-mq.c b/block/blk-mq.c index 14f6886fbec80..c501cbd0de93f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -653,6 +653,8 @@ static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_put_driver_tag(rq); + trace_block_rq_requeue(q, rq); wbt_requeue(q->rq_wb, &rq->issue_stat); blk_mq_sched_requeue_request(rq); @@ -996,30 +998,6 @@ done: return rq->tag != -1; } -/* - * If we fail getting a driver tag because all the driver tags are already - * assigned and on the dispatch list, BUT the first entry does not have a - * tag, then we could deadlock. For that case, move entries with assigned - * driver tags to the front, leaving the set of tagged requests in the - * same order, and the untagged set in the same order. - */ -static bool reorder_tags_to_front(struct list_head *list) -{ - struct request *rq, *tmp, *first = NULL; - - list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { - if (rq == first) - break; - if (rq->tag != -1) { - list_move(&rq->queuelist, list); - if (!first) - first = rq; - } - } - - return first != NULL; -} - static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { @@ -1080,9 +1058,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { - if (!queued && reorder_tags_to_front(list)) - continue; - /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. 
@@ -1133,7 +1108,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, nxt = list_first_entry(list, struct request, queuelist); blk_mq_put_driver_tag(nxt); } - blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -1698,13 +1672,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (unlikely(is_flush_fua)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - if (q->elevator) { - blk_mq_sched_insert_request(rq, false, true, true, - true); - } else { - blk_insert_flush(rq); - blk_mq_run_hw_queue(data.hctx, true); - } + + /* bypass scheduler for flush rq */ + blk_insert_flush(rq); + blk_mq_run_hw_queue(data.hctx, true); } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL; -- cgit v1.2.3 From ff57dc94faec023abc267cdc45766fccff497557 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 6 Nov 2017 16:11:57 -0500 Subject: nbd: wait uninterruptible for the dead timeout If we have a pending signal or the user kills their application then it'll bring down the whole device, which is less than awesome. Instead wait uninterruptible for the dead timeout so we're sure we gave it our best shot. Fixes: 560bc4b39952 ("nbd: handle dead connections") Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3684e21d543f2..b94a735bfe3c0 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -715,9 +715,9 @@ static int wait_for_reconnect(struct nbd_device *nbd) return 0; if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) return 0; - wait_event_interruptible_timeout(config->conn_wait, - atomic_read(&config->live_connections), - config->dead_conn_timeout); + wait_event_timeout(config->conn_wait, + atomic_read(&config->live_connections), + config->dead_conn_timeout); return atomic_read(&config->live_connections); } -- cgit v1.2.3 From 6a468d5990ecd1c2d07dd85f8633bbdd0ba61c40 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 6 Nov 2017 16:11:58 -0500 Subject: nbd: don't start req until after the dead connection logic We can end up sleeping for a while waiting for the dead timeout, which means we could get the per request timer to fire. We did handle this case, but if the dead timeout happened right after we submitted we'd either tear down the connection or possibly requeue as we're handling an error and race with the endio which can lead to panics and other hilarity. Fixes: 560bc4b39952 ("nbd: handle dead connections") Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b94a735bfe3c0..95cab69d9c8be 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -289,15 +289,6 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, cmd->status = BLK_STS_TIMEOUT; return BLK_EH_HANDLED; } - - /* If we are waiting on our dead timer then we could get timeout - * callbacks for our request. For this we just want to reset the timer - * and let the queue side take care of everything. 
- */ - if (!completion_done(&cmd->send_complete)) { - nbd_config_put(nbd); - return BLK_EH_RESET_TIMER; - } config = nbd->config; if (config->num_connections > 1) { @@ -732,6 +723,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) if (!refcount_inc_not_zero(&nbd->config_refs)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); + blk_mq_start_request(req); return -EINVAL; } config = nbd->config; @@ -740,6 +732,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) dev_err_ratelimited(disk_to_dev(nbd->disk), "Attempted send on invalid socket\n"); nbd_config_put(nbd); + blk_mq_start_request(req); return -EINVAL; } cmd->status = BLK_STS_OK; @@ -763,6 +756,7 @@ again: */ sock_shutdown(nbd); nbd_config_put(nbd); + blk_mq_start_request(req); return -EIO; } goto again; @@ -773,6 +767,7 @@ again: * here so that it gets put _after_ the request that is already on the * dispatch list. */ + blk_mq_start_request(req); if (unlikely(nsock->pending && nsock->pending != req)) { blk_mq_requeue_request(req, true); ret = 0; @@ -785,10 +780,10 @@ again: ret = nbd_send_cmd(nbd, cmd, index); if (ret == -EAGAIN) { dev_err_ratelimited(disk_to_dev(nbd->disk), - "Request send failed trying another connection\n"); + "Request send failed, requeueing\n"); nbd_mark_nsock_dead(nbd, nsock, 1); - mutex_unlock(&nsock->tx_lock); - goto again; + blk_mq_requeue_request(req, true); + ret = 0; } out: mutex_unlock(&nsock->tx_lock); @@ -812,7 +807,6 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, * done sending everything over the wire. */ init_completion(&cmd->send_complete); - blk_mq_start_request(bd->rq); /* We can be called directly from the user space process, which means we * could possibly have signals pending so our sendmsg will fail. In -- cgit v1.2.3 From e8815241173ebba4ccad490d4907b354b795fa5c Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Tue, 7 Nov 2017 22:23:01 +0900 Subject: null_blk: fix default values in documentation null_blk.c has initial value of (1) nr_devices as 1. (2) completion_nsec as 10,000ns, not 10.000ns. documentation should be updated for fixes above. Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- Documentation/block/null_blk.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index b7161c8e70f2e..fcf7f8487d8ee 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -38,7 +38,7 @@ gb=[Size in GB]: Default: 250GB bs=[Block size (in bytes)]: Default: 512 bytes The block size reported to the system. -nr_devices=[Number of devices]: Default: 2 +nr_devices=[Number of devices]: Default: 1 Number of block devices instantiated. They are instantiated as /dev/nullb0, etc. @@ -52,7 +52,7 @@ irqmode=[0-2]: Default: 1-Soft-irq 2: Timer: Waits a specific period (completion_nsec) for each IO before completion. -completion_nsec=[ns]: Default: 10.000ns +completion_nsec=[ns]: Default: 10,000ns Combined with irqmode=2 (timer). The time each completion event must wait. submit_queues=[1..nr_cpus]: -- cgit v1.2.3 From bf9fc98b736b8a3837040d5ce3a7caf04fa0859c Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Tue, 7 Nov 2017 09:25:37 -0700 Subject: null_blk: add an usage for shared tags in documentation Add usage explanation for a shared_tags, introduced by commit: 82f402fefa50 ("null_blk: add support for shared tags") Signed-off-by: Minwoo Im Reworded slightly. 
Signed-off-by: Jens Axboe --- Documentation/block/null_blk.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index fcf7f8487d8ee..733927a7b501a 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -77,3 +77,8 @@ use_lightnvm=[0/1]: Default: 0 no_sched=[0/1]: Default: 0 0: nullb* use default blk-mq io scheduler. 1: nullb* doesn't use io scheduler. + +shared_tags=[0/1]: Default: 0 + 0: Tag set is not shared. + 1: Tag set shared between devices for blk-mq. Only makes sense with + nr_devices > 1, otherwise there's no tag set to share. -- cgit v1.2.3 From 1f2cac107c591c24b60b115d6050adc213d10fc0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Nov 2017 09:13:48 -0700 Subject: blktrace: fix unlocked access to init/start-stop/teardown sg.c calls into the blktrace functions without holding the proper queue mutex for doing setup, start/stop, or teardown. Add internal unlocked variants, and export the ones that do the proper locking. Fixes: 6da127ad0918 ("blktrace: Add blktrace ioctls to SCSI generic devices") Tested-by: Dmitry Vyukov Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 58 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 45a3928544ce5..ea57dd94b2b26 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -336,7 +336,7 @@ static void blk_trace_cleanup(struct blk_trace *bt) blk_unregister_tracepoints(); } -int blk_trace_remove(struct request_queue *q) +static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; @@ -349,6 +349,17 @@ int blk_trace_remove(struct request_queue *q) return 0; } + +int blk_trace_remove(struct request_queue *q) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_remove(q); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_remove); static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, @@ -550,9 +561,8 @@ err: return ret; } -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) +static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; int ret; @@ -571,6 +581,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } return 0; } + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_setup(q, name, dev, bdev, arg); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) @@ -607,7 +630,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, } #endif -int blk_trace_startstop(struct request_queue *q, int start) +static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; struct blk_trace *bt = q->blk_trace; @@ -646,6 +669,17 @@ int blk_trace_startstop(struct request_queue *q, int start) return ret; } + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_startstop(q, start); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_startstop); /* @@ -676,7 +710,7 @@ int 
blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: @@ -687,10 +721,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) case BLKTRACESTART: start = 1; case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); + ret = __blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); + ret = __blk_trace_remove(q); break; default: ret = -ENOTTY; @@ -708,10 +742,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { + mutex_lock(&q->blk_trace_mutex); + if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); + __blk_trace_startstop(q, 0); + __blk_trace_remove(q); } + + mutex_unlock(&q->blk_trace_mutex); } #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From a6da0024ffc19e0d47712bb5ca4fd083f76b07df Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Nov 2017 09:16:09 -0700 Subject: blktrace: fix unlocked registration of tracepoints We need to ensure that tracepoints are registered and unregistered with the users of them. The existing atomic count isn't enough for that. Add a lock around the tracepoints, so we serialize access to them. This fixes cases where we have multiple users setting up and tearing down tracepoints, like this: CPU: 0 PID: 2995 Comm: syzkaller857118 Not tainted 4.14.0-rc5-next-20171018+ #36 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x41c kernel/panic.c:183 __warn+0x1c4/0x1e0 kernel/panic.c:546 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:177 do_trap_no_signal arch/x86/kernel/traps.c:211 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:260 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:297 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:310 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:tracepoint_add_func kernel/tracepoint.c:210 [inline] RIP: 0010:tracepoint_probe_register_prio+0x397/0x9a0 kernel/tracepoint.c:283 RSP: 0018:ffff8801d1d1f6c0 EFLAGS: 00010293 RAX: ffff8801d22e8540 RBX: 00000000ffffffef RCX: ffffffff81710f07 RDX: 0000000000000000 RSI: ffffffff85b679c0 RDI: ffff8801d5f19818 RBP: ffff8801d1d1f7c8 R08: ffffffff81710c10 R09: 0000000000000004 R10: ffff8801d1d1f6b0 R11: 0000000000000003 R12: ffffffff817597f0 R13: 0000000000000000 R14: 00000000ffffffff R15: ffff8801d1d1f7a0 tracepoint_probe_register+0x2a/0x40 kernel/tracepoint.c:304 register_trace_block_rq_insert include/trace/events/block.h:191 [inline] blk_register_tracepoints+0x1e/0x2f0 kernel/trace/blktrace.c:1043 do_blk_trace_setup+0xa10/0xcf0 kernel/trace/blktrace.c:542 blk_trace_setup+0xbd/0x180 kernel/trace/blktrace.c:564 sg_ioctl+0xc71/0x2d90 drivers/scsi/sg.c:1089 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x444339 RSP: 002b:00007ffe05bb5b18 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000006d66c0 RCX: 0000000000444339 RDX: 000000002084cf90 RSI: 00000000c0481273 RDI: 0000000000000009 RBP: 0000000000000082 
R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000206 R12: ffffffffffffffff R13: 00000000c0481273 R14: 0000000000000000 R15: 0000000000000000 since we can now run these in parallel. Ensure that the exported helpers for doing this are grabbing the queue trace mutex. Reported-by: Steven Rostedt Tested-by: Dmitry Vyukov Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ea57dd94b2b26..206e0e2ace53d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = { }; /* Global reference count of probes */ -static atomic_t blk_probes_ref = ATOMIC_INIT(0); +static DEFINE_MUTEX(blk_probe_mutex); +static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); @@ -329,11 +330,26 @@ static void blk_trace_free(struct blk_trace *bt) kfree(bt); } +static void get_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (++blk_probes_ref == 1) + blk_register_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void put_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (!--blk_probes_ref) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + static void blk_trace_cleanup(struct blk_trace *bt) { blk_trace_free(bt); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); + put_probe_ref(); } static int __blk_trace_remove(struct request_queue *q) @@ -549,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (cmpxchg(&q->blk_trace, NULL, bt)) goto err; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); ret = 0; err: @@ -1596,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - + put_probe_ref(); blk_trace_free(bt); return 0; } @@ -1629,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q, if (cmpxchg(&q->blk_trace, NULL, bt)) goto free_bt; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); return 0; free_bt: -- cgit v1.2.3 From e54b064cb24c8268252336ccdd6523e08c0dcbde Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:51 +0300 Subject: nvme: move the dying queue check from cancel to completion With multipath we don't want a hard DNR bit on a request that is cancelled by a controller reset, but instead want to be able to retry it on another patch. To archive this don't always set the DNR bit when the queue is dying in nvme_cancel_request, but defer that decision to nvme_req_needs_retry. Note that it applies to any command there and not just cancelled commands, but one the queue is dying that is the right thing to do anyway. 
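For illustration only: with the DNR bit no longer forced at cancellation time, an upper layer can distinguish "never retry" from "retry elsewhere". A minimal sketch of that decision; the failover helpers named here are assumptions for illustration, not code from this series:

	/* Sketch: completion-side handling in a multipath-aware caller. */
	if (nvme_req(req)->status & NVME_SC_DNR)
		fail_request(req);		/* controller forbids any retry */
	else
		retry_on_other_path(req);	/* hypothetical failover helper */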
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 64744355aa88e..a9c7ad304ec5d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -172,6 +172,8 @@ static inline bool nvme_req_needs_retry(struct request *req) return false; if (nvme_req(req)->retries >= nvme_max_retries) return false; + if (blk_queue_dying(req->q)) + return false; return true; } @@ -189,18 +191,13 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq); void nvme_cancel_request(struct request *req, void *data, bool reserved) { - int status; - if (!blk_mq_request_started(req)) return; dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); - status = NVME_SC_ABORT_REQ; - if (blk_queue_dying(req->q)) - status |= NVME_SC_DNR; - nvme_req(req)->status = status; + nvme_req(req)->status = NVME_SC_ABORT_REQ; blk_mq_complete_request(req); } -- cgit v1.2.3 From b5be3b392998a5f3763dfbdb1866f81d8ee631c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:52 +0300 Subject: nvme: always unregister the integrity profile in __nvme_revalidate_disk This is safe because the queue is always frozen when we revalidate, and it simplifies both the existing code as well as the multipath implementation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a9c7ad304ec5d..da5357c66a0f2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1081,29 +1081,6 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } #ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, - u16 bs) -{ - struct nvme_ns *ns = disk->private_data; - u16 old_ms = ns->ms; - u8 pi_type = 0; - - ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); - - /* PI implementation requires metadata equal t10 pi tuple size */ - if (ns->ms == sizeof(struct t10_pi_tuple)) - pi_type = id->dps & NVME_NS_DPS_PI_MASK; - - if (blk_get_integrity(disk) && - (ns->pi_type != pi_type || ns->ms != old_ms || - bs != queue_logical_block_size(disk->queue) || - (ns->ms && ns->ext))) - blk_integrity_unregister(disk); - - ns->pi_type = pi_type; -} - static void nvme_init_integrity(struct nvme_ns *ns) { struct blk_integrity integrity; @@ -1130,10 +1107,6 @@ static void nvme_init_integrity(struct nvme_ns *ns) blk_queue_max_integrity_segments(ns->queue, 1); } #else -static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, - u16 bs) -{ -} static void nvme_init_integrity(struct nvme_ns *ns) { } @@ -1202,15 +1175,22 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) ns->lba_shift = 9; bs = 1 << ns->lba_shift; ns->noiob = le16_to_cpu(id->noiob); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + /* the PI implementation requires metadata equal t10 pi tuple size */ + if (ns->ms == sizeof(struct t10_pi_tuple)) + ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; + else + ns->pi_type = 0; 
blk_mq_freeze_queue(disk->queue); + blk_integrity_unregister(disk); - if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) - nvme_prep_integrity(disk, id, bs); blk_queue_logical_block_size(ns->queue, bs); if (ns->noiob) nvme_set_chunk_size(ns); - if (ns->ms && !blk_get_integrity(disk) && !ns->ext) + if (ns->ms && !ns->ext && + (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(ns); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) set_capacity(disk, 0); -- cgit v1.2.3 From 39b7baa410fd00a0fe9da864400d95b3d5008de7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:53 +0300 Subject: nvme: don't pass struct nvme_ns to nvme_init_integrity To allow reusing this function for the multipath node. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index da5357c66a0f2..88d886b390f41 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1081,12 +1081,12 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } #ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_init_integrity(struct nvme_ns *ns) +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) { struct blk_integrity integrity; memset(&integrity, 0, sizeof(integrity)); - switch (ns->pi_type) { + switch (pi_type) { case NVME_NS_DPS_PI_TYPE3: integrity.profile = &t10_pi_type3_crc; integrity.tag_size = sizeof(u16) + sizeof(u32); @@ -1102,12 +1102,12 @@ static void nvme_init_integrity(struct nvme_ns *ns) integrity.profile = NULL; break; } - integrity.tuple_size = ns->ms; - blk_integrity_register(ns->disk, &integrity); - blk_queue_max_integrity_segments(ns->queue, 1); + integrity.tuple_size = ms; + blk_integrity_register(disk, &integrity); + blk_queue_max_integrity_segments(disk->queue, 1); } #else -static void nvme_init_integrity(struct nvme_ns *ns) +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ @@ -1191,7 +1191,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) nvme_set_chunk_size(ns); if (ns->ms && !ns->ext && (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - nvme_init_integrity(ns); + nvme_init_integrity(disk, ns->ms, ns->pi_type); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) set_capacity(disk, 0); else -- cgit v1.2.3 From 30e5e929c7bfaf87a65f926715773ef14339b79f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:54 +0300 Subject: nvme: don't pass struct nvme_ns to nvme_config_discard To allow reusing this function for the multipath node. 
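As a rough sketch of the intended reuse (only the per-namespace call exists in this series; the multipath symbols below are illustrative assumptions):

	/* Existing per-namespace caller after the refactor. */
	nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);

	/* A multipath head device could later pass its own queue. */
	nvme_config_discard(ctrl, head_stream_alignment, head_disk->queue);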
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 88d886b390f41..551ec5df6a97f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1118,29 +1118,26 @@ static void nvme_set_chunk_size(struct nvme_ns *ns) blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); } -static void nvme_config_discard(struct nvme_ns *ns) +static void nvme_config_discard(struct nvme_ctrl *ctrl, + unsigned stream_alignment, struct request_queue *queue) { - struct nvme_ctrl *ctrl = ns->ctrl; - u32 logical_block_size = queue_logical_block_size(ns->queue); + u32 size = queue_logical_block_size(queue); + + if (stream_alignment) + size *= stream_alignment; BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - if (ctrl->nr_streams && ns->sws && ns->sgs) { - unsigned int sz = logical_block_size * ns->sws * ns->sgs; + queue->limits.discard_alignment = size; + queue->limits.discard_granularity = size; - ns->queue->limits.discard_alignment = sz; - ns->queue->limits.discard_granularity = sz; - } else { - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - } - blk_queue_max_discard_sectors(ns->queue, UINT_MAX); - blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES); - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); + blk_queue_max_discard_sectors(queue, UINT_MAX); + blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) - blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX); + blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, @@ -1164,6 +1161,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; + unsigned stream_alignment = 0; u16 bs; /* @@ -1183,6 +1181,9 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) else ns->pi_type = 0; + if (ctrl->nr_streams && ns->sws && ns->sgs) + stream_alignment = ns->sws * ns->sgs; + blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); @@ -1198,7 +1199,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); if (ctrl->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); + nvme_config_discard(ctrl, stream_alignment, disk->queue); blk_mq_unfreeze_queue(disk->queue); } -- cgit v1.2.3 From 6e78f21ae4488cdab4318f781de05c8ffc4de951 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:55 +0300 Subject: nvme: set the chunk size before freezing the queue We don't need a frozen queue to update the chunk_size, which just is a hint, and moving it a little earlier will allow for some better code reuse with the multipath code. 
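The resulting ordering can be summarized as follows; this is a simplified sketch of the revalidation flow, not the literal driver code:

	/* Hints that do not change how data is interpreted: no freeze needed. */
	if (ns->noiob)
		nvme_set_chunk_size(ns);

	/* Settings that affect data layout: apply with the queue frozen. */
	blk_mq_freeze_queue(disk->queue);
	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	/* ... integrity profile, capacity, discard granularity ... */
	blk_mq_unfreeze_queue(disk->queue);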
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 551ec5df6a97f..a4be843334baf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1184,12 +1184,13 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ctrl->nr_streams && ns->sws && ns->sgs) stream_alignment = ns->sws * ns->sgs; + if (ns->noiob) + nvme_set_chunk_size(ns); + blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); blk_queue_logical_block_size(ns->queue, bs); - if (ns->noiob) - nvme_set_chunk_size(ns); if (ns->ms && !ns->ext && (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(disk, ns->ms, ns->pi_type); -- cgit v1.2.3 From 24b0b58c5b4a9f6f4bbd1c1efd5afb9a565da3ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:56 +0300 Subject: nvme: split __nvme_revalidate_disk Split out the code that applies the calculate value to a given disk/queue into new helper that can be reused by the multipath code. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 49 +++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a4be843334baf..28d58d353a420 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1157,12 +1157,34 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, } } +static void nvme_update_disk_info(struct gendisk *disk, + struct nvme_ns *ns, struct nvme_id_ns *id) +{ + sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9); + unsigned stream_alignment = 0; + + if (ns->ctrl->nr_streams && ns->sws && ns->sgs) + stream_alignment = ns->sws * ns->sgs; + + blk_mq_freeze_queue(disk->queue); + blk_integrity_unregister(disk); + + blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift); + if (ns->ms && !ns->ext && + (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) + nvme_init_integrity(disk, ns->ms, ns->pi_type); + if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + capacity = 0; + set_capacity(disk, capacity); + + if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns->ctrl, stream_alignment, disk->queue); + blk_mq_unfreeze_queue(disk->queue); +} + static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; - struct nvme_ctrl *ctrl = ns->ctrl; - unsigned stream_alignment = 0; - u16 bs; /* * If identify namespace failed, use default 512 byte block size so @@ -1171,7 +1193,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; - bs = 1 << ns->lba_shift; ns->noiob = le16_to_cpu(id->noiob); ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); @@ -1181,27 +1202,9 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) else ns->pi_type = 0; - if (ctrl->nr_streams && ns->sws && ns->sgs) - stream_alignment = ns->sws * ns->sgs; - if (ns->noiob) nvme_set_chunk_size(ns); - - blk_mq_freeze_queue(disk->queue); - blk_integrity_unregister(disk); - - 
blk_queue_logical_block_size(ns->queue, bs); - if (ns->ms && !ns->ext && - (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - nvme_init_integrity(disk, ns->ms, ns->pi_type); - if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) - set_capacity(disk, 0); - else - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (ctrl->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ctrl, stream_alignment, disk->queue); - blk_mq_unfreeze_queue(disk->queue); + nvme_update_disk_info(disk, ns, id); } static int nvme_revalidate_disk(struct gendisk *disk) -- cgit v1.2.3 From 715ea9e09dc81f18cad5ed64800f49b1c4c444df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Nov 2017 17:27:34 +0100 Subject: nvme: fix and clarify the check for missing metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the check in nvme_setup_rw for missing metadata so that it is together with the other metadata handling, does not contain impossible to reach conditions and warns if we get an impossible requests for a (non-PI) metadata-enabled namespace when CONFIG_BLK_DEV_INTEGRITY is not set. Also add a little helper that checks if a given metadata configuration contains protection information Signed-off-by: Christoph Hellwig Reported-by: Javier González Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 28d58d353a420..3aabaa103e106 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -137,6 +137,11 @@ int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); +static inline bool nvme_ns_has_pi(struct nvme_ns *ns) +{ + return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); +} + static blk_status_t nvme_error_status(struct request *req) { switch (nvme_req(req)->status & 0x7ff) { @@ -472,16 +477,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, u16 control = 0; u32 dsmgmt = 0; - /* - * If formated with metadata, require the block layer provide a buffer - * unless this namespace is formated such that the metadata can be - * stripped/generated by the controller with PRACT=1. - */ - if (ns && ns->ms && - (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) && - !blk_integrity_rq(req) && !blk_rq_is_passthrough(req)) - return BLK_STS_NOTSUPP; - if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) @@ -500,6 +495,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); if (ns->ms) { + /* + * If formated with metadata, the block layer always provides a + * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else + * we enable the PRACT bit for protection information or set the + * namespace capacity to zero to prevent any I/O. 
+ */ + if (!blk_integrity_rq(req)) { + if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) + return BLK_STS_NOTSUPP; + control |= NVME_RW_PRINFO_PRACT; + } + switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: control |= NVME_RW_PRINFO_PRCHK_GUARD; @@ -512,8 +519,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, nvme_block_nr(ns, blk_rq_pos(req))); break; } - if (!blk_integrity_rq(req)) - control |= NVME_RW_PRINFO_PRACT; } cmnd->rw.control = cpu_to_le16(control); @@ -1173,7 +1178,7 @@ static void nvme_update_disk_info(struct gendisk *disk, if (ns->ms && !ns->ext && (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(disk, ns->ms, ns->pi_type); - if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) capacity = 0; set_capacity(disk, capacity); -- cgit v1.2.3 From c627c487ec727350c6aaa1cf217b666099e83ee2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 10:28:31 -0700 Subject: nvme: factor get log into a helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit And fix the warning on a successful firmware log. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3aabaa103e106..90b6375a9da5b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1747,6 +1747,18 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); } +static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, + size_t size) +{ + struct nvme_command c = { }; + + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(NVME_NSID_ALL); + c.common.cdw10[0] = nvme_get_log_dw10(log_page, size); + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -2579,18 +2591,13 @@ static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) { - struct nvme_command c = { }; struct nvme_fw_slot_info_log *log; log = kmalloc(sizeof(*log), GFP_KERNEL); if (!log) return; - c.common.opcode = nvme_admin_get_log_page; - c.common.nsid = cpu_to_le32(NVME_NSID_ALL); - c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log)); - - if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log))) + if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); -- cgit v1.2.3 From 84fef62d135b6e47b52f4e9280b5dbc5bb0050ba Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 10:28:32 -0700 Subject: nvme: check admin passthru command effects The NVMe standard provides a command effects log page so the host may be aware of special requirements it may need to do for a particular command. For example, the command may need to run with IO quiesced to prevent timeouts or undefined behavior, or it may change the logical block formats that determine how the host needs to construct future commands. 
This patch saves the nvme command effects log page if the controller supports it, and performs appropriate actions before and after an admin passthrough command is completed. If the controller does not support the command effects log page, the driver will define the effects for known opcodes. The nvme format and santize are the only commands in this patch with known effects. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 18 ++++++++ 3 files changed, 126 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 90b6375a9da5b..65fd2fc1ae3c2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -72,6 +72,9 @@ static DEFINE_IDA(nvme_instance_ida); static dev_t nvme_chr_devt; static struct class *nvme_class; +static void nvme_ns_remove(struct nvme_ns *ns); +static int nvme_revalidate_disk(struct gendisk *disk); + static __le32 nvme_get_log_dw10(u8 lid, size_t size) { return cpu_to_le32((((size / 4) - 1) << 16) | lid); @@ -992,12 +995,87 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) metadata, meta_len, io.slba, NULL, 0); } +static u32 nvme_known_admin_effects(u8 opcode) +{ + switch (opcode) { + case nvme_admin_format_nvm: + return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK; + case nvme_admin_sanitize_nvm: + return NVME_CMD_EFFECTS_CSE_MASK; + default: + break; + } + return 0; +} + +static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode) +{ + u32 effects = 0; + + if (ns) { + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + if (effects & ~NVME_CMD_EFFECTS_CSUPP) + dev_warn(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); + return 0; + } + + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + else + effects = nvme_known_admin_effects(opcode); + + /* + * For simplicity, IO to all namespaces is quiesced even if the command + * effects say only one namespace is affected. + */ + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + } + return effects; +} + +static void nvme_update_formats(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->disk && nvme_revalidate_disk(ns->disk)) + nvme_ns_remove(ns); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +{ + /* + * Revalidate LBA changes prior to unfreezing. This is necessary to + * prevent memory corruption if a logical block size was changed by + * this command. 
+ */ + if (effects & NVME_CMD_EFFECTS_LBCC) + nvme_update_formats(ctrl); + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) + nvme_unfreeze(ctrl); + if (effects & NVME_CMD_EFFECTS_CCC) + nvme_init_identify(ctrl); + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) + nvme_queue_scan(ctrl); +} + static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; struct nvme_command c; unsigned timeout = 0; + u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1023,10 +1101,13 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); + effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, (void __user *)(uintptr_t)cmd.metadata, cmd.metadata, 0, &cmd.result, timeout); + nvme_passthru_end(ctrl, effects); + if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) return -EFAULT; @@ -1759,6 +1840,25 @@ static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } +static int nvme_get_effects_log(struct nvme_ctrl *ctrl) +{ + int ret; + + if (!ctrl->effects) + ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + + if (!ctrl->effects) + return 0; + + ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, + sizeof(*ctrl->effects)); + if (ret) { + kfree(ctrl->effects); + ctrl->effects = NULL; + } + return ret; +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -1794,6 +1894,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { + ret = nvme_get_effects_log(ctrl); + if (ret < 0) + return ret; + } + nvme_init_subnqn(ctrl, id); if (!ctrl->identified) { @@ -2713,6 +2819,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); + kfree(ctrl->effects); ctrl->ops->free_ctrl(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 35d9cee515f1b..f7c21cea94853 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -174,6 +174,7 @@ struct nvme_ctrl { bool subsystem; unsigned long quirks; struct nvme_id_power_state psd[32]; + struct nvme_effects_log *effects; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9310ce77d8e1f..fd1d4508a6129 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -267,6 +267,7 @@ enum { NVME_CTRL_OACS_SEC_SUPP = 1 << 0, NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, + NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, }; struct nvme_lbaf { @@ -395,6 +396,21 @@ struct nvme_fw_slot_info_log { __u8 rsvd64[448]; }; +enum { + NVME_CMD_EFFECTS_CSUPP = 1 << 0, + NVME_CMD_EFFECTS_LBCC = 1 << 1, + NVME_CMD_EFFECTS_NCC = 1 << 2, + NVME_CMD_EFFECTS_NIC = 1 << 3, + NVME_CMD_EFFECTS_CCC = 1 << 4, + NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, +}; + +struct nvme_effects_log { + __le32 acs[256]; + __le32 iocs[256]; + __u8 resv[2048]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -681,6 +697,7 @@ enum nvme_admin_opcode { nvme_admin_format_nvm = 0x80, nvme_admin_security_send 
= 0x81, nvme_admin_security_recv = 0x82, + nvme_admin_sanitize_nvm = 0x84, }; enum { @@ -712,6 +729,7 @@ enum { NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_CMD_EFFECTS = 0x05, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), -- cgit v1.2.3 From c5760f300e25f7d2cea9a9002b1e058a5c18b73f Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Nov 2017 08:13:16 -0700 Subject: nvme-fc: fix localport resume using stale values The localport resume was not updating the lldd ops structure. If the lldd is unloaded and reloaded, the ops pointers will differ. Additionally, as there are device references taken by the localport, ensure that resume only resumes if the device matches as well. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 113c30be7276b..c49971d1a0515 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -268,7 +268,9 @@ nvme_fc_lport_get(struct nvme_fc_lport *lport) static struct nvme_fc_lport * -nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) +nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo, + struct nvme_fc_port_template *ops, + struct device *dev) { struct nvme_fc_lport *lport; unsigned long flags; @@ -280,6 +282,11 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) lport->localport.port_name != pinfo->port_name) continue; + if (lport->dev != dev) { + lport = ERR_PTR(-EXDEV); + goto out_done; + } + if (lport->localport.port_state != FC_OBJSTATE_DELETED) { lport = ERR_PTR(-EEXIST); goto out_done; @@ -296,6 +303,7 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) /* resume the lport */ + lport->ops = ops; lport->localport.port_role = pinfo->port_role; lport->localport.port_id = pinfo->port_id; lport->localport.port_state = FC_OBJSTATE_ONLINE; @@ -356,7 +364,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, * expired, we can simply re-enable the localport. Remoteports * and controller reconnections should resume naturally. */ - newrec = nvme_fc_attach_to_unreg_lport(pinfo); + newrec = nvme_fc_attach_to_unreg_lport(pinfo, template, dev); /* found an lport, but something about its state is bad */ if (IS_ERR(newrec)) { -- cgit v1.2.3 From 158bfb8888c3f2beab747ab4f7a8197639f03e54 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Nov 2017 08:13:17 -0700 Subject: nvme-fc: decouple ns references from lldd references In the lldd api, a lldd may unregister a remoteport (loss of connectivity or driver unload) or localport (driver unload). The lldd must wait for the remoteport_delete or localport_delete before completing its actions post the unregister. The xxx_deletes currently occur only when the xxxport structure is fully freed after all references are removed. Thus the lldd may be held hostage until an app or in-kernel entity that has a namespace open finally closes so the namespace can be removed, the controller removed, thus the transport objects, thus the lldd. This patch decouples the transport and os-facing objects from the lldd and the remoteport and localport. There is a point in all deletions where the transport will no longer interact with the lldd on behalf of a controller. That point centers around the association established with the target/subsystem. 
It will access the lldd whenever it attempts to create an association and while the association is active. New associations may only be created if the remoteport is live (thus the localport is live). It will not access the lldd after deleting the association. Therefore, the patch tracks the count of active controllers - those with associations being created or that are active - on a remoteport. It also tracks the number of remoteports that have active controllers, on a a localport. When a remoteport is unregistered, as soon as there are no active controllers, the lldd's remoteport_delete may be called and the lldd may continue. Similarly, when a localport is unregistered, as soon as there are no remoteports with active controllers, the localport_delete callback may be made. This significantly speeds up unregistration with the lldd. The transport objects continue in suspended status with reconnect timers running, and upon expiration, normal ref-counting will occur and the objects will be freed. The transport object may still be held hostage by the application/kernel module, but that is acceptable. With this change, the lldd may be fully unloaded and reloaded, and if registrations occur prior to the timeouts, the nvme controller and namespaces will resume normally as if a link bounce. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index c49971d1a0515..da9296a1eb269 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -126,6 +126,7 @@ struct nvme_fc_lport { struct device *dev; /* physical device for dma */ struct nvme_fc_port_template *ops; struct kref ref; + atomic_t act_rport_cnt; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ struct nvme_fc_rport { @@ -138,6 +139,7 @@ struct nvme_fc_rport { struct nvme_fc_lport *lport; spinlock_t lock; struct kref ref; + atomic_t act_ctrl_cnt; unsigned long dev_loss_end; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ @@ -153,6 +155,7 @@ struct nvme_fc_ctrl { struct nvme_fc_rport *rport; u32 cnum; + bool assoc_active; u64 association_id; struct list_head ctrl_list; /* rport->ctrl_list */ @@ -243,9 +246,6 @@ nvme_fc_free_lport(struct kref *ref) list_del(&lport->port_list); spin_unlock_irqrestore(&nvme_fc_lock, flags); - /* let the LLDD know we've finished tearing it down */ - lport->ops->localport_delete(&lport->localport); - ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); ida_destroy(&lport->endp_cnt); @@ -400,6 +400,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, INIT_LIST_HEAD(&newrec->port_list); INIT_LIST_HEAD(&newrec->endp_list); kref_init(&newrec->ref); + atomic_set(&newrec->act_rport_cnt, 0); newrec->ops = template; newrec->dev = dev; ida_init(&newrec->endp_cnt); @@ -462,6 +463,9 @@ nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr) spin_unlock_irqrestore(&nvme_fc_lock, flags); + if (atomic_read(&lport->act_rport_cnt) == 0) + lport->ops->localport_delete(&lport->localport); + nvme_fc_lport_put(lport); return 0; @@ -515,9 +519,6 @@ nvme_fc_free_rport(struct kref *ref) list_del(&rport->endp_list); spin_unlock_irqrestore(&nvme_fc_lock, flags); - /* let the LLDD know we've finished tearing it down */ - lport->ops->remoteport_delete(&rport->remoteport); - ida_simple_remove(&lport->endp_cnt, 
rport->remoteport.port_num); kfree(rport); @@ -704,6 +705,7 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, INIT_LIST_HEAD(&newrec->ctrl_list); INIT_LIST_HEAD(&newrec->ls_req_list); kref_init(&newrec->ref); + atomic_set(&newrec->act_ctrl_cnt, 0); spin_lock_init(&newrec->lock); newrec->remoteport.localport = &lport->localport; newrec->dev = lport->dev; @@ -859,6 +861,9 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) nvme_fc_abort_lsops(rport); + if (atomic_read(&rport->act_ctrl_cnt) == 0) + rport->lport->ops->remoteport_delete(portptr); + /* * release the reference, which will allow, if all controllers * go away, which should only occur after dev_loss_tmo occurs, @@ -2639,6 +2644,61 @@ out_free_io_queues: return ret; } +static void +nvme_fc_rport_active_on_lport(struct nvme_fc_rport *rport) +{ + struct nvme_fc_lport *lport = rport->lport; + + atomic_inc(&lport->act_rport_cnt); +} + +static void +nvme_fc_rport_inactive_on_lport(struct nvme_fc_rport *rport) +{ + struct nvme_fc_lport *lport = rport->lport; + u32 cnt; + + cnt = atomic_dec_return(&lport->act_rport_cnt); + if (cnt == 0 && lport->localport.port_state == FC_OBJSTATE_DELETED) + lport->ops->localport_delete(&lport->localport); +} + +static int +nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_rport *rport = ctrl->rport; + u32 cnt; + + if (ctrl->assoc_active) + return 1; + + ctrl->assoc_active = true; + cnt = atomic_inc_return(&rport->act_ctrl_cnt); + if (cnt == 1) + nvme_fc_rport_active_on_lport(rport); + + return 0; +} + +static int +nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_rport *rport = ctrl->rport; + struct nvme_fc_lport *lport = rport->lport; + u32 cnt; + + /* ctrl->assoc_active=false will be set independently */ + + cnt = atomic_dec_return(&rport->act_ctrl_cnt); + if (cnt == 0) { + if (rport->remoteport.port_state == FC_OBJSTATE_DELETED) + lport->ops->remoteport_delete(&rport->remoteport); + nvme_fc_rport_inactive_on_lport(rport); + } + + return 0; +} + /* * This routine restarts the controller on the host side, and * on the link side, recreates the controller association. 
@@ -2655,6 +2715,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) return -ENODEV; + if (nvme_fc_ctlr_active_on_rport(ctrl)) + return -ENOTUNIQ; + /* * Create the admin queue */ @@ -2762,6 +2825,8 @@ out_delete_hw_queue: __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); out_free_queue: nvme_fc_free_queue(&ctrl->queues[0]); + ctrl->assoc_active = false; + nvme_fc_ctlr_inactive_on_rport(ctrl); return ret; } @@ -2777,6 +2842,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) { unsigned long flags; + if (!ctrl->assoc_active) + return; + ctrl->assoc_active = false; + spin_lock_irqsave(&ctrl->lock, flags); ctrl->flags |= FCCTRL_TERMIO; ctrl->iocnt = 0; @@ -2849,6 +2918,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); nvme_fc_free_queue(&ctrl->queues[0]); + + nvme_fc_ctlr_inactive_on_rport(ctrl); } static void @@ -3048,6 +3119,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->rport = rport; ctrl->dev = lport->dev; ctrl->cnum = idx; + ctrl->assoc_active = false; get_device(ctrl->dev); kref_init(&ctrl->ref); -- cgit v1.2.3 From 6ddcf0a30adc5080504ca66f474101e7ad247dd7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Nov 2017 09:33:30 -0700 Subject: lpfc: tie in to new dev_loss_tmo interface in nvme transport This patch calls the new nvme transport routine for dev_loss_tmo whenever the SCSI fc transport calls the lldd to make a dynamic change to a remote ports dev_loss_tmo. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/lpfc/lpfc_attr.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index c17677f494afe..3e02bc3a7c3f7 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -3246,6 +3246,11 @@ lpfc_update_rport_devloss_tmo(struct lpfc_vport *vport) continue; if (ndlp->rport) ndlp->rport->dev_loss_tmo = vport->cfg_devloss_tmo; +#if (IS_ENABLED(CONFIG_NVME_FC)) + if (ndlp->nrport) + nvme_fc_set_remoteport_devloss(ndlp->nrport->remoteport, + vport->cfg_devloss_tmo); +#endif } spin_unlock_irq(shost->host_lock); } -- cgit v1.2.3 From 43b92fd27aaef0f529c9321cfebbaec1d7b8f503 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 5 Nov 2017 08:43:01 +0000 Subject: nvmet-rdma: update queue list during ib_device removal A NULL deref happens when nvmet_rdma_remove_one() is called more than once (e.g. while connected via 2 ports). The first call frees the queues related to the first ib_device but doesn't remove them from the queue list. While calling nvmet_rdma_remove_one() for the second ib_device it goes over the full queue list again and we get the NULL deref. 
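The idiom relied on here is the standard one for removing entries while walking a list: iterate with the _safe variant and unlink each matching entry before tearing it down, so a later walk of the global list never touches freed memory. A generic sketch mirroring the fix below:

	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, queue_list) {
		if (queue->dev->device != ib_device)
			continue;
		list_del_init(&queue->queue_list);	/* unlink before disconnect */
		__nvmet_rdma_queue_disconnect(queue);
	}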
Fixes: f1d4ef7d ("nvmet-rdma: register ib_client to not deadlock in device removal") Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/rdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 76d2bb793afe5..3333d417b248b 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1512,15 +1512,17 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = { static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) { - struct nvmet_rdma_queue *queue; + struct nvmet_rdma_queue *queue, *tmp; /* Device is being removed, delete all queues using this device */ mutex_lock(&nvmet_rdma_queue_mutex); - list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, + queue_list) { if (queue->dev->device != ib_device) continue; pr_info("Removing queue %d\n", queue->idx); + list_del_init(&queue->queue_list); __nvmet_rdma_queue_disconnect(queue); } mutex_unlock(&nvmet_rdma_queue_mutex); -- cgit v1.2.3 From 1f61def985d896da5935ae85a69c79fc26c7952f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 6 Nov 2017 16:18:51 +0200 Subject: nvme-rdma: fix nvme_rdma_create_queue_ib error flow QP object is created using rdma_cm api, therefore the destruction should use the same api for symmetry. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 03644ecf68d29..32e21ab1ae520 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -493,7 +493,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) return 0; out_destroy_qp: - ib_destroy_qp(queue->qp); + rdma_destroy_qp(queue->cm_id); out_destroy_ib_cq: ib_free_cq(queue->ib_cq); out_put_dev: -- cgit v1.2.3 From 18c53e40487f56369c3ba9331ec3597d9b48d97c Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Tue, 7 Nov 2017 21:10:22 +0900 Subject: nvmet: fix comment typos in admin-cmd.c small typos fixed in admin-cmd.c Signed-off-by: Minwoo Im Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c4a0bf36e7521..9a7d6917e190c 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -300,7 +300,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) } /* - * nuse = ncap = nsze isn't aways true, but we have no way to find + * nuse = ncap = nsze isn't always true, but we have no way to find * that out from the underlying device. */ id->ncap = id->nuse = id->nsze = @@ -424,7 +424,7 @@ out: } /* - * A "mimimum viable" abort implementation: the command is mandatory in the + * A "minimum viable" abort implementation: the command is mandatory in the * spec, but we are not required to do any useful work. 
We couldn't really * do a useful abort, so don't bother even with waiting for the command * to be exectuted and return immediately telling the command to abort -- cgit v1.2.3 From e10237cc76ef9a4066a84aa2cc710bfd708cc341 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 7 Nov 2017 11:09:50 -0800 Subject: kthread: zero the kthread data structure kthread() could bail out early before we initialize blkcg_css (if the kthread is killed very early. Please see xchg() statement in kthread()), which confuses free_kthread_struct. Instead of moving the blkcg_css initialization early, we simply zero the whole 'self' data structure, which doesn't sound much overhead. Reported-by: syzbot Fixes: 05e3db95ebfc ("kthread: add a mechanism to store cgroup info") Cc: Andrew Morton Cc: Ingo Molnar Cc: Dmitry Vyukov Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/kthread.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index f87cd8b4eb2a1..8dbe2454cb1de 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -204,7 +204,7 @@ static int kthread(void *_create) struct kthread *self; int ret; - self = kmalloc(sizeof(*self), GFP_KERNEL); + self = kzalloc(sizeof(*self), GFP_KERNEL); set_kthread_struct(self); /* If user was SIGKILLed, I release the structure. */ @@ -220,13 +220,9 @@ static int kthread(void *_create) do_exit(-ENOMEM); } - self->flags = 0; self->data = data; init_completion(&self->exited); init_completion(&self->parked); -#ifdef CONFIG_BLK_CGROUP - self->blkcg_css = NULL; -#endif current->vfork_done = &self->exited; /* OK, tell user we're spawned, wait for stop or wakeup */ -- cgit v1.2.3 From 05b79413946d8b2b58999ea1ae844b6fc3c54f61 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Nov 2017 10:38:29 -0700 Subject: Revert "blk-mq: don't handle TAG_SHARED in restart" This reverts commit 358a3a6bccb74da9d63a26b2dd5f09f1e9970e0b. We have cases that aren't covered 100% in the drivers, so for now we have to retain the shared tag restart loops. Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 01a43fed6b8ce..6f4bdb8209f7d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,17 +68,25 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return; + return false; - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_dec(&q->shared_hctx_restart); + } else + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); - return; + return true; } + + return false; } /* @@ -362,6 +370,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return false; } +/** + * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list + * @pos: loop cursor. + * @skip: the list element that will not be examined. Iteration starts at + * @skip->next. + * @head: head of the list to examine. This list must have at least one + * element, namely @skip. 
+ * @member: name of the list_head structure within typeof(*pos). + */ +#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ + for ((pos) = (skip); \ + (pos = (pos)->member.next != (head) ? list_entry_rcu( \ + (pos)->member.next, typeof(*pos), member) : \ + list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ + (pos) != (skip); ) + +/* + * Called after a driver tag has been freed to check whether a hctx needs to + * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware + * queues in a round-robin fashion if the tag set of @hctx is shared with other + * hardware queues. + */ +void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) +{ + struct blk_mq_tags *const tags = hctx->tags; + struct blk_mq_tag_set *const set = hctx->queue->tag_set; + struct request_queue *const queue = hctx->queue, *q; + struct blk_mq_hw_ctx *hctx2; + unsigned int i, j; + + if (set->flags & BLK_MQ_F_TAG_SHARED) { + /* + * If this is 0, then we know that no hardware queues + * have RESTART marked. We're done. + */ + if (!atomic_read(&queue->shared_hctx_restart)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu_rr(q, queue, &set->tag_list, + tag_set_list) { + queue_for_each_hw_ctx(q, hctx2, i) + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + goto done; + } + j = hctx->queue_num + 1; + for (i = 0; i < queue->nr_hw_queues; i++, j++) { + if (j == queue->nr_hw_queues) + j = 0; + hctx2 = queue->queue_hw_ctx[j]; + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + break; + } +done: + rcu_read_unlock(); + } else { + blk_mq_sched_restart_hctx(hctx); + } +} + void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { -- cgit v1.2.3 From 83f5f7ed72f316eda4c4c3833beebfe6578926f4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Nov 2017 11:15:37 -0700 Subject: block: kill bio_kmap/kunmap_irq() There are no users of it anymore. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 4 ++-- include/linux/bio.h | 11 ----------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 9490f2845f069..01c0a03407cc1 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -216,7 +216,7 @@ may need to abort DMA operations and revert to PIO for the transfer, in which case a virtual mapping of the page is required. For SCSI it is also done in some scenarios where the low level driver cannot be trusted to handle a single sg entry correctly. The driver is expected to perform the -kmaps as needed on such occasions using the __bio_kmap_atomic and bio_kmap_irq +kmaps as needed on such occasions using the bio_kmap_irq and friends routines as appropriate. A driver could also use the blk_queue_bounce() routine on its own to bounce highmem i/o to low memory for specific requests if so desired. @@ -1137,7 +1137,7 @@ use dma_map_sg for scatter gather) to be able to ship it to the driver. For PIO drivers (or drivers that need to revert to PIO transfer once in a while (IDE for example)), where the CPU is doing the actual data transfer a virtual mapping is needed. If the driver supports highmem I/O, -(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic and bio_kmap_irq to +(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic or similar to temporarily map a bio into the virtual address space. 
diff --git a/include/linux/bio.h b/include/linux/bio.h index 9c75f58f6a501..1d7e63d7505f6 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -573,17 +573,6 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) } #endif -static inline char *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter, - unsigned long *flags) -{ - return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags); -} -#define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) - -#define bio_kmap_irq(bio, flags) \ - __bio_kmap_irq((bio), (bio)->bi_iter, (flags)) -#define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * -- cgit v1.2.3 From d004a5e7d4dd6335ce6e2044af42f5e0fbebb51d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Nov 2017 19:13:48 +0100 Subject: block: remove __bio_kmap_atomic This helper doesn't buy us much over calling kmap_atomic directly. In fact in the only caller it does a bit of useless work as the caller already has the bvec at hand, and said caller would even buggy for a multi-segment bio due to the use of this helper. So just remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 11 +++++------ arch/xtensa/platforms/iss/simdisk.c | 4 ++-- block/blk-settings.c | 2 +- include/linux/bio.h | 12 ------------ 4 files changed, 8 insertions(+), 21 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 01c0a03407cc1..86927029a52db 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -216,10 +216,9 @@ may need to abort DMA operations and revert to PIO for the transfer, in which case a virtual mapping of the page is required. For SCSI it is also done in some scenarios where the low level driver cannot be trusted to handle a single sg entry correctly. The driver is expected to perform the -kmaps as needed on such occasions using the bio_kmap_irq and friends -routines as appropriate. A driver could also use the blk_queue_bounce() -routine on its own to bounce highmem i/o to low memory for specific requests -if so desired. +kmaps as needed on such occasions as appropriate. A driver could also use +the blk_queue_bounce() routine on its own to bounce highmem i/o to low +memory for specific requests if so desired. iii. The i/o scheduler algorithm itself can be replaced/set as appropriate @@ -1137,8 +1136,8 @@ use dma_map_sg for scatter gather) to be able to ship it to the driver. For PIO drivers (or drivers that need to revert to PIO transfer once in a while (IDE for example)), where the CPU is doing the actual data transfer a virtual mapping is needed. If the driver supports highmem I/O, -(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic or similar to -temporarily map a bio into the virtual address space. +(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map +a bio into the virtual address space. 8. 
Prior/Related/Impacted patches diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index c45b90bb93393..eacf1e433518b 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -110,13 +110,13 @@ static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio) sector_t sector = bio->bi_iter.bi_sector; bio_for_each_segment(bvec, bio, iter) { - char *buffer = __bio_kmap_atomic(bio, iter); + char *buffer = kmap_atomic(bvec.bv_page) + bvec.bv_offset; unsigned len = bvec.bv_len >> SECTOR_SHIFT; simdisk_transfer(dev, sector, len, buffer, bio_data_dir(bio) == WRITE); sector += len; - __bio_kunmap_atomic(buffer); + kunmap_atomic(buffer) } bio_endio(bio); diff --git a/block/blk-settings.c b/block/blk-settings.c index 8559e9563c525..48ebe6be07b75 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(blk_set_stacking_limits); * Caveat: * The driver that does this *must* be able to deal appropriately * with buffers in "highmemory". This can be accomplished by either calling - * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling + * kmap_atomic() to get a temporary kernel mapping, or by calling * blk_queue_bounce() to create a buffer in normal memory. **/ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) diff --git a/include/linux/bio.h b/include/linux/bio.h index 1d7e63d7505f6..d4eec19a6d3c3 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -128,18 +128,6 @@ static inline void *bio_data(struct bio *bio) */ #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) -/* - * queues that have highmem support enabled may still need to revert to - * PIO transfers occasionally and thus map high pages temporarily. For - * permanent PIO fall back, user is probably better off disabling highmem - * I/O completely on that queue (see ide-dma for example) - */ -#define __bio_kmap_atomic(bio, iter) \ - (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \ - bio_iter_iovec((bio), (iter)).bv_offset) - -#define __bio_kunmap_atomic(addr) kunmap_atomic(addr) - /* * merge helpers etc */ -- cgit v1.2.3 From f00c4d80ffdac9e3a64947ebd57489f3232d5a74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 5 Nov 2017 10:36:31 +0300 Subject: block: pass full fmode_t to blk_verify_command Use the obvious calling convention. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bsg.c | 18 ++++++++---------- block/scsi_ioctl.c | 8 ++++---- drivers/scsi/sg.c | 2 +- include/linux/blkdev.h | 2 +- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/block/bsg.c b/block/bsg.c index ee1335c68de7f..452f94f1c5d42 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -137,7 +137,7 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, - fmode_t has_write_perm) + fmode_t mode) { struct scsi_request *req = scsi_req(rq); @@ -152,7 +152,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(req->cmd, has_write_perm)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -206,7 +206,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op) * map sg_io_v4 to a request. 
*/ static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) +bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) { struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; @@ -237,7 +237,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) if (IS_ERR(rq)) return rq; - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode); if (ret) goto out; @@ -587,8 +587,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) } static int __bsg_write(struct bsg_device *bd, const char __user *buf, - size_t count, ssize_t *bytes_written, - fmode_t has_write_perm) + size_t count, ssize_t *bytes_written, fmode_t mode) { struct bsg_command *bc; struct request *rq; @@ -619,7 +618,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); + rq = bsg_map_hdr(bd, &bc->hdr, mode); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -655,8 +654,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) bsg_set_block(bd, file); bytes_written = 0; - ret = __bsg_write(bd, buf, count, &bytes_written, - file->f_mode & FMODE_WRITE); + ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode); *ppos = bytes_written; @@ -915,7 +913,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); + rq = bsg_map_hdr(bd, &hdr, file->f_mode); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 7440de44dd857..edcfff9745277 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -207,7 +207,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); } -int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) +int blk_verify_command(unsigned char *cmd, fmode_t mode) { struct blk_cmd_filter *filter = &blk_default_cmd_filter; @@ -220,7 +220,7 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) return 0; /* Write-safe commands require a writable open */ - if (test_bit(cmd[0], filter->write_ok) && has_write_perm) + if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE)) return 0; return -EPERM; @@ -234,7 +234,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(req->cmd, mode & FMODE_WRITE)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; /* @@ -469,7 +469,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(req->cmd, mode & FMODE_WRITE); + err = blk_verify_command(req->cmd, mode); if (err) goto error; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0419c2298eabd..92fd870e13156 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -217,7 +217,7 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd) if (sfp->parentdp->device->type == TYPE_SCANNER) return 0; - return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE); + return blk_verify_command(cmd, filp->f_mode); } static int diff --git 
a/include/linux/blkdev.h b/include/linux/blkdev.h index 225617dd0a3f4..1437ef4d80379 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1360,7 +1360,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, gfp_mask, 0); } -extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); +extern int blk_verify_command(unsigned char *cmd, fmode_t mode); enum blk_default_limits { BLK_MAX_SEGMENTS = 128, -- cgit v1.2.3 From 0c6af1ccd5fd9ac640aef01c8de0043837451a04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 8 Nov 2017 09:11:22 +0800 Subject: blk-mq: put driver tag if dispatch budget can't be got We have to put the driver tag if dispatch budget can't be got, otherwise it might cause IO deadlock, especially in case that size of tags is very small. Fixes: de1482974080(blk-mq: introduce .get_budget and .put_budget in blk_mq_ops) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c501cbd0de93f..3d759bb8a5bb5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1080,8 +1080,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } } - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { + blk_mq_put_driver_tag(rq); break; + } list_del_init(&rq->queuelist); -- cgit v1.2.3 From a47619b5c6ead6c2e10b8114a5e9291a1679a7c3 Mon Sep 17 00:00:00 2001 From: Javier González Date: Wed, 8 Nov 2017 10:59:03 +0100 Subject: nvme: compare NQN string with right size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copy subnqns using NVMF_NQN_SIZE as it is < 256 Signed-off-by: Javier González Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 65fd2fc1ae3c2..8a60cc786220b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1810,7 +1810,7 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { - strcpy(ctrl->subnqn, id->subnqn); + strncpy(ctrl->subnqn, id->subnqn, NVMF_NQN_SIZE); return; } -- cgit v1.2.3 From ab083b11f6f43af071f9b507e48d212bfbb2beb3 Mon Sep 17 00:00:00 2001 From: Javier González Date: Wed, 8 Nov 2017 10:59:04 +0100 Subject: nvme: fix eui_show() print format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix print formatting, but keep the original output to prevent user breakage as suggested by Joe Perches. 
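[Editorial aside, not part of the patch: the "%*ph" printk extension used here dumps a small buffer as space-separated hex bytes (see Documentation/printk-formats.txt); the stray 'd' suffix in the old format string is not one of the documented %ph modifiers, and as the message above notes the visible output does not change. A purely illustrative, kernel-style open-coded equivalent of "%8ph" for an 8-byte EUI-64 -- the function name is hypothetical:]

#include <linux/kernel.h>
#include <linux/types.h>

/* Roughly what sprintf(buf, "%8ph\n", eui) produces: "xx xx ... xx\n". */
static ssize_t example_format_eui(char *buf, const u8 eui[8])
{
	ssize_t len = 0;
	int i;

	for (i = 0; i < 8; i++)
		len += sprintf(buf + len, "%02x%c", eui[i],
			       i == 7 ? '\n' : ' ');
	return len;
}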
Signed-off-by: Javier González Reviewed-by: Sagi Grimberg Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8a60cc786220b..6c79b73fe2c39 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2190,7 +2190,7 @@ static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8phd\n", ns->eui); + return sprintf(buf, "%8ph\n", ns->eui); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); -- cgit v1.2.3 From aba7afc5671c23beade64d10caf86e24a9105dab Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 8 Nov 2017 10:23:45 -0800 Subject: blk-mq: Avoid that request queue removal can trigger list corruption Avoid that removal of a request queue sporadically triggers the following warning: list_del corruption. next->prev should be ffff8807d649b970, but was 6b6b6b6b6b6b6b6b WARNING: CPU: 3 PID: 342 at lib/list_debug.c:56 __list_del_entry_valid+0x92/0xa0 Call Trace: process_one_work+0x11b/0x660 worker_thread+0x3d/0x3b0 kthread+0x129/0x140 ret_from_fork+0x27/0x40 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-core.c b/block/blk-core.c index b8d1aa2d1008f..5e81dcf4690a6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -339,6 +339,7 @@ void blk_sync_queue(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; + cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } else { -- cgit v1.2.3 From dafc040ba6d813df225a25f8b4bc3eae2207a6be Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 8 Nov 2017 12:00:29 +0200 Subject: nvmet: remove redundant memset if failed to get_smart_log failed We already allocated the buffer with kzalloc. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 9a7d6917e190c..dd087be395b9e 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -144,10 +144,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) } smart_log = buf; status = nvmet_get_smart_log(req, smart_log); - if (status) { - memset(buf, '\0', data_len); + if (status) goto err; - } break; case NVME_LOG_FW_SLOT: /* -- cgit v1.2.3 From 4185f25acbc2a191341c7d8cf51e449698eceb06 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 8 Nov 2017 12:00:30 +0200 Subject: nvmet: remove redundant local variable the status is either success or some status id and we don't need a local variable for it. 
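[Editorial aside: the shape of this cleanup, abstracted away from nvmet -- all names below are hypothetical. Once the out: label performs no cleanup, the status variable and the goto collapse into direct returns:]

static int example_lookup_and_use(int id)
{
	struct example_obj *o = example_lookup(id);	/* hypothetical, may return NULL */

	if (!o)
		return -ENOENT;		/* was: status = -ENOENT; goto out; */

	example_use(o);			/* hypothetical work on the object */
	example_put(o);
	return 0;			/* was: fall through to "out: return status;" */
}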
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index dd087be395b9e..90dcdc40ac71a 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -35,17 +35,14 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd) static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, struct nvme_smart_log *slog) { - u16 status; struct nvmet_ns *ns; u64 host_reads, host_writes, data_units_read, data_units_written; - status = NVME_SC_SUCCESS; ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); if (!ns) { - status = NVME_SC_INVALID_NS; pr_err("nvmet : Could not find namespace id : %d\n", le32_to_cpu(req->cmd->get_log_page.nsid)); - goto out; + return NVME_SC_INVALID_NS; } host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); @@ -58,20 +55,18 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, put_unaligned_le64(host_writes, &slog->host_writes[0]); put_unaligned_le64(data_units_written, &slog->data_units_written[0]); nvmet_put_namespace(ns); -out: - return status; + + return NVME_SC_SUCCESS; } static u16 nvmet_get_smart_log_all(struct nvmet_req *req, struct nvme_smart_log *slog) { - u16 status; u64 host_reads = 0, host_writes = 0; u64 data_units_read = 0, data_units_written = 0; struct nvmet_ns *ns; struct nvmet_ctrl *ctrl; - status = NVME_SC_SUCCESS; ctrl = req->sq->ctrl; rcu_read_lock(); @@ -91,7 +86,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, put_unaligned_le64(host_writes, &slog->host_writes[0]); put_unaligned_le64(data_units_written, &slog->data_units_written[0]); - return status; + return NVME_SC_SUCCESS; } static u16 nvmet_get_smart_log(struct nvmet_req *req, -- cgit v1.2.3 From 38dabe210fbab4e7e8a03670ab3ba42f247ea08f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:10 -0700 Subject: nvme: centralize AEN defines All the transports were unnecessarilly duplicating the AEN request accounting. This patch defines everything in one place. Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 33 ++++++++++++--------------------- drivers/nvme/host/nvme.h | 1 - drivers/nvme/host/pci.c | 16 +++------------- drivers/nvme/host/rdma.c | 14 +++----------- drivers/nvme/target/loop.c | 14 +++----------- include/linux/nvme.h | 8 ++++++++ 7 files changed, 30 insertions(+), 58 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6c79b73fe2c39..315087dfcd8de 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2779,7 +2779,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_queue_async_events(struct nvme_ctrl *ctrl) { - ctrl->event_limit = NVME_NR_AERS; + ctrl->event_limit = NVME_NR_AEN_COMMANDS; queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_queue_async_events); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index da9296a1eb269..f3bfad47a95fb 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -30,15 +30,6 @@ /* *************************** Data Structures/Defines ****************** */ -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. 
- */ -#define NVME_FC_NR_AEN_COMMANDS 1 -#define NVME_FC_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) -#define AEN_CMDID_BASE (NVME_FC_AQ_BLKMQ_DEPTH + 1) - enum nvme_fc_queue_flags { NVME_FC_Q_CONNECTED = (1 << 0), }; @@ -170,7 +161,7 @@ struct nvme_fc_ctrl { u32 iocnt; wait_queue_head_t ioabort_wait; - struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS]; + struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS]; struct nvme_ctrl ctrl; }; @@ -1546,7 +1537,7 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) unsigned long flags; int i, ret; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE) continue; @@ -1816,7 +1807,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) int i, ret; aen_op = ctrl->aen_ops; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, GFP_KERNEL); if (!private) @@ -1826,7 +1817,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) sqe = &cmdiu->sqe; ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0], aen_op, (struct request *)NULL, - (AEN_CMDID_BASE + i)); + (NVME_AQ_BLK_MQ_DEPTH + i)); if (ret) { kfree(private); return ret; @@ -1839,7 +1830,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) memset(sqe, 0, sizeof(*sqe)); sqe->common.opcode = nvme_admin_async_event; /* Note: core layer may overwrite the sqe.command_id value */ - sqe->common.command_id = AEN_CMDID_BASE + i; + sqe->common.command_id = NVME_AQ_BLK_MQ_DEPTH + i; } return 0; } @@ -1851,7 +1842,7 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl) int i; aen_op = ctrl->aen_ops; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { if (!aen_op->fcp_req.private) continue; @@ -2402,7 +2393,7 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) bool terminating = false; blk_status_t ret; - if (aer_idx > NVME_FC_NR_AEN_COMMANDS) + if (aer_idx > NVME_NR_AEN_COMMANDS) return; spin_lock_irqsave(&ctrl->lock, flags); @@ -2722,16 +2713,16 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the admin queue */ - nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH); + nvme_fc_init_queue(ctrl, 0, NVME_AQ_BLK_MQ_DEPTH); ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, - NVME_FC_AQ_BLKMQ_DEPTH); + NVME_AQ_BLK_MQ_DEPTH); if (ret) goto out_free_queue; ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0], - NVME_FC_AQ_BLKMQ_DEPTH, - (NVME_FC_AQ_BLKMQ_DEPTH / 4)); + NVME_AQ_BLK_MQ_DEPTH, + (NVME_AQ_BLK_MQ_DEPTH / 4)); if (ret) goto out_delete_hw_queue; @@ -3145,7 +3136,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) + diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f7c21cea94853..a6d750cfa6b2a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -313,7 +313,6 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send); 
-#define NVME_NR_AERS 1 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res); void nvme_queue_async_events(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 32c413ec818c9..c3dfd84feef72 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -35,12 +35,6 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) - #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) static int use_threaded_interrupts; @@ -956,7 +950,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, * for them but rather special case them here. */ if (unlikely(nvmeq->qid == 0 && - cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) { + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { nvme_complete_async_event(&nvmeq->dev->ctrl, cqe->status, &cqe->result); return; @@ -1057,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx; + c.common.command_id = NVME_AQ_BLK_MQ_DEPTH + aer_idx; spin_lock_irq(&nvmeq->q_lock); __nvme_submit_cmd(nvmeq, &c); @@ -1524,11 +1518,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; - /* - * Subtract one to leave an empty queue entry for 'Full Queue' - * condition. See NVM-Express 1.2 specification, section 4.1.2. - */ - dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; + dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 32e21ab1ae520..008791bbdfb32 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -41,14 +41,6 @@ #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_RDMA_NR_AEN_COMMANDS 1 -#define NVME_RDMA_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) - struct nvme_rdma_device { struct ib_device *dev; struct ib_pd *pd; @@ -690,7 +682,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set = &ctrl->admin_tag_set; memset(set, 0, sizeof(*set)); set->ops = &nvme_rdma_admin_mq_ops; - set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; set->reserved_tags = 2; /* connect + keep-alive */ set->numa_node = NUMA_NO_NODE; set->cmd_size = sizeof(struct nvme_rdma_request) + @@ -1318,7 +1310,7 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) memset(cmd, 0, sizeof(*cmd)); cmd->common.opcode = nvme_admin_async_event; - cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; cmd->common.flags |= NVME_CMD_SGL_METABUF; nvme_rdma_set_sg_null(cmd); @@ -1380,7 +1372,7 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) * for them but rather special case them here. 
*/ if (unlikely(nvme_rdma_queue_idx(queue) == 0 && - cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH)) + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index bc95c6ed531af..7258b796f209c 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -23,14 +23,6 @@ #define NVME_LOOP_MAX_SEGMENTS 256 -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_LOOP_NR_AEN_COMMANDS 1 -#define NVME_LOOP_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) - struct nvme_loop_iod { struct nvme_request nvme_req; struct nvme_command cmd; @@ -112,7 +104,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req) * for them but rather special case them here. */ if (unlikely(nvme_loop_queue_idx(queue) == 0 && - cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); } else { @@ -200,7 +192,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) memset(&iod->cmd, 0, sizeof(iod->cmd)); iod->cmd.common.opcode = nvme_admin_async_event; - iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH; + iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH; iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, @@ -356,7 +348,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fd1d4508a6129..89ffa7eed2fd9 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -90,6 +90,14 @@ enum { }; #define NVME_AQ_DEPTH 32 +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) + +/* + * Subtract one to leave an empty queue entry for 'Full Queue' condition. See + * NVM-Express 1.2 specification, section 4.1.2. + */ +#define NVME_AQ_MQ_TAG_DEPTH (NVME_AQ_BLK_MQ_DEPTH - 1) enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ -- cgit v1.2.3 From 08e1507544839b98fc3732aea935e70ed9c209ec Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:11 -0700 Subject: nvme-fc: remove unused "queue_size" field This was being saved in a structure, but never used anywhere. The queue size is obtained through other means, so there's no reason to duplicate this without a user for it. 
Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Reviewed-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f3bfad47a95fb..3aa5950291925 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -43,7 +43,6 @@ struct nvme_fc_queue { struct device *dev; struct blk_mq_hw_ctx *hctx; void *lldd_handle; - int queue_size; size_t cmnd_capsule_len; u32 qnum; u32 rqcnt; @@ -1886,7 +1885,7 @@ nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, } static void -nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size) +nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx) { struct nvme_fc_queue *queue; @@ -1902,8 +1901,6 @@ nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size) else queue->cmnd_capsule_len = sizeof(struct nvme_command); - queue->queue_size = queue_size; - /* * Considered whether we should allocate buffers for all SQEs * and CQEs and dma map them - mapping their respective entries @@ -2027,7 +2024,7 @@ nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl) int i; for (i = 1; i < ctrl->ctrl.queue_count; i++) - nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize); + nvme_fc_init_queue(ctrl, i); } static void @@ -2713,7 +2710,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the admin queue */ - nvme_fc_init_queue(ctrl, 0, NVME_AQ_BLK_MQ_DEPTH); + nvme_fc_init_queue(ctrl, 0); ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, NVME_AQ_BLK_MQ_DEPTH); -- cgit v1.2.3 From ad22c355b707a8d8d48e282aadc01c0b0604b2e9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:12 -0700 Subject: nvme: remove handling of multiple AEN requests The driver can handle tracking only one AEN request, so this patch removes handling for multiple ones. 
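[Editorial aside: how the defines centralized in the earlier "nvme: centralize AEN defines" patch fit together, restated as a standalone sketch. Values assume NVME_AQ_DEPTH == 32 as in include/linux/nvme.h; the helper name is hypothetical.]

#include <linux/types.h>

/*
 *   NVME_AQ_BLK_MQ_DEPTH = 32 - 1 = 31   admin slots left after reserving
 *                                        the single AEN command
 *   NVME_AQ_MQ_TAG_DEPTH = 31 - 1 = 30   blk-mq tag depth; one entry stays
 *                                        empty for the Full Queue condition
 *                                        (NVMe 1.2, section 4.1.2)
 *
 * blk-mq therefore hands out admin command IDs 0..29, the driver-managed
 * AEN uses command_id == NVME_AQ_BLK_MQ_DEPTH, and admin completions are
 * recognised as AEN completions with a check of this shape:
 */
static bool example_is_aen_completion(u16 qid, u16 command_id)
{
	return qid == 0 && command_id >= NVME_AQ_BLK_MQ_DEPTH;
}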
Reviewed-by: Christoph Hellwig Reviewed-by: James Smart Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 28 +++------------------------- drivers/nvme/host/fc.c | 9 +++------ drivers/nvme/host/nvme.h | 3 +-- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/rdma.c | 5 +---- drivers/nvme/target/loop.c | 2 +- 6 files changed, 11 insertions(+), 40 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 315087dfcd8de..dedbf12847b6f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2670,15 +2670,7 @@ static void nvme_async_event_work(struct work_struct *work) struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, async_event_work); - spin_lock_irq(&ctrl->lock); - while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) { - int aer_idx = --ctrl->event_limit; - - spin_unlock_irq(&ctrl->lock); - ctrl->ops->submit_async_event(ctrl, aer_idx); - spin_lock_irq(&ctrl->lock); - } - spin_unlock_irq(&ctrl->lock); + ctrl->ops->submit_async_event(ctrl); } static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) @@ -2745,22 +2737,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res) { u32 result = le32_to_cpu(res->u32); - bool done = true; - switch (le16_to_cpu(status) >> 1) { - case NVME_SC_SUCCESS: - done = false; - /*FALLTHRU*/ - case NVME_SC_ABORT_REQ: - ++ctrl->event_limit; - if (ctrl->state == NVME_CTRL_LIVE) - queue_work(nvme_wq, &ctrl->async_event_work); - break; - default: - break; - } - - if (done) + if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) return; switch (result & 0xff07) { @@ -2774,12 +2752,12 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, default: dev_warn(ctrl->device, "async event result %08x\n", result); } + queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_queue_async_events(struct nvme_ctrl *ctrl) { - ctrl->event_limit = NVME_NR_AEN_COMMANDS; queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_queue_async_events); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 3aa5950291925..6eb460b117d6f 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2382,7 +2382,7 @@ nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) } static void -nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +nvme_fc_submit_async_event(struct nvme_ctrl *arg) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg); struct nvme_fc_fcp_op *aen_op; @@ -2390,9 +2390,6 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) bool terminating = false; blk_status_t ret; - if (aer_idx > NVME_NR_AEN_COMMANDS) - return; - spin_lock_irqsave(&ctrl->lock, flags); if (ctrl->flags & FCCTRL_TERMIO) terminating = true; @@ -2401,13 +2398,13 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) if (terminating) return; - aen_op = &ctrl->aen_ops[aer_idx]; + aen_op = &ctrl->aen_ops[0]; ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0, NVMEFC_FCP_NODATA); if (ret) dev_err(ctrl->ctrl.device, - "failed async event work [%d]\n", aer_idx); + "failed async event work\n"); } static void diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a6d750cfa6b2a..b55c97ecea31a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -162,7 +162,6 @@ struct nvme_ctrl { u16 nssa; u16 nr_streams; atomic_t abort_limit; - u8 event_limit; u8 vwc; u32 vs; u32 sgls; @@ -237,7 +236,7 @@ 
struct nvme_ctrl_ops { int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); void (*free_ctrl)(struct nvme_ctrl *ctrl); - void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); + void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); int (*reinit_request)(void *data, struct request *rq); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c3dfd84feef72..429d56f1a19e8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1043,7 +1043,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return __nvme_poll(nvmeq, tag); } -static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) +static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); struct nvme_queue *nvmeq = dev->queues[0]; @@ -1051,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = NVME_AQ_BLK_MQ_DEPTH + aer_idx; + c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; spin_lock_irq(&nvmeq->q_lock); __nvme_submit_cmd(nvmeq, &c); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 008791bbdfb32..c8d854474a5b6 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1293,7 +1293,7 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) return queue->ctrl->tag_set.tags[queue_idx - 1]; } -static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); struct nvme_rdma_queue *queue = &ctrl->queues[0]; @@ -1303,9 +1303,6 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) struct ib_sge sge; int ret; - if (WARN_ON_ONCE(aer_idx != 0)) - return; - ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); memset(cmd, 0, sizeof(*cmd)); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 7258b796f209c..f40e70eb4a380 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -184,7 +184,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +static void nvme_loop_submit_async_event(struct nvme_ctrl *arg) { struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); struct nvme_loop_queue *queue = &ctrl->queues[0]; -- cgit v1.2.3 From d99ca609a1b55f87a5e62a11ed70e4d091d815f0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:13 -0700 Subject: nvme: unexport starting async event work Async event work is for core use only and should not be called directly from drivers. 
Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 8 +------- drivers/nvme/host/nvme.h | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dedbf12847b6f..c135d0aeebd78 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2756,12 +2756,6 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, } EXPORT_SYMBOL_GPL(nvme_complete_async_event); -void nvme_queue_async_events(struct nvme_ctrl *ctrl) -{ - queue_work(nvme_wq, &ctrl->async_event_work); -} -EXPORT_SYMBOL_GPL(nvme_queue_async_events); - void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_stop_keep_alive(ctrl); @@ -2778,7 +2772,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_queue_async_events(ctrl); + queue_work(nvme_wq, &ctrl->async_event_work); nvme_start_queues(ctrl); } } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b55c97ecea31a..151062573ece1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -314,7 +314,6 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res); -void nvme_queue_async_events(struct nvme_ctrl *ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); -- cgit v1.2.3 From e3d7874dcf175cca2dca7795d6453f637ad8ba9b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:14 -0700 Subject: nvme: send uevent for some asynchronous events This will give udev a chance to observe and handle asynchronous event notifications and clear the log to unmask future events of the same type. 
The driver will create a change uevent of the asyncronuos event result before submitting the next AEN request to the device if a completed AEN event is of type error, smart, command set or vendor specific, Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 28 ++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 4 ++++ 3 files changed, 33 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c135d0aeebd78..683d890d73fa3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2665,11 +2665,28 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); +static void nvme_aen_uevent(struct nvme_ctrl *ctrl) +{ + char *envp[2] = { NULL, NULL }; + u32 aen_result = ctrl->aen_result; + + ctrl->aen_result = 0; + if (!aen_result) + return; + + envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result); + if (!envp[0]) + return; + kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); + kfree(envp[0]); +} + static void nvme_async_event_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, async_event_work); + nvme_aen_uevent(ctrl); ctrl->ops->submit_async_event(ctrl); } @@ -2741,6 +2758,17 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) return; + switch (result & 0x7) { + case NVME_AER_ERROR: + case NVME_AER_SMART: + case NVME_AER_CSS: + case NVME_AER_VS: + ctrl->aen_result = result; + break; + default: + break; + } + switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: dev_info(ctrl->device, "rescanning\n"); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 151062573ece1..7b9cc7d616b75 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -168,6 +168,7 @@ struct nvme_ctrl { u16 kas; u8 npss; u8 apsta; + u32 aen_result; unsigned int shutdown_timeout; unsigned int kato; bool subsystem; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 89ffa7eed2fd9..aea87f0d917b3 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -428,6 +428,10 @@ enum { }; enum { + NVME_AER_ERROR = 0, + NVME_AER_SMART = 1, + NVME_AER_CSS = 6, + NVME_AER_VS = 7, NVME_AER_NOTICE_NS_CHANGED = 0x0002, NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, }; -- cgit v1.2.3 From 03e0f3a65e4da497c3b7b213c68943cbc73a2e34 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Nov 2017 19:32:07 +0800 Subject: nvme-pci: avoid dereference of symbol from unloaded module The 'remove_work' may be scheduled to run after nvme_remove() returns since we can't simply cancel it in nvme_remove() for avoiding deadlock. Once nvme_remove() returns, this module(nvme) can be unloaded. On the other hand, nvme_put_ctrl() calls ctr->ops->free_ctrl which may point to nvme_pci_free_ctrl() in unloaded module. This patch avoids this issue by queuing 'remove_work' via 'nvme_wq', and flush this worqueue in nvme_exit() as suggested by Sagi. 
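[Editorial aside: a generic sketch of the pattern this fix relies on, not the nvme code itself -- work that can outlive the function which scheduled it goes on a workqueue owned by the module, and module exit flushes that workqueue before any module text can go away. All names here are hypothetical stand-ins (example_wq plays the role of nvme_wq).]

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static void example_fn(struct work_struct *w)
{
	/* may still be pending after the code that queued it has returned */
}
static DECLARE_WORK(example_work, example_fn);
/* elsewhere: queue_work(example_wq, &example_work); */

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example_wq", 0, 0);
	return example_wq ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	/* guarantees example_fn is neither running nor pending on return */
	flush_workqueue(example_wq);
	destroy_workqueue(example_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");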
Suggested-by: Sagi Grimberg Signed-off-by: Ming Lei Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 429d56f1a19e8..762b8402e04c5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2280,7 +2280,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); - if (!schedule_work(&dev->remove_work)) + if (!queue_work(nvme_wq, &dev->remove_work)) nvme_put_ctrl(&dev->ctrl); } @@ -2703,6 +2703,7 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); + flush_workqueue(nvme_wq); _nvme_check_size(); } -- cgit v1.2.3 From 5e62d5c993e6889cd314d5b5de6b670152109a0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 14:29:58 +0100 Subject: nvmet: better data length validation Currently the NVMe target stores the expexted data length in req->data_len and uses that for data transfer decisions, but that does not take the actual transfer length in the SGLs into account. So this adds a new transfer_len field, into which the transport drivers store the actual transfer length. We then check the two match before actually executing the command. The FC transport driver already had such a field, which is removed in favour of the common one. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 10 ++++++++++ drivers/nvme/target/fc.c | 32 ++++++++++++-------------------- drivers/nvme/target/loop.c | 3 ++- drivers/nvme/target/nvmet.h | 4 ++++ drivers/nvme/target/rdma.c | 10 ++++++---- 5 files changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index da088293eafc1..22a2a2bb40f9e 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -501,6 +501,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->ops = ops; req->sg = NULL; req->sg_cnt = 0; + req->transfer_len = 0; req->rsp->status = 0; /* no support for fused commands yet */ @@ -550,6 +551,15 @@ void nvmet_req_uninit(struct nvmet_req *req) } EXPORT_SYMBOL_GPL(nvmet_req_uninit); +void nvmet_req_execute(struct nvmet_req *req) +{ + if (unlikely(req->data_len != req->transfer_len)) + nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); + else + req->execute(req); +} +EXPORT_SYMBOL_GPL(nvmet_req_execute); + static inline bool nvmet_cc_en(u32 cc) { return (cc >> NVME_CC_EN_SHIFT) & 0x1; diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 5f2570f5dfa90..739b8feadc7de 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -76,7 +76,6 @@ struct nvmet_fc_fcp_iod { dma_addr_t rspdma; struct scatterlist *data_sg; int data_sg_cnt; - u32 total_length; u32 offset; enum nvmet_fcp_datadir io_dir; bool active; @@ -1700,7 +1699,7 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) u32 page_len, length; int i = 0; - length = fod->total_length; + length = fod->req.transfer_len; nent = DIV_ROUND_UP(length, PAGE_SIZE); sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); if (!sg) @@ -1789,7 +1788,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, u32 rsn, rspcnt, xfr_length; if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP) - xfr_length = 
fod->total_length; + xfr_length = fod->req.transfer_len; else xfr_length = fod->offset; @@ -1815,7 +1814,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, rspcnt = atomic_inc_return(&fod->queue->zrspcnt); if (!(rspcnt % fod->queue->ersp_ratio) || sqe->opcode == nvme_fabrics_command || - xfr_length != fod->total_length || + xfr_length != fod->req.transfer_len || (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] || (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head))) @@ -1892,7 +1891,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, - (fod->total_length - fod->offset)); + (fod->req.transfer_len - fod->offset)); fcpreq->transfer_length = tlen; fcpreq->transferred_length = 0; fcpreq->fcp_error = 0; @@ -1906,7 +1905,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, * combined xfr with response. */ if ((op == NVMET_FCOP_READDATA) && - ((fod->offset + fcpreq->transfer_length) == fod->total_length) && + ((fod->offset + fcpreq->transfer_length) == fod->req.transfer_len) && (tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) { fcpreq->op = NVMET_FCOP_READDATA_RSP; nvmet_fc_prep_fcp_rsp(tgtport, fod); @@ -1986,7 +1985,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } fod->offset += fcpreq->transferred_length; - if (fod->offset != fod->total_length) { + if (fod->offset != fod->req.transfer_len) { spin_lock_irqsave(&fod->flock, flags); fod->writedataactive = true; spin_unlock_irqrestore(&fod->flock, flags); @@ -1998,9 +1997,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } /* data transfer complete, resume with nvmet layer */ - - fod->req.execute(&fod->req); - + nvmet_req_execute(&fod->req); break; case NVMET_FCOP_READDATA: @@ -2023,7 +2020,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } fod->offset += fcpreq->transferred_length; - if (fod->offset != fod->total_length) { + if (fod->offset != fod->req.transfer_len) { /* transfer the next chunk */ nvmet_fc_transfer_fcp_data(tgtport, fod, NVMET_FCOP_READDATA); @@ -2160,7 +2157,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done; - fod->total_length = be32_to_cpu(cmdiu->data_len); + fod->req.transfer_len = be32_to_cpu(cmdiu->data_len); if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) { fod->io_dir = NVMET_FCP_WRITE; if (!nvme_is_write(&cmdiu->sqe)) @@ -2171,7 +2168,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, goto transport_error; } else { fod->io_dir = NVMET_FCP_NODATA; - if (fod->total_length) + if (fod->req.transfer_len) goto transport_error; } @@ -2179,9 +2176,6 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->req.rsp = &fod->rspiubuf.cqe; fod->req.port = fod->queue->port; - /* ensure nvmet handlers will set cmd handler callback */ - fod->req.execute = NULL; - /* clear any response payload */ memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); @@ -2201,7 +2195,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, /* keep a running counter of tail position */ atomic_inc(&fod->queue->sqtail); - if (fod->total_length) { + if (fod->req.transfer_len) { ret = nvmet_fc_alloc_tgt_pgs(fod); if (ret) { nvmet_req_complete(&fod->req, ret); @@ -2224,9 +2218,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, * can invoke the nvmet_layer now. 
If read data, cmd completion will * push the data */ - - fod->req.execute(&fod->req); - + nvmet_req_execute(&fod->req); return; transport_error: diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f40e70eb4a380..96d390416789b 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -127,7 +127,7 @@ static void nvme_loop_execute_work(struct work_struct *work) struct nvme_loop_iod *iod = container_of(work, struct nvme_loop_iod, work); - iod->req.execute(&iod->req); + nvmet_req_execute(&iod->req); } static enum blk_eh_timer_return @@ -176,6 +176,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, iod->req.sg = iod->sg_table.sgl; iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + iod->req.transfer_len = blk_rq_bytes(req); } blk_mq_start_request(req); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e342f02845c19..194ebffc688c3 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -223,7 +223,10 @@ struct nvmet_req { struct bio inline_bio; struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; int sg_cnt; + /* data length as parsed from the command: */ size_t data_len; + /* data length as parsed from the SGL descriptor: */ + size_t transfer_len; struct nvmet_port *port; @@ -266,6 +269,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); void nvmet_req_uninit(struct nvmet_req *req); +void nvmet_req_execute(struct nvmet_req *req); void nvmet_req_complete(struct nvmet_req *req, u16 status); void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3333d417b248b..49912909c2986 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -148,14 +148,14 @@ static inline u32 get_unaligned_le24(const u8 *p) static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) { return nvme_is_write(rsp->req.cmd) && - rsp->req.data_len && + rsp->req.transfer_len && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) { return !nvme_is_write(rsp->req.cmd) && - rsp->req.data_len && + rsp->req.transfer_len && !rsp->req.rsp->status && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } @@ -577,7 +577,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) return; } - rsp->req.execute(&rsp->req); + nvmet_req_execute(&rsp->req); } static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, @@ -609,6 +609,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) nvmet_rdma_use_inline_sg(rsp, len, off); rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; + rsp->req.transfer_len += len; return 0; } @@ -636,6 +637,7 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, nvmet_data_dir(&rsp->req)); if (ret < 0) return NVME_SC_INTERNAL; + rsp->req.transfer_len += len; rsp->n_rdma += ret; if (invalidate) { @@ -693,7 +695,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) queue->cm_id->port_num, &rsp->read_cqe, NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); } else { - rsp->req.execute(&rsp->req); + nvmet_req_execute(&rsp->req); } return true; -- cgit v1.2.3 From e454d122e22826589cfa08788a802ab09d4fae24 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 14:29:27 +0100 Subject: nvmet: kill 
nvmet_inline_bio_init Much easier to just opencode this helper. Also use ARRAY_SIZE instead of passing the inline bvec array size manually. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index db632818777dd..0a4372a016f21 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -33,18 +33,11 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req) req->ns->blksize_shift; } -static void nvmet_inline_bio_init(struct nvmet_req *req) -{ - struct bio *bio = &req->inline_bio; - - bio_init(bio, req->inline_bvec, NVMET_MAX_INLINE_BIOVEC); -} - static void nvmet_execute_rw(struct nvmet_req *req) { int sg_cnt = req->sg_cnt; + struct bio *bio = &req->inline_bio; struct scatterlist *sg; - struct bio *bio; sector_t sector; blk_qc_t cookie; int op, op_flags = 0, i; @@ -66,8 +59,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) sector = le64_to_cpu(req->cmd->rw.slba); sector <<= (req->ns->blksize_shift - 9); - nvmet_inline_bio_init(req); - bio = &req->inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; bio->bi_private = req; @@ -99,11 +91,9 @@ static void nvmet_execute_rw(struct nvmet_req *req) static void nvmet_execute_flush(struct nvmet_req *req) { - struct bio *bio; - - nvmet_inline_bio_init(req); - bio = &req->inline_bio; + struct bio *bio = &req->inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); bio_set_dev(bio, req->ns->bdev); bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; -- cgit v1.2.3 From eb619fdb2d4cb8b3d3419e9113921e87e7daf557 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Nov 2017 08:32:43 -0700 Subject: blk-mq: fix issue with shared tag queue re-running This patch attempts to make the case of hctx re-running on driver tag failure more robust. Without this patch, it's pretty easy to trigger a stall condition with shared tags. An example is using null_blk like this: modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 which sets up 4 devices, sharing the same tag set with a depth of 1. Running a fio job ala: [global] bs=4k rw=randread norandommap direct=1 ioengine=libaio iodepth=4 [nullb0] filename=/dev/nullb0 [nullb1] filename=/dev/nullb1 [nullb2] filename=/dev/nullb2 [nullb3] filename=/dev/nullb3 will inevitably end with one or more threads being stuck waiting for a scheduler tag. That IO is then stuck forever, until someone else triggers a run of the queue. Ensure that we always re-run the hardware queue, if the driver tag we were waiting for got freed before we added our leftover request entries back on the dispatch list. 
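[Editorial aside: the core of the shared-tag fix below is the classic ordering "register on the wait queue before retrying the allocation", so a tag freed in the window is either seen by the retry or wakes the registered waiter. A condensed, hypothetical sketch of that ordering with the blk-mq specifics and locking elided (the real code holds hctx->lock and the sbitmap wait lock):]

#include <linux/wait.h>

struct example_tag_waiter {		/* stands in for the hardware context */
	wait_queue_head_t *wq;		/* the tag set's wait queue */
	wait_queue_entry_t wait;	/* this context's wait entry */
};

static bool example_try_tag(struct example_tag_waiter *tw);	/* assumed allocator */

static bool example_get_tag_or_wait(struct example_tag_waiter *tw)
{
	if (example_try_tag(tw))			/* fast path */
		return true;

	add_wait_queue(tw->wq, &tw->wait);		/* publish the waiter first */

	if (example_try_tag(tw)) {			/* a tag may have been freed meanwhile */
		remove_wait_queue(tw->wq, &tw->wait);	/* hand future wakeups to someone else */
		return true;
	}
	return false;	/* stay queued; the wakeup callback re-runs dispatch */
}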
Reviewed-by: Bart Van Assche Tested-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 85 ++++++++++++++++++++++++++++---------------------- include/linux/blk-mq.h | 5 ++- 3 files changed, 50 insertions(+), 41 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7f4a1ba532afc..bb7f084152033 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), - HCTX_STATE_NAME(TAG_WAITING), HCTX_STATE_NAME(START_ON_RUN), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d759bb8a5bb5..fed8165973a38 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -998,49 +998,64 @@ done: return rq->tag != -1; } -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, - void *key) +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, + int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); - list_del(&wait->entry); - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); + list_del_init(&wait->entry); blk_mq_run_hw_queue(hctx, true); return 1; } -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, + struct request *rq) { + struct blk_mq_hw_ctx *this_hctx = *hctx; + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; struct sbq_wait_state *ws; + if (!list_empty_careful(&wait->entry)) + return false; + + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; + } + + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + /* - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. - * The thread which wins the race to grab this bit adds the hardware - * queue to the wait queue. + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. */ - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_get_driver_tag(rq, hctx, false)) { + spin_unlock(&this_hctx->lock); return false; - - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); + } /* - * As soon as this returns, it's no longer safe to fiddle with - * hctx->dispatch_wait, since a completion can wake up the wait queue - * and unlock the bit. + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. */ - add_wait_queue(&ws->wait, &hctx->dispatch_wait); + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, - bool got_budget) + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq, *nxt; + bool no_tag = false; int errors, queued; if (list_empty(list)) @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!blk_mq_get_driver_tag(rq, &hctx, false)) { /* * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. 
+ * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. */ - if (!blk_mq_dispatch_wait_add(hctx)) { - if (got_budget) - blk_mq_put_dispatch_budget(hctx); - break; - } - - /* - * It's possible that a tag was freed in the window - * between the allocation failure and adding the - * hardware queue to the wait queue. - */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { if (got_budget) blk_mq_put_dispatch_budget(hctx); + no_tag = true; break; } } @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * - * If TAG_WAITING is set that means that an I/O scheduler has - * been configured and another thread is waiting for a driver - * tag. To guarantee fairness, do not rerun this hardware queue - * but let the other thread grab the driver tag. + * If 'no_tag' is set, that means that we failed getting + * a driver tag with an I/O scheduler attached. If our dispatch + * waitqueue is no longer active, ensure that we run the queue + * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests @@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. */ - if (!blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_sched_needs_restart(hctx) || + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); } @@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); + INIT_LIST_HEAD(&hctx->dispatch_wait.entry); + if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 674641527da79..4ae987c2352cc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -35,7 +35,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; - wait_queue_entry_t dispatch_wait; + wait_queue_entry_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; @@ -181,8 +181,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, - BLK_MQ_S_TAG_WAITING = 3, - BLK_MQ_S_START_ON_RUN = 4, + BLK_MQ_S_START_ON_RUN = 3, BLK_MQ_MAX_DEPTH = 10240, -- cgit v1.2.3 From 055f6e18e08f5b7fd98171fce857a0bad87a919d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Nov 2017 10:49:53 -0800 Subject: block: Make q_usage_counter also track legacy requests This patch makes it possible to pause request allocation for the legacy block layer by calling blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). 
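[Editorial aside: what the q_usage_counter change buys a caller, restated as an illustrative sketch -- once legacy requests also take the counter, freezing works the same way for request_fn queues as for blk-mq queues. The function name is hypothetical.]

#include <linux/blk-mq.h>

static void example_update_queue_state(struct request_queue *q)
{
	blk_mq_freeze_queue(q);		/* waits for q_usage_counter to drain */
	/* no new requests, legacy or mq, can be allocated here */
	blk_mq_unfreeze_queue(q);
}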
Signed-off-by: Ming Lei [ bvanassche: Combined two patches into one, edited a comment and made sure REQ_NOWAIT is handled properly in blk_old_get_request() ] Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++++++++ block/blk-mq.c | 10 ++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5e81dcf4690a6..a4362849059af 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -612,6 +612,9 @@ void blk_set_queue_dying(struct request_queue *q) } spin_unlock_irq(q->queue_lock); } + + /* Make blk_queue_enter() reexamine the DYING flag. */ + wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_set_queue_dying); @@ -1398,16 +1401,22 @@ static struct request *blk_old_get_request(struct request_queue *q, unsigned int op, gfp_t gfp_mask) { struct request *rq; + int ret = 0; WARN_ON_ONCE(q->mq_ops); /* create ioc upfront */ create_io_context(gfp_mask, q->node); + ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) || + (op & REQ_NOWAIT)); + if (ret) + return ERR_PTR(ret); spin_lock_irq(q->queue_lock); rq = get_request(q, op, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); + blk_queue_exit(q); return rq; } @@ -1579,6 +1588,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) blk_free_request(rl, req); freed_request(rl, sync, rq_flags); blk_put_rl(rl); + blk_queue_exit(q); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1860,8 +1870,10 @@ get_rq: * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ + blk_queue_enter_live(q); req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { + blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; diff --git a/block/blk-mq.c b/block/blk-mq.c index fed8165973a38..7173d4bd64afd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -126,7 +126,8 @@ void blk_freeze_queue_start(struct request_queue *q) freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); - blk_mq_run_hw_queues(q, false); + if (q->mq_ops) + blk_mq_run_hw_queues(q, false); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -256,13 +257,6 @@ void blk_mq_wake_waiters(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); - - /* - * If we are called because the queue has now been marked as - * dying, we need to ensure that processes currently waiting on - * the queue are notified as well. - */ - wake_up_all(&q->mq_freeze_wq); } bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) -- cgit v1.2.3 From 6a15674d1e90917f1723a814e2e8c949000440f7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:54 -0800 Subject: block: Introduce blk_get_request_flags() A side effect of this patch is that the GFP mask that is passed to several allocation functions in the legacy block layer is changed from GFP_KERNEL into __GFP_DIRECT_RECLAIM. 
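[Editorial aside: the flag-to-gfp translation the legacy path uses after this patch, restated standalone for clarity; it mirrors the __get_request()/blk_old_get_request() hunks above, the helper name is hypothetical. Note that __GFP_DIRECT_RECLAIM is only the "may sleep to reclaim" bit; unlike GFP_KERNEL it does not include __GFP_IO/__GFP_FS, which is the side effect the commit message points out.]

#include <linux/blk-mq.h>
#include <linux/gfp.h>

static gfp_t example_legacy_alloc_gfp(unsigned int flags)
{
	return (flags & BLK_MQ_REQ_NOWAIT) ? GFP_ATOMIC : __GFP_DIRECT_RECLAIM;
}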
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 50 +++++++++++++++++++++++++++++++++++--------------- include/linux/blkdev.h | 3 +++ 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index a4362849059af..0f7093dfc0102 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1160,7 +1160,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * @rl: request list to allocate from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLQ_MQ_REQ_* flags * * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. @@ -1170,7 +1170,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, unsigned int flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1179,6 +1179,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, struct io_cq *icq = NULL; const bool is_sync = op_is_sync(op); int may_queue; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; req_flags_t rq_flags = RQF_ALLOCED; lockdep_assert_held(q->queue_lock); @@ -1339,7 +1341,7 @@ rq_starved: * @q: request_queue to allocate request from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLK_MQ_REQ_* flags. * * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, * this function keeps retrying under memory pressure and fails iff @q is dead. @@ -1349,7 +1351,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, unsigned int flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1361,7 +1363,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op, rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, bio, gfp_mask); + rq = __get_request(rl, op, bio, flags); if (!IS_ERR(rq)) return rq; @@ -1370,7 +1372,7 @@ retry: return ERR_PTR(-EAGAIN); } - if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { + if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1397,10 +1399,13 @@ retry: goto retry; } +/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, gfp_t gfp_mask) + unsigned int op, unsigned int flags) { struct request *rq; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? 
GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; int ret = 0; WARN_ON_ONCE(q->mq_ops); @@ -1413,7 +1418,7 @@ static struct request *blk_old_get_request(struct request_queue *q, if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); - rq = get_request(q, op, NULL, gfp_mask); + rq = get_request(q, op, NULL, flags); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); blk_queue_exit(q); @@ -1427,25 +1432,40 @@ static struct request *blk_old_get_request(struct request_queue *q, return rq; } -struct request *blk_get_request(struct request_queue *q, unsigned int op, - gfp_t gfp_mask) +/** + * blk_get_request_flags - allocate a request + * @q: request queue to allocate a request for + * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. + * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. + */ +struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, + unsigned int flags) { struct request *req; + WARN_ON_ONCE(op & REQ_NOWAIT); + WARN_ON_ONCE(flags & ~BLK_MQ_REQ_NOWAIT); + if (q->mq_ops) { - req = blk_mq_alloc_request(q, op, - (gfp_mask & __GFP_DIRECT_RECLAIM) ? - 0 : BLK_MQ_REQ_NOWAIT); + req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); } else { - req = blk_old_get_request(q, op, gfp_mask); + req = blk_old_get_request(q, op, flags); if (!IS_ERR(req) && q->initialize_rq_fn) q->initialize_rq_fn(req); } return req; } +EXPORT_SYMBOL(blk_get_request_flags); + +struct request *blk_get_request(struct request_queue *q, unsigned int op, + gfp_t gfp_mask) +{ + return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ? + 0 : BLK_MQ_REQ_NOWAIT); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1871,7 +1891,7 @@ get_rq: * Returns with the queue unlocked. */ blk_queue_enter_live(q); - req = get_request(q, bio->bi_opf, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, 0); if (IS_ERR(req)) { blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1437ef4d80379..1af5ddd0f6318 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -927,6 +927,9 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); +extern struct request *blk_get_request_flags(struct request_queue *, + unsigned int op, + unsigned int flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); -- cgit v1.2.3 From 1b6d65a0bfb5df2a6029c1430e99fcc5d96bb59a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:55 -0800 Subject: block: Introduce BLK_MQ_REQ_PREEMPT Set RQF_PREEMPT if BLK_MQ_REQ_PREEMPT is passed to blk_get_request_flags(). 
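For illustration only (hypothetical caller, not part of the patch; the ide and scsi conversion later in this series does exactly this):

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

static int issue_pm_request(struct request_queue *q)
{
	struct request *rq;

	rq = blk_get_request_flags(q, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	/* RQF_PREEMPT has already been set by the core; no manual
	 * "rq->rq_flags |= RQF_PREEMPT" is needed after allocation. */
	blk_put_request(rq);
	return 0;
}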
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- block/blk-mq.c | 2 ++ include/linux/blk-mq.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 0f7093dfc0102..17eed16a6e040 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1263,6 +1263,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, blk_rq_set_rl(rq, rl); rq->cmd_flags = op; rq->rq_flags = rq_flags; + if (flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; /* init elvpriv */ if (rq_flags & RQF_ELVPRIV) { @@ -1444,7 +1446,7 @@ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~BLK_MQ_REQ_NOWAIT); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); if (q->mq_ops) { req = blk_mq_alloc_request(q, op, flags); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7173d4bd64afd..e21876778cecd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -291,6 +291,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->q = data->q; rq->mq_ctx = data->ctx; rq->cmd_flags = op; + if (data->flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4ae987c2352cc..82b56609736ab 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -212,6 +212,7 @@ enum { BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ + BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, -- cgit v1.2.3 From 039c635f4e666b647df2100038de276a83fb3fca Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:56 -0800 Subject: ide, scsi: Tell the block layer at request allocation time about preempt requests Convert blk_get_request(q, op, __GFP_RECLAIM) into blk_get_request_flags(q, op, BLK_MQ_PREEMPT). This patch does not change any functionality. Signed-off-by: Bart Van Assche Tested-by: Martin Steigerwald Acked-by: David S. Miller [ for IDE ] Acked-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Tested-by: Oleksandr Natalenko Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/ide/ide-pm.c | 4 ++-- drivers/scsi/scsi_lib.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 544f02d673ca2..f56d742908df6 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -89,9 +89,9 @@ int generic_ide_resume(struct device *dev) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN, + BLK_MQ_REQ_PREEMPT); ide_req(rq)->type = ATA_PRIV_PM_RESUME; - rq->rq_flags |= RQF_PREEMPT; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 286ea983c9e35..eb129dfc2ebef 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -252,9 +252,9 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, struct scsi_request *rq; int ret = DRIVER_ERROR << 24; - req = blk_get_request(sdev->request_queue, + req = blk_get_request_flags(sdev->request_queue, data_direction == DMA_TO_DEVICE ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM); + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT); if (IS_ERR(req)) return ret; rq = scsi_req(req); @@ -268,7 +268,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, rq->retries = retries; req->timeout = timeout; req->cmd_flags |= flags; - req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT; + req->rq_flags |= rq_flags | RQF_QUIET; /* * head injection *required* here otherwise quiesce won't work -- cgit v1.2.3 From c9254f2ddb19387ea9714a57ea48463c20333b92 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:57 -0800 Subject: block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag This flag will be used in the next patch to let the block layer core know whether or not a SCSI request queue has been quiesced. A quiesced SCSI queue namely only processes RQF_PREEMPT requests. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 30 ++++++++++++++++++++++++++++++ block/blk-mq-debugfs.c | 1 + include/linux/blkdev.h | 6 ++++++ 3 files changed, 37 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 17eed16a6e040..edc2768991165 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -348,6 +348,36 @@ void blk_sync_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_sync_queue); +/** + * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY + * @q: request queue pointer + * + * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not + * set and 1 if the flag was already set. 
+ */ +int blk_set_preempt_only(struct request_queue *q) +{ + unsigned long flags; + int res; + + spin_lock_irqsave(q->queue_lock, flags); + res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + return res; +} +EXPORT_SYMBOL_GPL(blk_set_preempt_only); + +void blk_clear_preempt_only(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(blk_clear_preempt_only); + /** * __blk_run_queue_uncond - run a queue whether or not it has been stopped * @q: The queue to run diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index bb7f084152033..e4f2bb936e663 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -74,6 +74,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), + QUEUE_FLAG_NAME(PREEMPT_ONLY), }; #undef QUEUE_FLAG_NAME diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1af5ddd0f6318..2147e2381a22c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -632,6 +632,7 @@ struct request_queue { #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ +#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ @@ -732,6 +733,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) +#define blk_queue_preempt_only(q) \ + test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) + +extern int blk_set_preempt_only(struct request_queue *q); +extern void blk_clear_preempt_only(struct request_queue *q); static inline bool blk_account_rq(struct request *rq) { -- cgit v1.2.3 From 3a0a529971ec4e2d933e9c7798db101dfb6b1aec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:58 -0800 Subject: block, scsi: Make SCSI quiesce and resume work reliably The contexts from which a SCSI device can be quiesced or resumed are: * Writing into /sys/class/scsi_device/*/device/state. * SCSI parallel (SPI) domain validation. * The SCSI device power management methods. See also scsi_bus_pm_ops. It is essential during suspend and resume that neither the filesystem state nor the filesystem metadata in RAM changes. This is why while the hibernation image is being written or restored that SCSI devices are quiesced. The SCSI core quiesces devices through scsi_device_quiesce() and scsi_device_resume(). In the SDEV_QUIESCE state execution of non-preempt requests is deferred. This is realized by returning BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI devices. Avoid that a full queue prevents power management requests to be submitted by deferring allocation of non-preempt requests for devices in the quiesced state. This patch has been tested by running the following commands and by verifying that after each resume the fio job was still running: for ((i=0; i<10; i++)); do ( cd /sys/block/md0/md && while true; do [ "$(<sync_action)" = "idle" ] && echo check > sync_action sleep 1 done ) & pids=($!)
for d in /sys/class/block/sd*[a-z]; do bdev=${d#/sys/class/block/} hcil=$(readlink "$d/device") hcil=${hcil#../../../} echo 4 > "$d/queue/nr_requests" echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth" fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 \ --rw=randread --ioengine=libaio --numjobs=4 --iodepth=16 \ --iodepth_batch=1 --thread --loops=$((2**31)) & pids+=($!) done sleep 1 echo "$(date) Hibernating ..." >>hibernate-test-log.txt systemctl hibernate sleep 10 kill "${pids[@]}" echo idle > /sys/block/md0/md/sync_action wait echo "$(date) Done." >>hibernate-test-log.txt done Reported-by: Oleksandr Natalenko References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Martin K. Petersen Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++++++++++++++++++-------- block/blk-mq.c | 4 ++-- drivers/scsi/scsi_lib.c | 42 ++++++++++++++++++++++++++++++------------ fs/block_dev.c | 4 ++-- include/linux/blkdev.h | 2 +- include/scsi/scsi_device.h | 1 + 6 files changed, 70 insertions(+), 25 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index edc2768991165..29b08428ae459 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -374,6 +374,7 @@ void blk_clear_preempt_only(struct request_queue *q) spin_lock_irqsave(q->queue_lock, flags); queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + wake_up_all(&q->mq_freeze_wq); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_clear_preempt_only); @@ -795,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, bool nowait) +/** + * blk_queue_enter() - try to increase q->q_usage_counter + * @q: request queue pointer + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT + */ +int blk_queue_enter(struct request_queue *q, unsigned int flags) { + const bool preempt = flags & BLK_MQ_REQ_PREEMPT; + while (true) { + bool success = false; int ret; - if (percpu_ref_tryget_live(&q->q_usage_counter)) + rcu_read_lock_sched(); + if (percpu_ref_tryget_live(&q->q_usage_counter)) { + /* + * The code that sets the PREEMPT_ONLY flag is + * responsible for ensuring that that flag is globally + * visible before the queue is unfrozen. 
+ */ + if (preempt || !blk_queue_preempt_only(q)) { + success = true; + } else { + percpu_ref_put(&q->q_usage_counter); + } + } + rcu_read_unlock_sched(); + + if (success) return 0; - if (nowait) + if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* @@ -816,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait) smp_rmb(); ret = wait_event_interruptible(q->mq_freeze_wq, - !atomic_read(&q->mq_freeze_depth) || + (atomic_read(&q->mq_freeze_depth) == 0 && + (preempt || !blk_queue_preempt_only(q))) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; @@ -1445,8 +1470,7 @@ static struct request *blk_old_get_request(struct request_queue *q, /* create ioc upfront */ create_io_context(gfp_mask, q->node); - ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) || - (op & REQ_NOWAIT)); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); @@ -2267,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; + unsigned int flags = bio->bi_opf & REQ_NOWAIT ? + BLK_MQ_REQ_NOWAIT : 0; - if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { + if (likely(blk_queue_enter(q, flags) == 0)) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ @@ -2327,7 +2353,7 @@ blk_qc_t direct_make_request(struct bio *bio) if (!generic_make_request_checks(bio)) return BLK_QC_T_NONE; - if (unlikely(blk_queue_enter(q, nowait))) { + if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) bio->bi_status = BLK_STS_AGAIN; else diff --git a/block/blk-mq.c b/block/blk-mq.c index e21876778cecd..211bc8a3e2cc4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -389,7 +389,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, struct request *rq; int ret; - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); @@ -428,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); - ret = blk_queue_enter(q, true); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index eb129dfc2ebef..f907e2f8c1ddb 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2947,21 +2947,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev) int scsi_device_quiesce(struct scsi_device *sdev) { + struct request_queue *q = sdev->request_queue; int err; + /* + * It is allowed to call scsi_device_quiesce() multiple times from + * the same context but concurrent scsi_device_quiesce() calls are + * not allowed. + */ + WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); + + blk_set_preempt_only(q); + + blk_mq_freeze_queue(q); + /* + * Ensure that the effect of blk_set_preempt_only() will be visible + * for percpu_ref_tryget() callers that occur after the queue + * unfreeze even if the queue was already frozen before this function + * was called. See also https://lwn.net/Articles/573497/. 
+ */ + synchronize_rcu(); + blk_mq_unfreeze_queue(q); + mutex_lock(&sdev->state_mutex); err = scsi_device_set_state(sdev, SDEV_QUIESCE); + if (err == 0) + sdev->quiesced_by = current; + else + blk_clear_preempt_only(q); mutex_unlock(&sdev->state_mutex); - if (err) - return err; - - scsi_run_queue(sdev->request_queue); - while (atomic_read(&sdev->device_busy)) { - msleep_interruptible(200); - scsi_run_queue(sdev->request_queue); - } - return 0; + return err; } EXPORT_SYMBOL(scsi_device_quiesce); @@ -2981,9 +2997,11 @@ void scsi_device_resume(struct scsi_device *sdev) * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); - if (sdev->sdev_state == SDEV_QUIESCE && - scsi_device_set_state(sdev, SDEV_RUNNING) == 0) - scsi_run_queue(sdev->request_queue); + WARN_ON_ONCE(!sdev->quiesced_by); + sdev->quiesced_by = NULL; + blk_clear_preempt_only(sdev->request_queue); + if (sdev->sdev_state == SDEV_QUIESCE) + scsi_device_set_state(sdev, SDEV_RUNNING); mutex_unlock(&sdev->state_mutex); } EXPORT_SYMBOL(scsi_device_resume); diff --git a/fs/block_dev.c b/fs/block_dev.c index 4afa4d5ff969e..04973f4844224 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -662,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); @@ -698,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2147e2381a22c..402c9d536ae10 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, bool nowait); +extern int blk_queue_enter(struct request_queue *q, unsigned int flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 82e93ee94708c..6f0f1e242e236 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -219,6 +219,7 @@ struct scsi_device { unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; + struct task_struct *quiesced_by; unsigned long sdev_data[0]; } __attribute__((aligned(sizeof(unsigned long)))); -- cgit v1.2.3 From 9a95e4ef709533efac4aafcb8bddf73f96db50ed Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:59 -0800 Subject: block, nvme: Introduce blk_mq_req_flags_t Several block layer and NVMe core functions accept a combination of BLK_MQ_REQ_* flags through the 'flags' argument but there is no verification at compile time whether the right type of block layer flags is passed. Make it possible for sparse to verify this. This patch does not change any functionality. 
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Oleksandr Natalenko Cc: linux-nvme@lists.infradead.org Cc: Christoph Hellwig Cc: Johannes Thumshirn Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++------ block/blk-mq.c | 4 ++-- block/blk-mq.h | 2 +- drivers/nvme/host/core.c | 5 +++-- drivers/nvme/host/nvme.h | 5 +++-- include/linux/blk-mq.h | 17 +++++++++++------ include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 4 ++-- 8 files changed, 30 insertions(+), 21 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 29b08428ae459..7c54c195e79e8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -801,7 +801,7 @@ EXPORT_SYMBOL(blk_alloc_queue); * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT */ -int blk_queue_enter(struct request_queue *q, unsigned int flags) +int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool preempt = flags & BLK_MQ_REQ_PREEMPT; @@ -1225,7 +1225,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, unsigned int flags) + struct bio *bio, blk_mq_req_flags_t flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1408,7 +1408,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, unsigned int flags) + struct bio *bio, blk_mq_req_flags_t flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1458,7 +1458,7 @@ retry: /* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, unsigned int flags) + unsigned int op, blk_mq_req_flags_t flags) { struct request *rq; gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : @@ -1495,7 +1495,7 @@ static struct request *blk_old_get_request(struct request_queue *q, * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. */ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct request *req; @@ -2291,7 +2291,7 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; - unsigned int flags = bio->bi_opf & REQ_NOWAIT ? + blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ? 
BLK_MQ_REQ_NOWAIT : 0; if (likely(blk_queue_enter(q, flags) == 0)) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 211bc8a3e2cc4..bfe24a5b62a36 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -383,7 +383,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; @@ -409,7 +409,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx) + unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; diff --git a/block/blk-mq.h b/block/blk-mq.h index 2502f40ccdc07..99a19c5523e2b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -111,7 +111,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - unsigned int flags; + blk_mq_req_flags_t flags; unsigned int shallow_depth; /* input & output parameter */ diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 683d890d73fa3..878d5c09d15b2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -305,7 +305,7 @@ static void nvme_put_ns(struct nvme_ns *ns) } struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid) + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) { unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; struct request *req; @@ -573,7 +573,8 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags) + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags) { struct request *req; int ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7b9cc7d616b75..23b8504ace7fa 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -327,14 +327,15 @@ int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid); + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags); + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 82b56609736ab..b326208277ee4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -209,16 +209,21 @@ void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { - 
BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ - BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ - BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ - BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ + /* return when out of requests */ + BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), + /* allocate from reserved pool */ + BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), + /* allocate internal/sched tag */ + BLK_MQ_REQ_INTERNAL = (__force blk_mq_req_flags_t)(1 << 2), + /* set RQF_PREEMPT */ + BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx); + unsigned int op, blk_mq_req_flags_t flags, + unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b04085255fb3..13ccfc9b210ac 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -162,6 +162,8 @@ struct bio { */ #define BIO_RESET_BITS BVEC_POOL_OFFSET +typedef __u32 __bitwise blk_mq_req_flags_t; + /* * Operations and flags common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 402c9d536ae10..e80ea1d31343d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -935,7 +935,7 @@ extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request_flags(struct request_queue *, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, unsigned int flags); +extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); -- cgit v1.2.3 From ab9e00cc72fa4c6eb6370757b9e94c4645e91d5d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:48:55 +0100 Subject: nvme: track subsystems This adds a new nvme_subsystem structure so that we can track multiple controllers that belong to a single subsystem. For now we only use it to store the NQN, and to check that we don't have duplicate NQNs unless the involved subsystems support multiple controllers. Includes code originally from Hannes Reinecke to expose the subsystems in sysfs. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 210 ++++++++++++++++++++++++++++++++++++++------ drivers/nvme/host/fabrics.c | 4 +- drivers/nvme/host/nvme.h | 27 ++++-- 3 files changed, 205 insertions(+), 36 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 878d5c09d15b2..78dc6f624d52f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -68,9 +68,14 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); struct workqueue_struct *nvme_wq; EXPORT_SYMBOL_GPL(nvme_wq); +static DEFINE_IDA(nvme_subsystems_ida); +static LIST_HEAD(nvme_subsystems); +static DEFINE_MUTEX(nvme_subsystems_lock); + static DEFINE_IDA(nvme_instance_ida); static dev_t nvme_chr_devt; static struct class *nvme_class; +static struct class *nvme_subsys_class; static void nvme_ns_remove(struct nvme_ns *ns); static int nvme_revalidate_disk(struct gendisk *disk); @@ -1804,14 +1809,15 @@ static bool quirk_matches(const struct nvme_id_ctrl *id, string_matches(id->fr, q->fr, sizeof(id->fr)); } -static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, + struct nvme_id_ctrl *id) { size_t nqnlen; int off; nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { - strncpy(ctrl->subnqn, id->subnqn, NVMF_NQN_SIZE); + strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); return; } @@ -1819,14 +1825,140 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ - off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE, + off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, "nqn.2014.08.org.nvmexpress:%4x%4x", le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); - memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn)); + memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); off += sizeof(id->sn); - memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn)); + memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); off += sizeof(id->mn); - memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); + memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); +} + +static void __nvme_release_subsystem(struct nvme_subsystem *subsys) +{ + ida_simple_remove(&nvme_subsystems_ida, subsys->instance); + kfree(subsys); +} + +static void nvme_release_subsystem(struct device *dev) +{ + __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev)); +} + +static void nvme_destroy_subsystem(struct kref *ref) +{ + struct nvme_subsystem *subsys = + container_of(ref, struct nvme_subsystem, ref); + + mutex_lock(&nvme_subsystems_lock); + list_del(&subsys->entry); + mutex_unlock(&nvme_subsystems_lock); + + device_del(&subsys->dev); + put_device(&subsys->dev); +} + +static void nvme_put_subsystem(struct nvme_subsystem *subsys) +{ + kref_put(&subsys->ref, nvme_destroy_subsystem); +} + +static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) +{ + struct nvme_subsystem *subsys; + + lockdep_assert_held(&nvme_subsystems_lock); + + list_for_each_entry(subsys, &nvme_subsystems, entry) { + if (strcmp(subsys->subnqn, subsysnqn)) + continue; + if (!kref_get_unless_zero(&subsys->ref)) + continue; + return subsys; + } + + return NULL; +} + +static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + struct nvme_subsystem *subsys, *found; + int ret; + + 
subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return -ENOMEM; + ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL); + if (ret < 0) { + kfree(subsys); + return ret; + } + subsys->instance = ret; + mutex_init(&subsys->lock); + kref_init(&subsys->ref); + INIT_LIST_HEAD(&subsys->ctrls); + nvme_init_subnqn(subsys, ctrl, id); + memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); + memcpy(subsys->model, id->mn, sizeof(subsys->model)); + memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); + subsys->vendor_id = le16_to_cpu(id->vid); + subsys->cmic = id->cmic; + + subsys->dev.class = nvme_subsys_class; + subsys->dev.release = nvme_release_subsystem; + dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); + device_initialize(&subsys->dev); + + mutex_lock(&nvme_subsystems_lock); + found = __nvme_find_get_subsystem(subsys->subnqn); + if (found) { + /* + * Verify that the subsystem actually supports multiple + * controllers, else bail out. + */ + if (!(id->cmic & (1 << 1))) { + dev_err(ctrl->device, + "ignoring ctrl due to duplicate subnqn (%s).\n", + found->subnqn); + nvme_put_subsystem(found); + ret = -EINVAL; + goto out_unlock; + } + + __nvme_release_subsystem(subsys); + subsys = found; + } else { + ret = device_add(&subsys->dev); + if (ret) { + dev_err(ctrl->device, + "failed to register subsystem device.\n"); + goto out_unlock; + } + list_add_tail(&subsys->entry, &nvme_subsystems); + } + + ctrl->subsys = subsys; + mutex_unlock(&nvme_subsystems_lock); + + if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, + dev_name(ctrl->device))) { + dev_err(ctrl->device, + "failed to create sysfs link from subsystem.\n"); + /* the transport driver will eventually put the subsystem */ + return -EINVAL; + } + + mutex_lock(&subsys->lock); + list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + mutex_unlock(&subsys->lock); + + return 0; + +out_unlock: + mutex_unlock(&nvme_subsystems_lock); + put_device(&subsys->dev); + return ret; } static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, @@ -1901,9 +2033,13 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return ret; } - nvme_init_subnqn(ctrl, id); - if (!ctrl->identified) { + int i; + + ret = nvme_init_subsystem(ctrl, id); + if (ret) + goto out_free; + /* * Check for quirks. Quirk can depend on firmware version, * so, in principle, the set of quirks present can change @@ -1912,9 +2048,6 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) * the device, but we'd have to make sure that the driver * behaves intelligently if the quirks change. 
*/ - - int i; - for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { if (quirk_matches(id, &core_quirks[i])) ctrl->quirks |= core_quirks[i].quirks; @@ -1927,14 +2060,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } ctrl->oacs = le16_to_cpu(id->oacs); - ctrl->vid = le16_to_cpu(id->vid); ctrl->oncs = le16_to_cpup(&id->oncs); atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; ctrl->cntlid = le16_to_cpup(&id->cntlid); - memcpy(ctrl->serial, id->sn, sizeof(id->sn)); - memcpy(ctrl->model, id->mn, sizeof(id->mn)); - memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); if (id->mdts) max_hw_sectors = 1 << (id->mdts + page_shift - 9); else @@ -2137,9 +2266,9 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ctrl *ctrl = ns->ctrl; - int serial_len = sizeof(ctrl->serial); - int model_len = sizeof(ctrl->model); + struct nvme_subsystem *subsys = ns->ctrl->subsys; + int serial_len = sizeof(subsys->serial); + int model_len = sizeof(subsys->model); if (!uuid_is_null(&ns->uuid)) return sprintf(buf, "uuid.%pU\n", &ns->uuid); @@ -2150,15 +2279,16 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) return sprintf(buf, "eui.%8phN\n", ns->eui); - while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' || - ctrl->serial[serial_len - 1] == '\0')) + while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || + subsys->serial[serial_len - 1] == '\0')) serial_len--; - while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' || - ctrl->model[model_len - 1] == '\0')) + while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || + subsys->model[model_len - 1] == '\0')) model_len--; - return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid, - serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id); + return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, + serial_len, subsys->serial, model_len, subsys->model, + ns->ns_id); } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); @@ -2244,10 +2374,15 @@ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ + return sprintf(buf, "%.*s\n", \ + (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); +nvme_show_str_function(model); +nvme_show_str_function(serial); +nvme_show_str_function(firmware_rev); + #define nvme_show_int_function(field) \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -2257,9 +2392,6 @@ static ssize_t field##_show(struct device *dev, \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); -nvme_show_str_function(model); -nvme_show_str_function(serial); -nvme_show_str_function(firmware_rev); nvme_show_int_function(cntlid); static ssize_t nvme_sysfs_delete(struct device *dev, @@ -2313,7 +2445,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn); + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn); } static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); @@ -2817,12 +2949,23 @@ static void nvme_free_ctrl(struct device *dev) { struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); + 
struct nvme_subsystem *subsys = ctrl->subsys; ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); kfree(ctrl->effects); + if (subsys) { + mutex_lock(&subsys->lock); + list_del(&ctrl->subsys_entry); + mutex_unlock(&subsys->lock); + sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); + } + ctrl->ops->free_ctrl(ctrl); + + if (subsys) + nvme_put_subsystem(subsys); } /* @@ -3022,8 +3165,15 @@ int __init nvme_core_init(void) goto unregister_chrdev; } + nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); + if (IS_ERR(nvme_subsys_class)) { + result = PTR_ERR(nvme_subsys_class); + goto destroy_class; + } return 0; +destroy_class: + class_destroy(nvme_class); unregister_chrdev: unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_wq: @@ -3033,6 +3183,8 @@ destroy_wq: void nvme_core_exit(void) { + ida_destroy(&nvme_subsystems_ida); + class_destroy(nvme_subsys_class); class_destroy(nvme_class); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_workqueue(nvme_wq); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index a4967de5bb255..76b4fe6816a03 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -882,10 +882,10 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_unlock; } - if (strcmp(ctrl->subnqn, opts->subsysnqn)) { + if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { dev_warn(ctrl->device, "controller returned incorrect NQN: \"%s\".\n", - ctrl->subnqn); + ctrl->subsys->subnqn); up_read(&nvmf_transports_rwsem); nvme_delete_ctrl_sync(ctrl); return ERR_PTR(-EINVAL); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 23b8504ace7fa..6ae50979c3aa0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -140,13 +140,12 @@ struct nvme_ctrl { struct work_struct reset_work; struct work_struct delete_work; + struct nvme_subsystem *subsys; + struct list_head subsys_entry; + struct opal_dev *opal_dev; char name[12]; - char serial[20]; - char model[40]; - char firmware_rev[8]; - char subnqn[NVMF_NQN_SIZE]; u16 cntlid; u32 ctrl_config; @@ -157,7 +156,6 @@ struct nvme_ctrl { u32 page_size; u32 max_hw_sectors; u16 oncs; - u16 vid; u16 oacs; u16 nssa; u16 nr_streams; @@ -200,6 +198,25 @@ struct nvme_ctrl { struct nvmf_ctrl_options *opts; }; +struct nvme_subsystem { + int instance; + struct device dev; + /* + * Because we unregister the device on the last put we need + * a separate refcount. + */ + struct kref ref; + struct list_head entry; + struct mutex lock; + struct list_head ctrls; + char subnqn[NVMF_NQN_SIZE]; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u8 cmic; + u16 vendor_id; +}; + struct nvme_ns { struct list_head list; -- cgit v1.2.3 From 002fab040468f5b279f8d8d49d487e73b0c3f98a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:50:16 +0100 Subject: nvme: introduce a nvme_ns_ids structure This allows us to manage the various uniqueue namespace identifiers together instead needing various variables and arguments. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 69 +++++++++++++++++++++++++++--------------------- drivers/nvme/host/nvme.h | 14 +++++++--- 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 78dc6f624d52f..04c0949c754ac 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -797,7 +797,7 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) } static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, - u8 *eui64, u8 *nguid, uuid_t *uuid) + struct nvme_ns_ids *ids) { struct nvme_command c = { }; int status; @@ -833,7 +833,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, goto free_data; } len = NVME_NIDT_EUI64_LEN; - memcpy(eui64, data + pos + sizeof(*cur), len); + memcpy(ids->eui64, data + pos + sizeof(*cur), len); break; case NVME_NIDT_NGUID: if (cur->nidl != NVME_NIDT_NGUID_LEN) { @@ -843,7 +843,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, goto free_data; } len = NVME_NIDT_NGUID_LEN; - memcpy(nguid, data + pos + sizeof(*cur), len); + memcpy(ids->nguid, data + pos + sizeof(*cur), len); break; case NVME_NIDT_UUID: if (cur->nidl != NVME_NIDT_UUID_LEN) { @@ -853,7 +853,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, goto free_data; } len = NVME_NIDT_UUID_LEN; - uuid_copy(uuid, data + pos + sizeof(*cur)); + uuid_copy(&ids->uuid, data + pos + sizeof(*cur)); break; default: /* Skip unnkown types */ @@ -1233,22 +1233,31 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, } static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, - struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid) + struct nvme_id_ns *id, struct nvme_ns_ids *ids) { + memset(ids, 0, sizeof(*ids)); + if (ctrl->vs >= NVME_VS(1, 1, 0)) - memcpy(eui64, id->eui64, sizeof(id->eui64)); + memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) - memcpy(nguid, id->nguid, sizeof(id->nguid)); + memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); if (ctrl->vs >= NVME_VS(1, 3, 0)) { /* Don't treat error as fatal we potentially * already have a NGUID or EUI-64 */ - if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid)) + if (nvme_identify_ns_descs(ctrl, nsid, ids)) dev_warn(ctrl->device, "%s: Identify Descriptors failed\n", __func__); } } +static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) +{ + return uuid_equal(&a->uuid, &b->uuid) && + memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; +} + static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { @@ -1304,8 +1313,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ns *id; - u8 eui64[8] = { 0 }, nguid[16] = { 0 }; - uuid_t uuid = uuid_null; + struct nvme_ns_ids ids; int ret = 0; if (test_bit(NVME_NS_DEAD, &ns->flags)) { @@ -1322,10 +1330,8 @@ static int nvme_revalidate_disk(struct gendisk *disk) goto out; } - nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid); - if (!uuid_equal(&ns->uuid, &uuid) || - memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) || - memcmp(&ns->eui, &eui64, sizeof(ns->eui))) { + nvme_report_ns_ids(ctrl, ns->ns_id, id, &ids); + if (!nvme_ns_ids_equal(&ns->ids, &ids)) { dev_err(ctrl->device, "identifiers changed for nsid 
%d\n", ns->ns_id); ret = -ENODEV; @@ -2266,18 +2272,19 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_ids *ids = &ns->ids; struct nvme_subsystem *subsys = ns->ctrl->subsys; int serial_len = sizeof(subsys->serial); int model_len = sizeof(subsys->model); - if (!uuid_is_null(&ns->uuid)) - return sprintf(buf, "uuid.%pU\n", &ns->uuid); + if (!uuid_is_null(&ids->uuid)) + return sprintf(buf, "uuid.%pU\n", &ids->uuid); - if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) - return sprintf(buf, "eui.%16phN\n", ns->nguid); + if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return sprintf(buf, "eui.%16phN\n", ids->nguid); - if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) - return sprintf(buf, "eui.%8phN\n", ns->eui); + if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + return sprintf(buf, "eui.%8phN\n", ids->eui64); while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || subsys->serial[serial_len - 1] == '\0')) @@ -2296,7 +2303,7 @@ static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->nguid); + return sprintf(buf, "%pU\n", ns->ids.nguid); } static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); @@ -2304,16 +2311,17 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_ids *ids = &ns->ids; /* For backward compatibility expose the NGUID to userspace if * we have no UUID set */ - if (uuid_is_null(&ns->uuid)) { + if (uuid_is_null(&ids->uuid)) { printk_ratelimited(KERN_WARNING "No UUID available providing old NGUID\n"); - return sprintf(buf, "%pU\n", ns->nguid); + return sprintf(buf, "%pU\n", ids->nguid); } - return sprintf(buf, "%pU\n", &ns->uuid); + return sprintf(buf, "%pU\n", &ids->uuid); } static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); @@ -2321,7 +2329,7 @@ static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8ph\n", ns->eui); + return sprintf(buf, "%8ph\n", ns->ids.eui64); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); @@ -2347,18 +2355,19 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, { struct device *dev = container_of(kobj, struct device, kobj); struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_ids *ids = &ns->ids; if (a == &dev_attr_uuid.attr) { - if (uuid_is_null(&ns->uuid) || - !memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + if (uuid_is_null(&ids->uuid) || + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) return 0; } if (a == &dev_attr_nguid.attr) { - if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) return 0; } if (a == &dev_attr_eui.attr) { - if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) + if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) return 0; } return a->mode; @@ -2591,7 +2600,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (id->ncap == 0) goto out_free_id; - nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); + nvme_report_ns_ids(ctrl, ns->ns_id, id, &ns->ids); if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk_name, node)) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6ae50979c3aa0..2f1af915f93a5 100644 --- 
a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -217,6 +217,15 @@ struct nvme_subsystem { u16 vendor_id; }; +/* + * Container structure for uniqueue namespace identifiers. + */ +struct nvme_ns_ids { + u8 eui64[8]; + u8 nguid[16]; + uuid_t uuid; +}; + struct nvme_ns { struct list_head list; @@ -227,11 +236,8 @@ struct nvme_ns { struct kref kref; int instance; - u8 eui[8]; - u8 nguid[16]; - uuid_t uuid; - unsigned ns_id; + struct nvme_ns_ids ids; int lba_shift; u16 ms; u16 sgs; -- cgit v1.2.3 From ed754e5deeb17f4e675c84e4b6c640cc7344e498 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:50:43 +0100 Subject: nvme: track shared namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new struct nvme_ns_head that holds information about an actual namespace, unlike struct nvme_ns, which only holds the per-controller namespace information. For private namespaces there is a 1:1 relation of the two, but for shared namespaces this lets us discover all the paths to it. For now only the identifiers are moved to the new structure, but most of the information in struct nvme_ns should eventually move over. To allow lockless path lookup the list of nvme_ns structures per nvme_ns_head is protected by SRCU, which requires freeing the nvme_ns structure through call_srcu. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 220 +++++++++++++++++++++++++++++++++++-------- drivers/nvme/host/lightnvm.c | 14 +-- drivers/nvme/host/nvme.h | 26 ++++- 3 files changed, 210 insertions(+), 50 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 04c0949c754ac..13676f6cd4f67 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -291,6 +291,22 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); +static void nvme_free_ns_head(struct kref *ref) +{ + struct nvme_ns_head *head = + container_of(ref, struct nvme_ns_head, ref); + + ida_simple_remove(&head->subsys->ns_ida, head->instance); + list_del_init(&head->entry); + cleanup_srcu_struct(&head->srcu); + kfree(head); +} + +static void nvme_put_ns_head(struct nvme_ns_head *head) +{ + kref_put(&head->ref, nvme_free_ns_head); +} + static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); @@ -299,7 +315,7 @@ static void nvme_free_ns(struct kref *kref) nvme_nvm_unregister(ns); put_disk(ns->disk); - ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); + nvme_put_ns_head(ns->head); nvme_put_ctrl(ns->ctrl); kfree(ns); } @@ -435,7 +451,7 @@ static inline void nvme_setup_flush(struct nvme_ns *ns, { memset(cmnd, 0, sizeof(*cmnd)); cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.nsid = cpu_to_le32(ns->ns_id); + cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); } static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, @@ -466,7 +482,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); cmnd->dsm.nr = cpu_to_le32(segments - 1); cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); @@ -495,7 +511,7 @@ static inline blk_status_t 
nvme_setup_rw(struct nvme_ns *ns, memset(cmnd, 0, sizeof(*cmnd)); cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); - cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); @@ -987,7 +1003,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) memset(&c, 0, sizeof(c)); c.rw.opcode = io.opcode; c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.nsid = cpu_to_le32(ns->head->ns_id); c.rw.slba = cpu_to_le64(io.slba); c.rw.length = cpu_to_le16(io.nblocks); c.rw.control = cpu_to_le16(io.control); @@ -1130,7 +1146,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); - return ns->ns_id; + return ns->head->ns_id; case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: @@ -1251,6 +1267,13 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, } } +static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) +{ + return !uuid_is_null(&ids->uuid) || + memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || + memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); +} + static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && @@ -1321,7 +1344,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) return -ENODEV; } - id = nvme_identify_ns(ctrl, ns->ns_id); + id = nvme_identify_ns(ctrl, ns->head->ns_id); if (!id) return -ENODEV; @@ -1330,10 +1353,10 @@ static int nvme_revalidate_disk(struct gendisk *disk) goto out; } - nvme_report_ns_ids(ctrl, ns->ns_id, id, &ids); - if (!nvme_ns_ids_equal(&ns->ids, &ids)) { + nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { dev_err(ctrl->device, - "identifiers changed for nsid %d\n", ns->ns_id); + "identifiers changed for nsid %d\n", ns->head->ns_id); ret = -ENODEV; } @@ -1374,7 +1397,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10, memset(&c, 0, sizeof(c)); c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.nsid = cpu_to_le32(ns->head->ns_id); c.common.cdw10[0] = cpu_to_le32(cdw10); return nvme_submit_sync_cmd(ns->queue, &c, data, 16); @@ -1861,6 +1884,7 @@ static void nvme_destroy_subsystem(struct kref *ref) list_del(&subsys->entry); mutex_unlock(&nvme_subsystems_lock); + ida_destroy(&subsys->ns_ida); device_del(&subsys->dev); put_device(&subsys->dev); } @@ -1904,6 +1928,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) mutex_init(&subsys->lock); kref_init(&subsys->ref); INIT_LIST_HEAD(&subsys->ctrls); + INIT_LIST_HEAD(&subsys->nsheads); nvme_init_subnqn(subsys, ctrl, id); memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); memcpy(subsys->model, id->mn, sizeof(subsys->model)); @@ -1941,6 +1966,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) "failed to register subsystem device.\n"); goto out_unlock; } + ida_init(&subsys->ns_ida); list_add_tail(&subsys->entry, &nvme_subsystems); } @@ -2272,7 +2298,7 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->ids; + struct nvme_ns_ids *ids = &ns->head->ids; struct nvme_subsystem *subsys = 
ns->ctrl->subsys; int serial_len = sizeof(subsys->serial); int model_len = sizeof(subsys->model); @@ -2295,7 +2321,7 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, serial_len, subsys->serial, model_len, subsys->model, - ns->ns_id); + ns->head->ns_id); } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); @@ -2303,7 +2329,7 @@ static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->ids.nguid); + return sprintf(buf, "%pU\n", ns->head->ids.nguid); } static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); @@ -2311,7 +2337,7 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->ids; + struct nvme_ns_ids *ids = &ns->head->ids; /* For backward compatibility expose the NGUID to userspace if * we have no UUID set @@ -2329,7 +2355,7 @@ static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8ph\n", ns->ids.eui64); + return sprintf(buf, "%8ph\n", ns->head->ids.eui64); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); @@ -2337,7 +2363,7 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%d\n", ns->ns_id); + return sprintf(buf, "%d\n", ns->head->ns_id); } static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); @@ -2355,7 +2381,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, { struct device *dev = container_of(kobj, struct device, kobj); struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->ids; + struct nvme_ns_ids *ids = &ns->head->ids; if (a == &dev_attr_uuid.attr) { if (uuid_is_null(&ids->uuid) || @@ -2507,12 +2533,124 @@ static const struct attribute_group *nvme_dev_attr_groups[] = { NULL, }; +static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys, + unsigned nsid) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(h, &subsys->nsheads, entry) { + if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) + return h; + } + + return NULL; +} + +static int __nvme_check_ids(struct nvme_subsystem *subsys, + struct nvme_ns_head *new) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(h, &subsys->nsheads, entry) { + if (nvme_ns_ids_valid(&new->ids) && + nvme_ns_ids_equal(&new->ids, &h->ids)) + return -EINVAL; + } + + return 0; +} + +static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_id_ns *id) +{ + struct nvme_ns_head *head; + int ret = -ENOMEM; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto out; + ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto out_free_head; + head->instance = ret; + INIT_LIST_HEAD(&head->list); + init_srcu_struct(&head->srcu); + head->subsys = ctrl->subsys; + head->ns_id = nsid; + kref_init(&head->ref); + + nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + + ret = __nvme_check_ids(ctrl->subsys, head); + if (ret) { + dev_err(ctrl->device, + "duplicate IDs for nsid %d\n", nsid); + goto out_cleanup_srcu; + } + + list_add_tail(&head->entry, &ctrl->subsys->nsheads); + return head; +out_cleanup_srcu: 
+ cleanup_srcu_struct(&head->srcu); + ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); +out_free_head: + kfree(head); +out: + return ERR_PTR(ret); +} + +static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, + struct nvme_id_ns *id, bool *new) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + bool is_shared = id->nmic & (1 << 0); + struct nvme_ns_head *head = NULL; + int ret = 0; + + mutex_lock(&ctrl->subsys->lock); + if (is_shared) + head = __nvme_find_ns_head(ctrl->subsys, nsid); + if (!head) { + head = nvme_alloc_ns_head(ctrl, nsid, id); + if (IS_ERR(head)) { + ret = PTR_ERR(head); + goto out_unlock; + } + + *new = true; + } else { + struct nvme_ns_ids ids; + + nvme_report_ns_ids(ctrl, nsid, id, &ids); + if (!nvme_ns_ids_equal(&head->ids, &ids)) { + dev_err(ctrl->device, + "IDs don't match for shared namespace %d\n", + nsid); + ret = -EINVAL; + goto out_unlock; + } + + *new = false; + } + + list_add_tail(&ns->siblings, &head->list); + ns->head = head; + +out_unlock: + mutex_unlock(&ctrl->subsys->lock); + return ret; +} + static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) { struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); - return nsa->ns_id - nsb->ns_id; + return nsa->head->ns_id - nsb->head->ns_id; } static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) @@ -2521,13 +2659,13 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->ns_id == nsid) { + if (ns->head->ns_id == nsid) { if (!kref_get_unless_zero(&ns->kref)) continue; ret = ns; break; } - if (ns->ns_id > nsid) + if (ns->head->ns_id > nsid) break; } mutex_unlock(&ctrl->namespaces_mutex); @@ -2542,7 +2680,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) if (!ctrl->nr_streams) return 0; - ret = nvme_get_stream_params(ctrl, &s, ns->ns_id); + ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); if (ret) return ret; @@ -2567,32 +2705,26 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; int node = dev_to_node(ctrl->dev); + bool new = true; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) return; - ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL); - if (ns->instance < 0) - goto out_free_ns; - ns->queue = blk_mq_init_queue(ctrl->tagset); if (IS_ERR(ns->queue)) - goto out_release_instance; + goto out_free_ns; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); ns->queue->queuedata = ns; ns->ctrl = ctrl; kref_init(&ns->kref); - ns->ns_id = nsid; ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); nvme_setup_streams_ns(ctrl, ns); - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); - id = nvme_identify_ns(ctrl, nsid); if (!id) goto out_free_queue; @@ -2600,18 +2732,21 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (id->ncap == 0) goto out_free_id; - nvme_report_ns_ids(ctrl, ns->ns_id, id, &ns->ids); + if (nvme_init_ns_head(ns, nsid, id, &new)) + goto out_free_id; + + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); - 
goto out_free_id; + goto out_unlink_ns; } } disk = alloc_disk_node(0, node); if (!disk) - goto out_free_id; + goto out_unlink_ns; disk->fops = &nvme_fops; disk->private_data = ns; @@ -2639,18 +2774,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) pr_warn("%s: failed to register lightnvm sysfs group for identification\n", ns->disk->disk_name); return; + out_unlink_ns: + mutex_lock(&ctrl->subsys->lock); + list_del_rcu(&ns->siblings); + mutex_unlock(&ctrl->subsys->lock); out_free_id: kfree(id); out_free_queue: blk_cleanup_queue(ns->queue); - out_release_instance: - ida_simple_remove(&ctrl->ns_ida, ns->instance); out_free_ns: kfree(ns); } static void nvme_ns_remove(struct nvme_ns *ns) { + struct nvme_ns_head *head = ns->head; + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; @@ -2665,10 +2804,16 @@ static void nvme_ns_remove(struct nvme_ns *ns) blk_cleanup_queue(ns->queue); } + mutex_lock(&ns->ctrl->subsys->lock); + if (head) + list_del_rcu(&ns->siblings); + mutex_unlock(&ns->ctrl->subsys->lock); + mutex_lock(&ns->ctrl->namespaces_mutex); list_del_init(&ns->list); mutex_unlock(&ns->ctrl->namespaces_mutex); + synchronize_srcu(&head->srcu); nvme_put_ns(ns); } @@ -2691,7 +2836,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, struct nvme_ns *ns, *next; list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { - if (ns->ns_id > nsid) + if (ns->head->ns_id > nsid) nvme_ns_remove(ns); } } @@ -2961,7 +3106,6 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_subsystem *subsys = ctrl->subsys; ida_simple_remove(&nvme_instance_ida, ctrl->instance); - ida_destroy(&ctrl->ns_ida); kfree(ctrl->effects); if (subsys) { @@ -3022,8 +3166,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, if (ret) goto out_free_name; - ida_init(&ctrl->ns_ida); - /* * Initialize latency tolerance controls. The sysfs files won't * be visible to userspace unless the device actually supports APST. 
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 8fc949c5b49b7..ba3d7f3349e5d 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -305,7 +305,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) int ret; c.identity.opcode = nvme_nvm_admin_identity; - c.identity.nsid = cpu_to_le32(ns->ns_id); + c.identity.nsid = cpu_to_le32(ns->head->ns_id); c.identity.chnl_off = 0; nvme_nvm_id = kmalloc(sizeof(struct nvme_nvm_id), GFP_KERNEL); @@ -344,7 +344,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, int ret = 0; c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; - c.l2p.nsid = cpu_to_le32(ns->ns_id); + c.l2p.nsid = cpu_to_le32(ns->head->ns_id); entries = kmalloc(len, GFP_KERNEL); if (!entries) return -ENOMEM; @@ -402,7 +402,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, int ret = 0; c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; - c.get_bb.nsid = cpu_to_le32(ns->ns_id); + c.get_bb.nsid = cpu_to_le32(ns->head->ns_id); c.get_bb.spba = cpu_to_le64(ppa.ppa); bb_tbl = kzalloc(tblsz, GFP_KERNEL); @@ -452,7 +452,7 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, int ret = 0; c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; - c.set_bb.nsid = cpu_to_le32(ns->ns_id); + c.set_bb.nsid = cpu_to_le32(ns->head->ns_id); c.set_bb.spba = cpu_to_le64(ppas->ppa); c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); c.set_bb.value = type; @@ -469,7 +469,7 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, struct nvme_nvm_command *c) { c->ph_rw.opcode = rqd->opcode; - c->ph_rw.nsid = cpu_to_le32(ns->ns_id); + c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id); c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); c->ph_rw.control = cpu_to_le16(rqd->flags); @@ -731,7 +731,7 @@ static int nvme_nvm_submit_vio(struct nvme_ns *ns, memset(&c, 0, sizeof(c)); c.ph_rw.opcode = vio.opcode; - c.ph_rw.nsid = cpu_to_le32(ns->ns_id); + c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id); c.ph_rw.control = cpu_to_le16(vio.control); c.ph_rw.length = cpu_to_le16(vio.nppas); @@ -768,7 +768,7 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, memset(&c, 0, sizeof(c)); c.common.opcode = vcmd.opcode; - c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.nsid = cpu_to_le32(ns->head->ns_id); c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); /* cdw11-12 */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2f1af915f93a5..6e5004b009753 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -136,7 +136,6 @@ struct nvme_ctrl { struct device ctrl_device; struct device *device; /* char device */ struct cdev cdev; - struct ida ns_ida; struct work_struct reset_work; struct work_struct delete_work; @@ -209,12 +208,14 @@ struct nvme_subsystem { struct list_head entry; struct mutex lock; struct list_head ctrls; + struct list_head nsheads; char subnqn[NVMF_NQN_SIZE]; char serial[20]; char model[40]; char firmware_rev[8]; u8 cmic; u16 vendor_id; + struct ida ns_ida; }; /* @@ -226,18 +227,35 @@ struct nvme_ns_ids { uuid_t uuid; }; +/* + * Anchor structure for namespaces. There is one for each namespace in a + * NVMe subsystem that any of our controllers can see, and the namespace + * structure for each controller is chained of it. 
For private namespaces + there is a 1:1 relation to our namespace structures, that is ->list + only ever has a single entry for private namespaces. + */ +struct nvme_ns_head { + struct list_head list; + struct srcu_struct srcu; + struct nvme_subsystem *subsys; + unsigned ns_id; + struct nvme_ns_ids ids; + struct list_head entry; + struct kref ref; + int instance; +}; + struct nvme_ns { struct list_head list; struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; + struct list_head siblings; struct nvm_dev *ndev; struct kref kref; - int instance; + struct nvme_ns_head *head; - unsigned ns_id; - struct nvme_ns_ids ids; int lba_shift; u16 ms; u16 sgs; -- cgit v1.2.3 From 32acab3181c7053c775ca128c3a5c6ce50197d7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 12:59:30 +0100 Subject: nvme: implement multipath access to nvme subsystems This patch adds native multipath support to the nvme driver. For each namespace we create only a single block device node, which can be used to access that namespace through any of the controllers that refer to it. The gendisk for each controller's path to the namespace still exists inside the kernel, but is hidden from userspace. The character device nodes are still available on a per-controller basis. A new link from the sysfs directory for the subsystem allows finding all controllers for a given subsystem. Currently we will always send I/O to the first available path; this will be changed once the NVMe Asynchronous Namespace Access (ANA) TP is ratified and implemented, at which point we will look at the ANA state for each namespace. Another possibility that was prototyped is to use the path that is closest to the submitting NUMA node, which will be mostly interesting for PCI, but might also be useful for RDMA or FC transports in the future. There is no plan to implement round robin or I/O service time path selectors, as those are not scalable with the performance rates provided by NVMe. The multipath device will go away once all paths to it disappear; any delay to keep it alive needs to be implemented at the controller level. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/Kconfig | 9 ++ drivers/nvme/host/Makefile | 1 + drivers/nvme/host/core.c | 147 +++++++++++++++--- drivers/nvme/host/multipath.c | 255 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 57 ++++++++++ 5 files changed, 455 insertions(+), 14 deletions(-) create mode 100644 drivers/nvme/host/multipath.c diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 46d6cb1e03bd0..b979cf3bce65f 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -13,6 +13,15 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. +config NVME_MULTIPATH + bool "NVMe multipath support" + depends on NVME_CORE + ---help--- + This option enables support for multipath access to NVMe + subsystems. If this option is enabled only a single + /dev/nvmeXnY device will show up for each NVMe namespaces, + even if it is accessible through multiple controllers. 
+ config NVME_FABRICS tristate diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index cc0aacb4c8b47..b856f2f549cda 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o nvme-core-y := core.o +nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-y += pci.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 13676f6cd4f67..59f80a613fd85 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -185,17 +185,22 @@ static inline bool nvme_req_needs_retry(struct request *req) return false; if (nvme_req(req)->retries >= nvme_max_retries) return false; - if (blk_queue_dying(req->q)) - return false; return true; } void nvme_complete_rq(struct request *req) { if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { - nvme_req(req)->retries++; - blk_mq_requeue_request(req, true); - return; + if (nvme_req_needs_failover(req)) { + nvme_failover_req(req); + return; + } + + if (!blk_queue_dying(req->q)) { + nvme_req(req)->retries++; + blk_mq_requeue_request(req, true); + return; + } } blk_mq_end_request(req, nvme_error_status(req)); @@ -286,7 +291,8 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, ctrl->state = new_state; spin_unlock_irqrestore(&ctrl->lock, flags); - + if (changed && ctrl->state == NVME_CTRL_LIVE) + nvme_kick_requeue_lists(ctrl); return changed; } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); @@ -296,6 +302,7 @@ static void nvme_free_ns_head(struct kref *ref) struct nvme_ns_head *head = container_of(ref, struct nvme_ns_head, ref); + nvme_mpath_remove_disk(head); ida_simple_remove(&head->subsys->ns_ida, head->instance); list_del_init(&head->entry); cleanup_srcu_struct(&head->srcu); @@ -1138,11 +1145,33 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return status; } -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +/* + * Issue ioctl requests on the first available path. Note that unlike normal + * block layer requests we will not retry failed request on another controller. 
+ */ +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, + struct nvme_ns_head **head, int *srcu_idx) { - struct nvme_ns *ns = bdev->bd_disk->private_data; +#ifdef CONFIG_NVME_MULTIPATH + if (disk->fops == &nvme_ns_head_ops) { + *head = disk->private_data; + *srcu_idx = srcu_read_lock(&(*head)->srcu); + return nvme_find_path(*head); + } +#endif + *head = NULL; + *srcu_idx = -1; + return disk->private_data; +} + +static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) +{ + if (head) + srcu_read_unlock(&head->srcu, idx); +} +static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg) +{ switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); @@ -1165,10 +1194,31 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, } } +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + ret = -EWOULDBLOCK; + else + ret = nvme_ns_ioctl(ns, cmd, arg); + nvme_put_ns_from_disk(head, srcu_idx); + return ret; +} + static int nvme_open(struct block_device *bdev, fmode_t mode) { struct nvme_ns *ns = bdev->bd_disk->private_data; +#ifdef CONFIG_NVME_MULTIPATH + /* should never be called due to GENHD_FL_HIDDEN */ + if (WARN_ON_ONCE(ns->head->disk)) + return -ENXIO; +#endif if (!kref_get_unless_zero(&ns->kref)) return -ENXIO; return 0; @@ -1329,6 +1379,10 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->noiob) nvme_set_chunk_size(ns); nvme_update_disk_info(disk, ns, id); +#ifdef CONFIG_NVME_MULTIPATH + if (ns->head->disk) + nvme_update_disk_info(ns->head->disk, ns, id); +#endif } static int nvme_revalidate_disk(struct gendisk *disk) @@ -1388,8 +1442,10 @@ static char nvme_pr_type(enum pr_type type) static int nvme_pr_command(struct block_device *bdev, u32 cdw10, u64 key, u64 sa_key, u8 op) { - struct nvme_ns *ns = bdev->bd_disk->private_data; + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; struct nvme_command c; + int srcu_idx, ret; u8 data[16] = { 0, }; put_unaligned_le64(key, &data[0]); @@ -1397,10 +1453,16 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10, memset(&c, 0, sizeof(c)); c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->head->ns_id); + c.common.nsid = cpu_to_le32(head->ns_id); c.common.cdw10[0] = cpu_to_le32(cdw10); - return nvme_submit_sync_cmd(ns->queue, &c, data, 16); + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + ret = -EWOULDBLOCK; + else + ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); + nvme_put_ns_from_disk(head, srcu_idx); + return ret; } static int nvme_pr_register(struct block_device *bdev, u64 old, @@ -1490,6 +1552,32 @@ static const struct block_device_operations nvme_fops = { .pr_ops = &nvme_pr_ops, }; +#ifdef CONFIG_NVME_MULTIPATH +static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +{ + struct nvme_ns_head *head = bdev->bd_disk->private_data; + + if (!kref_get_unless_zero(&head->ref)) + return -ENXIO; + return 0; +} + +static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns_head(disk->private_data); +} + +const struct block_device_operations nvme_ns_head_ops = { + .owner = THIS_MODULE, + .open = nvme_ns_head_open, + .release = nvme_ns_head_release, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_ioctl, + .getgeo = nvme_getgeo, + .pr_ops = 
&nvme_pr_ops, +}; +#endif /* CONFIG_NVME_MULTIPATH */ + static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) { unsigned long timeout = @@ -2592,6 +2680,10 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_cleanup_srcu; } + ret = nvme_mpath_alloc_disk(ctrl, head); + if (ret) + goto out_cleanup_srcu; + list_add_tail(&head->entry, &ctrl->subsys->nsheads); return head; out_cleanup_srcu: @@ -2704,7 +2796,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = dev_to_node(ctrl->dev); + int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; bool new = true; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); @@ -2735,7 +2827,30 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (nvme_init_ns_head(ns, nsid, id, &new)) goto out_free_id; +#ifdef CONFIG_NVME_MULTIPATH + /* + * If multipathing is enabled we need to always use the subsystem + * instance number for numbering our devices to avoid conflicts + * between subsystems that have multiple controllers and thus use + * the multipath-aware subsystem node and those that have a single + * controller and use the controller node directly. + */ + if (ns->head->disk) { + sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, + ctrl->cntlid, ns->head->instance); + flags = GENHD_FL_HIDDEN; + } else { + sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, + ns->head->instance); + } +#else + /* + * But without the multipath code enabled, multiple controller per + * subsystems are visible as devices and thus we cannot use the + * subsystem instance. + */ sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); +#endif if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk_name, node)) { @@ -2751,7 +2866,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; - disk->flags = GENHD_FL_EXT_DEVT; + disk->flags = flags; memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); ns->disk = disk; @@ -2773,6 +2888,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ns->ndev && nvme_nvm_register_sysfs(ns)) pr_warn("%s: failed to register lightnvm sysfs group for identification\n", ns->disk->disk_name); + + if (new) + nvme_mpath_add_disk(ns->head); return; out_unlink_ns: mutex_lock(&ctrl->subsys->lock); @@ -2805,6 +2923,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) } mutex_lock(&ns->ctrl->subsys->lock); + nvme_mpath_clear_current_path(ns); if (head) list_del_rcu(&ns->siblings); mutex_unlock(&ns->ctrl->subsys->lock); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c new file mode 100644 index 0000000000000..062754ebebfd6 --- /dev/null +++ b/drivers/nvme/host/multipath.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2017 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include "nvme.h" + +static bool multipath = true; +module_param(multipath, bool, 0644); +MODULE_PARM_DESC(multipath, + "turn on native support for multiple controllers per subsystem"); + +void nvme_failover_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + unsigned long flags; + + spin_lock_irqsave(&ns->head->requeue_lock, flags); + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + blk_mq_end_request(req, 0); + + nvme_reset_ctrl(ns->ctrl); + kblockd_schedule_work(&ns->head->requeue_work); +} + +bool nvme_req_needs_failover(struct request *req) +{ + if (!(req->cmd_flags & REQ_NVME_MPATH)) + return false; + + switch (nvme_req(req)->status & 0x7ff) { + /* + * Generic command status: + */ + case NVME_SC_INVALID_OPCODE: + case NVME_SC_INVALID_FIELD: + case NVME_SC_INVALID_NS: + case NVME_SC_LBA_RANGE: + case NVME_SC_CAP_EXCEEDED: + case NVME_SC_RESERVATION_CONFLICT: + return false; + + /* + * I/O command set specific error. Unfortunately these values are + * reused for fabrics commands, but those should never get here. + */ + case NVME_SC_BAD_ATTRIBUTES: + case NVME_SC_INVALID_PI: + case NVME_SC_READ_ONLY: + case NVME_SC_ONCS_NOT_SUPPORTED: + WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode == + nvme_fabrics_command); + return false; + + /* + * Media and Data Integrity Errors: + */ + case NVME_SC_WRITE_FAULT: + case NVME_SC_READ_ERROR: + case NVME_SC_GUARD_CHECK: + case NVME_SC_APPTAG_CHECK: + case NVME_SC_REFTAG_CHECK: + case NVME_SC_COMPARE_FAILED: + case NVME_SC_ACCESS_DENIED: + case NVME_SC_UNWRITTEN_BLOCK: + return false; + } + + /* Everything else could be a path failure, so should be retried */ + return true; +} + +void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->head->disk) + kblockd_schedule_work(&ns->head->requeue_work); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) +{ + struct nvme_ns *ns; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + if (ns->ctrl->state == NVME_CTRL_LIVE) { + rcu_assign_pointer(head->current_path, ns); + return ns; + } + } + + return NULL; +} + +inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) +{ + struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); + + if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) + ns = __nvme_find_path(head); + return ns; +} + +static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, + struct bio *bio) +{ + struct nvme_ns_head *head = q->queuedata; + struct device *dev = disk_to_dev(head->disk); + struct nvme_ns *ns; + blk_qc_t ret = BLK_QC_T_NONE; + int srcu_idx; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (likely(ns)) { + bio->bi_disk = ns->disk; + bio->bi_opf |= REQ_NVME_MPATH; + ret = direct_make_request(bio); + } else if (!list_empty_careful(&head->list)) { + dev_warn_ratelimited(dev, "no path available - requeing I/O\n"); + + spin_lock_irq(&head->requeue_lock); + bio_list_add(&head->requeue_list, bio); + spin_unlock_irq(&head->requeue_lock); + } else { + dev_warn_ratelimited(dev, "no path - failing I/O\n"); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } + + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + +static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) +{ + struct nvme_ns_head *head = q->queuedata; + 
struct nvme_ns *ns; + bool found = false; + int srcu_idx; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = srcu_dereference(head->current_path, &head->srcu); + if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) + found = ns->queue->poll_fn(q, qc); + srcu_read_unlock(&head->srcu, srcu_idx); + return found; +} + +static void nvme_requeue_work(struct work_struct *work) +{ + struct nvme_ns_head *head = + container_of(work, struct nvme_ns_head, requeue_work); + struct bio *bio, *next; + + spin_lock_irq(&head->requeue_lock); + next = bio_list_get(&head->requeue_list); + spin_unlock_irq(&head->requeue_lock); + + while ((bio = next) != NULL) { + next = bio->bi_next; + bio->bi_next = NULL; + + /* + * Reset disk to the mpath node and resubmit to select a new + * path. + */ + bio->bi_disk = head->disk; + generic_make_request(bio); + } +} + +int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) +{ + struct request_queue *q; + bool vwc = false; + + bio_list_init(&head->requeue_list); + spin_lock_init(&head->requeue_lock); + INIT_WORK(&head->requeue_work, nvme_requeue_work); + + /* + * Add a multipath node if the subsystems supports multiple controllers. + * We also do this for private namespaces as the namespace sharing data could + * change after a rescan. + */ + if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) + return 0; + + q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + if (!q) + goto out; + q->queuedata = head; + blk_queue_make_request(q, nvme_ns_head_make_request); + q->poll_fn = nvme_ns_head_poll; + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + /* set to a default value for 512 until disk is validated */ + blk_queue_logical_block_size(q, 512); + + /* we need to propagate up the VMC settings */ + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(q, vwc, vwc); + + head->disk = alloc_disk(0); + if (!head->disk) + goto out_cleanup_queue; + head->disk->fops = &nvme_ns_head_ops; + head->disk->private_data = head; + head->disk->queue = q; + head->disk->flags = GENHD_FL_EXT_DEVT; + sprintf(head->disk->disk_name, "nvme%dn%d", + ctrl->subsys->instance, head->instance); + return 0; + +out_cleanup_queue: + blk_cleanup_queue(q); +out: + return -ENOMEM; +} + +void nvme_mpath_add_disk(struct nvme_ns_head *head) +{ + if (!head->disk) + return; + device_add_disk(&head->subsys->dev, head->disk); +} + +void nvme_mpath_remove_disk(struct nvme_ns_head *head) +{ + if (!head->disk) + return; + del_gendisk(head->disk); + blk_set_queue_dying(head->disk->queue); + /* make sure all pending bios are cleaned up */ + kblockd_schedule_work(&head->requeue_work); + flush_work(&head->requeue_work); + blk_cleanup_queue(head->disk->queue); + put_disk(head->disk); +} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6e5004b009753..dd6ced664b459 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -95,6 +95,11 @@ struct nvme_request { u16 status; }; +/* + * Mark a bio as coming in through the mpath node. + */ +#define REQ_NVME_MPATH REQ_DRV + enum { NVME_REQ_CANCELLED = (1 << 0), }; @@ -235,6 +240,13 @@ struct nvme_ns_ids { * only ever has a single entry for private namespaces. 
*/ struct nvme_ns_head { +#ifdef CONFIG_NVME_MULTIPATH + struct gendisk *disk; + struct nvme_ns __rcu *current_path; + struct bio_list requeue_list; + spinlock_t requeue_lock; + struct work_struct requeue_work; +#endif struct list_head list; struct srcu_struct srcu; struct nvme_subsystem *subsys; @@ -384,6 +396,51 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); +extern const struct block_device_operations nvme_ns_head_ops; + +#ifdef CONFIG_NVME_MULTIPATH +void nvme_failover_req(struct request *req); +bool nvme_req_needs_failover(struct request *req); +void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); +int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); +void nvme_mpath_add_disk(struct nvme_ns_head *head); +void nvme_mpath_remove_disk(struct nvme_ns_head *head); + +static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + + if (head && ns == srcu_dereference(head->current_path, &head->srcu)) + rcu_assign_pointer(head->current_path, NULL); +} +struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); +#else +static inline void nvme_failover_req(struct request *req) +{ +} +static inline bool nvme_req_needs_failover(struct request *req) +{ + return false; +} +static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) +{ +} +static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, + struct nvme_ns_head *head) +{ + return 0; +} +static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) +{ +} +static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) +{ +} +static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ +} +#endif /* CONFIG_NVME_MULTIPATH */ + #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); -- cgit v1.2.3 From 5b85b826b8925b3f1910b6613eb82c4df240a5b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:51:03 +0100 Subject: nvme: also expose the namespace identification sysfs files for mpath nodes We do this by adding a helper that returns the ns_head for a device that can belong to either the per-controller or per-subsystem block device nodes, and otherwise reuse all the existing code. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 57 +++++++++++++++++++++++-------------------- drivers/nvme/host/multipath.c | 6 +++++ drivers/nvme/host/nvme.h | 1 + 3 files changed, 38 insertions(+), 26 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 59f80a613fd85..438d3fa126084 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2382,12 +2382,22 @@ static ssize_t nvme_sysfs_rescan(struct device *dev, } static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); +static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (disk->fops == &nvme_fops) + return nvme_get_ns_from_dev(dev)->head; + else + return disk->private_data; +} + static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->head->ids; - struct nvme_subsystem *subsys = ns->ctrl->subsys; + struct nvme_ns_head *head = dev_to_ns_head(dev); + struct nvme_ns_ids *ids = &head->ids; + struct nvme_subsystem *subsys = head->subsys; int serial_len = sizeof(subsys->serial); int model_len = sizeof(subsys->model); @@ -2409,23 +2419,21 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, serial_len, subsys->serial, model_len, subsys->model, - ns->head->ns_id); + head->ns_id); } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->head->ids.nguid); + return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); } static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->head->ids; + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; /* For backward compatibility expose the NGUID to userspace if * we have no UUID set @@ -2440,22 +2448,20 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); static ssize_t eui_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8ph\n", ns->head->ids.eui64); + return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%d\n", ns->head->ns_id); + return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); } static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); -static struct attribute *nvme_ns_attrs[] = { +static struct attribute *nvme_ns_id_attrs[] = { &dev_attr_wwid.attr, &dev_attr_uuid.attr, &dev_attr_nguid.attr, @@ -2464,12 +2470,11 @@ static struct attribute *nvme_ns_attrs[] = { NULL, }; -static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, +static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); - 
struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->head->ids; + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; if (a == &dev_attr_uuid.attr) { if (uuid_is_null(&ids->uuid) || @@ -2487,9 +2492,9 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, return a->mode; } -static const struct attribute_group nvme_ns_attr_group = { - .attrs = nvme_ns_attrs, - .is_visible = nvme_ns_attrs_are_visible, +const struct attribute_group nvme_ns_id_attr_group = { - .attrs = nvme_ns_id_attrs, + .attrs = nvme_ns_id_attrs, + .is_visible = nvme_ns_id_attrs_are_visible, }; #define nvme_show_str_function(field) \ @@ -2882,7 +2887,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) device_add_disk(ctrl->device, ns->disk); if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, - &nvme_ns_attr_group)) + &nvme_ns_id_attr_group)) pr_warn("%s: failed to create sysfs group for identification\n", ns->disk->disk_name); if (ns->ndev && nvme_nvm_register_sysfs(ns)) @@ -2915,7 +2920,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, - &nvme_ns_attr_group); + &nvme_ns_id_attr_group); if (ns->ndev) nvme_nvm_unregister_sysfs(ns); del_gendisk(ns->disk); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 062754ebebfd6..850275896e498 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -239,12 +239,18 @@ void nvme_mpath_add_disk(struct nvme_ns_head *head) if (!head->disk) return; device_add_disk(&head->subsys->dev, head->disk); + if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group)) + pr_warn("%s: failed to create sysfs group for identification\n", + head->disk->disk_name); } void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; + sysfs_remove_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group); del_gendisk(head->disk); blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index dd6ced664b459..895a4330dda00 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -396,6 +396,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); +extern const struct attribute_group nvme_ns_id_attr_group; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH -- cgit v1.2.3 From 17eac0996341cfd28b4093554e662fd82aa1c237 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 9 Nov 2017 17:57:06 +0100 Subject: block: create 'slaves' and 'holders' entries for hidden gendisks When creating nvme multipath devices we should populate the 'slaves' and 'holders' directories properly to aid userspace topology detection. Signed-off-by: Hannes Reinecke [hch: split from a larger patch] Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 835e907e6e015..3de1671631bfd 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,14 +585,14 @@ static void register_disk(struct device *parent, struct gendisk *disk) */ pm_runtime_set_memalloc_noio(ddev, true); + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + if (disk->flags & GENHD_FL_HIDDEN) { dev_set_uevent_suppress(ddev, 0); return; } - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - /* No minors to use for partitions */ if (!disk_part_scan_enabled(disk)) goto exit; @@ -728,11 +728,11 @@ void del_gendisk(struct gendisk *disk) WARN_ON(1); } - if (!(disk->flags & GENHD_FL_HIDDEN)) { + if (!(disk->flags & GENHD_FL_HIDDEN)) blk_unregister_region(disk_devt(disk), disk->minors); - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); - } + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; -- cgit v1.2.3 From e9a48034d7d1318ece7d4a235838a86c94db9d68 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 9 Nov 2017 17:57:06 +0100 Subject: nvme: create 'slaves' and 'holders' entries for hidden controllers When creating nvme multipath devices we should populate the 'slaves' and 'holders' directories properly to aid userspace topology detection. Signed-off-by: Hannes Reinecke [hch: split from a larger patch, compile fix for CONFIG_NVME_MULTIPATH=n] Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 ++ drivers/nvme/host/multipath.c | 30 ++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 8 ++++++++ 3 files changed, 40 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 438d3fa126084..c3182aa654fd6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2896,6 +2896,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (new) nvme_mpath_add_disk(ns->head); + nvme_mpath_add_disk_links(ns); return; out_unlink_ns: mutex_lock(&ctrl->subsys->lock); @@ -2919,6 +2920,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (ns->disk && ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); + nvme_mpath_remove_disk_links(ns); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_id_attr_group); if (ns->ndev) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 850275896e498..78d92151a9042 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -245,6 +245,25 @@ void nvme_mpath_add_disk(struct nvme_ns_head *head) head->disk->disk_name); } +void nvme_mpath_add_disk_links(struct nvme_ns *ns) +{ + struct kobject *slave_disk_kobj, *holder_disk_kobj; + + if (!ns->head->disk) + return; + + slave_disk_kobj = &disk_to_dev(ns->disk)->kobj; + if (sysfs_create_link(ns->head->disk->slave_dir, slave_disk_kobj, + kobject_name(slave_disk_kobj))) + return; + + holder_disk_kobj = &disk_to_dev(ns->head->disk)->kobj; + if (sysfs_create_link(ns->disk->part0.holder_dir, holder_disk_kobj, + kobject_name(holder_disk_kobj))) + sysfs_remove_link(ns->head->disk->slave_dir, + kobject_name(slave_disk_kobj)); +} + void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) @@ -259,3 +278,14 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) blk_cleanup_queue(head->disk->queue); put_disk(head->disk); } + +void nvme_mpath_remove_disk_links(struct nvme_ns *ns) +{ + if (!ns->head->disk) + return; + + sysfs_remove_link(ns->disk->part0.holder_dir, + kobject_name(&disk_to_dev(ns->head->disk)->kobj)); + sysfs_remove_link(ns->head->disk->slave_dir, + kobject_name(&disk_to_dev(ns->disk)->kobj)); +} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 895a4330dda00..c0873a68872fb 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -405,7 +405,9 @@ bool nvme_req_needs_failover(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns_head *head); +void nvme_mpath_add_disk_links(struct nvme_ns *ns); void nvme_mpath_remove_disk(struct nvme_ns_head *head); +void nvme_mpath_remove_disk_links(struct nvme_ns *ns); static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { @@ -437,6 +439,12 @@ static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) { } +static inline void nvme_mpath_add_disk_links(struct nvme_ns *ns) +{ +} +static inline void nvme_mpath_remove_disk_links(struct nvme_ns *ns) +{ +} static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { } -- cgit v1.2.3 From 1e496938b6aec67b7cfd06614164fee0fe9f13e3 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 10 Nov 2017 10:58:23 +0100 Subject: nvme: expose subsys attribute to sysfs We should be exposing the 
subsystem attributes like 'model' and 'subsysnqn' to sysfs to allow for easier identification of the subsystem. Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c3182aa654fd6..7b3bbc1a9ac49 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1999,6 +1999,53 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) return NULL; } +#define SUBSYS_ATTR_RO(_name, _mode, _show) \ + struct device_attribute subsys_attr_##_name = \ + __ATTR(_name, _mode, _show, NULL) + +static ssize_t nvme_subsys_show_nqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn); +} +static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); + +#define nvme_subsys_show_str_function(field) \ +static ssize_t subsys_##field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_subsystem *subsys = \ + container_of(dev, struct nvme_subsystem, dev); \ + return sprintf(buf, "%.*s\n", \ + (int)sizeof(subsys->field), subsys->field); \ +} \ +static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); + +nvme_subsys_show_str_function(model); +nvme_subsys_show_str_function(serial); +nvme_subsys_show_str_function(firmware_rev); + +static struct attribute *nvme_subsys_attrs[] = { + &subsys_attr_model.attr, + &subsys_attr_serial.attr, + &subsys_attr_firmware_rev.attr, + &subsys_attr_subsysnqn.attr, + NULL, +}; + +static struct attribute_group nvme_subsys_attrs_group = { + .attrs = nvme_subsys_attrs, +}; + +static const struct attribute_group *nvme_subsys_attrs_groups[] = { + &nvme_subsys_attrs_group, + NULL, +}; + static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { struct nvme_subsystem *subsys, *found; @@ -2026,6 +2073,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys->dev.class = nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; + subsys->dev.groups = nvme_subsys_attrs_groups; dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); device_initialize(&subsys->dev); -- cgit v1.2.3 From 0e78eccc552b6a9e9ce158aee72e95ae41e0e171 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Nov 2017 09:17:50 +0100 Subject: xtensa/simdisk: fix compile error Fixes: d004a5e7d4dd ("block: remove __bio_kmap_atomic") Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- arch/xtensa/platforms/iss/simdisk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index eacf1e433518b..1b6418407467d 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -116,7 +116,7 @@ static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio) simdisk_transfer(dev, sector, len, buffer, bio_data_dir(bio) == WRITE); sector += len; - kunmap_atomic(buffer) + kunmap_atomic(buffer); } bio_endio(bio); -- cgit v1.2.3 From 67f2519fe2903c4041c0e94394d14d372fe51399 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Tue, 24 Oct 2017 11:21:48 -0600 Subject: fs: guard_bio_eod() needs to consider partitions guard_bio_eod() needs to look at the partition capacity, not 
just the capacity of the whole device, when determining if truncation is necessary. [ 60.268688] attempt to access beyond end of device [ 60.268690] unknown-block(9,1): rw=0, want=67103509, limit=67103506 [ 60.268693] buffer_io_error: 2 callbacks suppressed [ 60.268696] Buffer I/O error on dev md1p7, logical block 4524305, async page read Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index") Cc: stable@vger.kernel.org # v4.13 Reviewed-by: Christoph Hellwig Signed-off-by: Greg Edwards Signed-off-by: Jens Axboe --- fs/buffer.c | 10 +++++++++- include/linux/genhd.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index bff571dc7bc3b..bcabb69e7462e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3015,8 +3015,16 @@ void guard_bio_eod(int op, struct bio *bio) sector_t maxsector; struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned truncated_bytes; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); - maxsector = get_capacity(bio->bi_disk); if (!maxsector) return; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 93aae3476f58b..ca10cc292187d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -243,6 +243,7 @@ static inline dev_t part_devt(struct hd_struct *part) return part_to_dev(part)->devt; } +extern struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); static inline void disk_put_part(struct hd_struct *part) -- cgit v1.2.3 From f0fba398fec65ff5a2e1bf8ae62718ec0450abaf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 6 Nov 2017 17:53:53 +0000 Subject: block: avoid null pointer dereference on null disk It is possible that the pointer disk can be null and hence we can get a null pointer deference when accessing disk->flags. Add a null pointer check to avoid the dereference. Detected by CoverityScan, CID#1461133 ("Explicit null dereferenced") Fixes: 8ddcd653257c ("block: introduce GENHD_FL_HIDDEN") Reviewed-by: Christoph Hellwig Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 3de1671631bfd..997e598f3b86e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -798,7 +798,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } - if (unlikely(disk->flags & GENHD_FL_HIDDEN)) { + if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { put_disk(disk); disk = NULL; } -- cgit v1.2.3 From 79f720a751cad613620d0237e3b44f89f4a69181 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Nov 2017 09:13:21 -0700 Subject: blk-mq: only run the hardware queue if IO is pending Currently we are inconsistent in when we decide to run the queue. Using blk_mq_run_hw_queues() we check if the hctx has pending IO before running it, but we don't do that from the individual queue run function, blk_mq_run_hw_queue(). This results in a lot of extra and pointless queue runs, potentially, on flush requests and (much worse) on tag starvation situations. This is observable just looking at top output, with lots of kworkers active. For the !async runs, it just adds to the CPU overhead of blk-mq. Move the has-pending check into the run function instead of having callers do it. 
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 7 +------ block/blk-mq.c | 18 +++++++++++------- block/blk-mq.h | 2 -- include/linux/blk-mq.h | 2 +- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 6f4bdb8209f7d..c117bd8fd1f61 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -81,12 +81,7 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) } else clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) { - blk_mq_run_hw_queue(hctx, true); - return true; - } - - return false; + return blk_mq_run_hw_queue(hctx, true); } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index bfe24a5b62a36..a2a4271f5ab87 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -61,10 +61,10 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) /* * Check if any of the ctx's have pending work in this hardware queue */ -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return sbitmap_any_bit_set(&hctx->ctx_map) || - !list_empty_careful(&hctx->dispatch) || + return !list_empty_careful(&hctx->dispatch) || + sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } @@ -1253,9 +1253,14 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - __blk_mq_delay_run_hw_queue(hctx, async, 0); + if (blk_mq_hctx_has_pending(hctx)) { + __blk_mq_delay_run_hw_queue(hctx, async, 0); + return true; + } + + return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); @@ -1265,8 +1270,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (!blk_mq_hctx_has_pending(hctx) || - blk_mq_hctx_stopped(hctx)) + if (blk_mq_hctx_stopped(hctx)) continue; blk_mq_run_hw_queue(hctx, async); diff --git a/block/blk-mq.h b/block/blk-mq.h index 99a19c5523e2b..dcf379a892dda 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -26,14 +26,12 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b326208277ee4..eb1e2cdffb317 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -266,7 +266,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
--
cgit v1.2.3


From 15f7b41f70ddcca3b555bd0fdc7c8da7466b517e Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Fri, 10 Nov 2017 12:29:34 -0500
Subject: brd: remove unused brd_mutex

Remove the unused mutex brd_mutex. It has been unused since commit
ff26956875c2 ("brd: remove support for BLKFLSBUF").

Signed-off-by: Mikulas Patocka
Signed-off-by: Jens Axboe
---
 drivers/block/brd.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 2d7178f7754ed..c1cf87718c2e0 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -60,7 +60,6 @@ struct brd_device {
 /*
  * Look up and return a brd's page for a given sector.
 */
-static DEFINE_MUTEX(brd_mutex);
 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
 {
 	pgoff_t idx;
--
cgit v1.2.3


From f906a6a0f42684715b552ccff9282b22cfe2b5a2 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 9 Nov 2017 16:10:13 -0700
Subject: blk-mq: improve tag waiting setup for non-shared tags

If we run out of driver tags, we currently treat shared and non-shared
tags the same - both cases hook into the tag waitqueue. This is more
costly than it needs to be for unshared tags, since we have to grab both
the hctx lock and the waitqueue lock (and disable interrupts). For the
non-shared case, we can simply mark the queue as needing a restart.

Split blk_mq_dispatch_wait_add() to account for both cases, and rename it
to blk_mq_mark_tag_wait() to better reflect what it does now.

Without this patch, shared and non-shared performance is about the same
with 4 fio threads hammering on a single null_blk device (~410K at 75%
sys). With the patch, the shared case is the same, but the non-shared
tags case runs at 431K at 71% sys.

Reviewed-by: Omar Sandoval
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 81 +++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 55 insertions(+), 26 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a2a4271f5ab87..3295859f419dd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1006,44 +1006,68 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 	return 1;
 }
 
-static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx,
-				     struct request *rq)
+/*
+ * Mark us waiting for a tag. For shared tags, this involves hooking us into
+ * the tag wakeups. For non-shared tags, we can simply mark us nedeing a
+ * restart. For both caes, take care to check the condition again after
+ * marking us as waiting.
+ */
+static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
+				 struct request *rq)
 {
 	struct blk_mq_hw_ctx *this_hctx = *hctx;
-	wait_queue_entry_t *wait = &this_hctx->dispatch_wait;
+	bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
 	struct sbq_wait_state *ws;
+	wait_queue_entry_t *wait;
+	bool ret;
 
-	if (!list_empty_careful(&wait->entry))
-		return false;
+	if (!shared_tags) {
+		if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
+			set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
+	} else {
+		wait = &this_hctx->dispatch_wait;
+		if (!list_empty_careful(&wait->entry))
+			return false;
 
-	spin_lock(&this_hctx->lock);
-	if (!list_empty(&wait->entry)) {
-		spin_unlock(&this_hctx->lock);
-		return false;
-	}
+		spin_lock(&this_hctx->lock);
+		if (!list_empty(&wait->entry)) {
+			spin_unlock(&this_hctx->lock);
+			return false;
+		}
 
-	ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
-	add_wait_queue(&ws->wait, wait);
+		ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
+		add_wait_queue(&ws->wait, wait);
+	}
 
 	/*
 	 * It's possible that a tag was freed in the window between the
 	 * allocation failure and adding the hardware queue to the wait
 	 * queue.
 	 */
-	if (!blk_mq_get_driver_tag(rq, hctx, false)) {
+	ret = blk_mq_get_driver_tag(rq, hctx, false);
+
+	if (!shared_tags) {
+		/*
+		 * Don't clear RESTART here, someone else could have set it.
+		 * At most this will cost an extra queue run.
+		 */
+		return ret;
+	} else {
+		if (!ret) {
+			spin_unlock(&this_hctx->lock);
+			return false;
+		}
+
+		/*
+		 * We got a tag, remove ourselves from the wait queue to ensure
+		 * someone else gets the wakeup.
+		 */
+		spin_lock_irq(&ws->wait.lock);
+		list_del_init(&wait->entry);
+		spin_unlock_irq(&ws->wait.lock);
 		spin_unlock(&this_hctx->lock);
-		return false;
+		return true;
 	}
-
-	/*
-	 * We got a tag, remove ourselves from the wait queue to ensure
-	 * someone else gets the wakeup.
-	 */
-	spin_lock_irq(&ws->wait.lock);
-	list_del_init(&wait->entry);
-	spin_unlock_irq(&ws->wait.lock);
-	spin_unlock(&this_hctx->lock);
-	return true;
 }
 
 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
@@ -1076,10 +1100,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			 * before we add this entry back on the dispatch list,
 			 * we'll re-run it below.
 			 */
-			if (!blk_mq_dispatch_wait_add(&hctx, rq)) {
+			if (!blk_mq_mark_tag_wait(&hctx, rq)) {
 				if (got_budget)
 					blk_mq_put_dispatch_budget(hctx);
-				no_tag = true;
+				/*
+				 * For non-shared tags, the RESTART check
+				 * will suffice.
+				 */
+				if (hctx->flags & BLK_MQ_F_TAG_SHARED)
+					no_tag = true;
 				break;
 			}
 		}
--
cgit v1.2.3


From 8dc7a31fbce5e2dbbacd83d910da37105181b054 Mon Sep 17 00:00:00 2001
From: Hongxu Jia
Date: Fri, 10 Nov 2017 15:59:17 +0800
Subject: ide: ide-atapi: fix compile error with defining macro DEBUG

Compiling ide-atapi with the macro "DEBUG" defined fails:

...
|drivers/ide/ide-atapi.c:285:52: error: 'struct request' has no member named 'cmd'; did you mean 'csd'?
|  debug_log("%s: rq->cmd[0]: 0x%x\n", __func__, rq->cmd[0]);
...
Since we split scsi_request out of struct request, the debug_log()
statements missed the corresponding conversion.

Fixes: 82ed4db499b8 ("block: split scsi_request out of struct request")
Signed-off-by: Hongxu Jia
Signed-off-by: Jens Axboe
---
 drivers/ide/ide-atapi.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 14d1e7d9a1d6f..0e6bc631a1caf 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -282,7 +282,7 @@ int ide_cd_expiry(ide_drive_t *drive)
 	struct request *rq = drive->hwif->rq;
 	unsigned long wait = 0;
 
-	debug_log("%s: rq->cmd[0]: 0x%x\n", __func__, rq->cmd[0]);
+	debug_log("%s: scsi_req(rq)->cmd[0]: 0x%x\n", __func__, scsi_req(rq)->cmd[0]);
 
 	/*
 	 * Some commands are *slow* and normally take a long time to complete.
@@ -463,7 +463,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			return ide_do_reset(drive);
 		}
 
-		debug_log("[cmd %x]: check condition\n", rq->cmd[0]);
+		debug_log("[cmd %x]: check condition\n", scsi_req(rq)->cmd[0]);
 
 		/* Retry operation */
 		ide_retry_pc(drive);
@@ -531,7 +531,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		ide_pad_transfer(drive, write, bcount);
 
 	debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n",
-		  rq->cmd[0], done, bcount, scsi_req(rq)->resid_len);
+		  scsi_req(rq)->cmd[0], done, bcount, scsi_req(rq)->resid_len);
 
 	/* And set the interrupt handler again */
 	ide_set_handler(drive, ide_pc_intr, timeout);
--
cgit v1.2.3


From ff821d271415f17f1a704600420e92ebb9bfb32c Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 10 Nov 2017 22:05:12 -0700
Subject: blk-mq: fixup some comment typos and lengths

Various typos and/or spelling errors in comments. Fixes a few
> 80 char lines as well.

Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3295859f419dd..b600463791ec8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -707,7 +707,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 
 	/*
 	 * We abuse this flag that is otherwise used by the I/O scheduler to
-	 * request head insertation from the workqueue.
+	 * request head insertion from the workqueue.
 	 */
 	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
 
@@ -1137,7 +1137,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		if (ret == BLK_STS_RESOURCE) {
 			/*
 			 * If an I/O scheduler has been configured and we got a
-			 * driver tag for the next request already, free it again.
+			 * driver tag for the next request already, free it
+			 * again.
 			 */
 			if (!list_empty(list)) {
 				nxt = list_first_entry(list, struct request, queuelist);
@@ -2299,8 +2300,11 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 
 	mutex_lock(&set->tag_list_lock);
 
-	/* Check to see if we're transitioning to shared (from 1 to 2 queues). */
-	if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
+	/*
+	 * Check to see if we're transitioning to shared (from 1 to 2 queues).
+	 */
+	if (!list_empty(&set->tag_list) &&
+	    !(set->flags & BLK_MQ_F_TAG_SHARED)) {
 		set->flags |= BLK_MQ_F_TAG_SHARED;
 
 		/* update existing queue */
 		blk_mq_update_tag_set_depth(set, true);
@@ -2532,10 +2536,9 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 
 	/*
 	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
-	 * we should change hctx numa_node according to new topology (this
-	 * involves free and re-allocate memory, worthy doing?)
+	 * we should change hctx numa_node according to the new topology (this
+	 * involves freeing and re-allocating memory, worth doing?)
 	 */
-
 	blk_mq_map_swqueue(q);
 
 	blk_mq_sysfs_register(q);
--
cgit v1.2.3


From a04b5de5050ab8b891128eb2c47a0916fe8622e1 Mon Sep 17 00:00:00 2001
From: Martin Wilck
Date: Thu, 28 Sep 2017 21:33:23 +0200
Subject: nvme: fix visibility of "uuid" ns attribute

"uuid" must be invisible if both ns->uuid and ns->nguid are unset, not
if either one is.

Fixes: d934f9848a77 "nvme: provide UUID value to userspace"
Signed-off-by: Martin Wilck
[hch: rebased to the nvme-4.15 tree to help resolving a conflict]
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7b3bbc1a9ac49..993813ccdc0b6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2525,7 +2525,7 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
 	struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
 
 	if (a == &dev_attr_uuid.attr) {
-		if (uuid_is_null(&ids->uuid) ||
+		if (uuid_is_null(&ids->uuid) &&
 		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
 			return 0;
 	}
--
cgit v1.2.3
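
To restate the logic of the one-character fix above: the "uuid" attribute
should be hidden only when neither identifier is present. A minimal sketch
of the corrected check (the helper name here is illustrative only, not the
in-tree code):

	/* Hide the "uuid" attribute only if both the UUID and the NGUID are unset. */
	static bool nvme_ns_uuid_attr_hidden(struct nvme_ns_ids *ids)
	{
		return uuid_is_null(&ids->uuid) &&
		       !memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
	}

With the previous '||', a namespace that had only one of the two identifiers
set would already hide the attribute, which is exactly the case the fix
addresses.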