From 05e3db95ebfc5c06a29a1d8c7a3e02f46f3a25a7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:04 -0700 Subject: kthread: add a mechanism to store cgroup info kthread usually runs jobs on behalf of other threads. The jobs should be charged to cgroup of original threads. But the jobs run in a kthread, where we lose the cgroup context of original threads. The patch adds a machanism to record cgroup info of original threads in kthread context. Later we can retrieve the cgroup info and attach the cgroup info to jobs. Since this mechanism is only required by kthread, we store the cgroup info in kthread data instead of generic task_struct. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/kthread.h | 11 +++++++++ kernel/kthread.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 82e197eeac91f..bd4369c83dfba 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -3,6 +3,7 @@ /* Simple interface for creating and stopping kernel threads without mess. */ #include #include +#include __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), @@ -198,4 +199,14 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +#ifdef CONFIG_CGROUPS +void kthread_associate_blkcg(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *kthread_blkcg(void); +#else +static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } +static inline struct cgroup_subsys_state *kthread_blkcg(void) +{ + return NULL; +} +#endif #endif /* _LINUX_KTHREAD_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c19edf824272..b011ea08967fc 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,7 +20,6 @@ #include #include #include -#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -47,6 +46,9 @@ struct kthread { void *data; struct completion parked; struct completion exited; +#ifdef CONFIG_CGROUPS + struct cgroup_subsys_state *blkcg_css; +#endif }; enum KTHREAD_BITS { @@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k) void free_kthread_struct(struct task_struct *k) { + struct kthread *kthread; + /* * Can be NULL if this kthread was created by kernel_thread() * or if kmalloc() in kthread() failed. */ - kfree(to_kthread(k)); + kthread = to_kthread(k); +#ifdef CONFIG_CGROUPS + WARN_ON_ONCE(kthread && kthread->blkcg_css); +#endif + kfree(kthread); } /** @@ -216,6 +224,9 @@ static int kthread(void *_create) self->data = data; init_completion(&self->exited); init_completion(&self->parked); +#ifdef CONFIG_CGROUPS + self->blkcg_css = NULL; +#endif current->vfork_done = &self->exited; /* OK, tell user we're spawned, wait for stop or wakeup */ @@ -1154,3 +1165,54 @@ void kthread_destroy_worker(struct kthread_worker *worker) kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); + +#ifdef CONFIG_CGROUPS +/** + * kthread_associate_blkcg - associate blkcg to current kthread + * @css: the cgroup info + * + * Current thread must be a kthread. The thread is running jobs on behalf of + * other threads. In some cases, we expect the jobs attach cgroup info of + * original threads instead of that of current thread. This function stores + * original thread's cgroup info in current kthread context for later + * retrieval. 
+ */ +void kthread_associate_blkcg(struct cgroup_subsys_state *css) +{ + struct kthread *kthread; + + if (!(current->flags & PF_KTHREAD)) + return; + kthread = to_kthread(current); + if (!kthread) + return; + + if (kthread->blkcg_css) { + css_put(kthread->blkcg_css); + kthread->blkcg_css = NULL; + } + if (css) { + css_get(css); + kthread->blkcg_css = css; + } +} +EXPORT_SYMBOL(kthread_associate_blkcg); + +/** + * kthread_blkcg - get associated blkcg css of current kthread + * + * Current thread must be a kthread. + */ +struct cgroup_subsys_state *kthread_blkcg(void) +{ + struct kthread *kthread; + + if (current->flags & PF_KTHREAD) { + kthread = to_kthread(current); + if (kthread) + return kthread->blkcg_css; + } + return NULL; +} +EXPORT_SYMBOL(kthread_blkcg); +#endif -- cgit v1.2.3 From af551fb3be26a22b7a6b345b3b7e7e6acfc41758 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:05 -0700 Subject: blkcg: delete unused APIs Nobody uses the APIs right now. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/bio.c | 31 ------------------------------- include/linux/bio.h | 2 -- include/linux/blk-cgroup.h | 12 ------------ 3 files changed, 45 deletions(-) diff --git a/block/bio.c b/block/bio.c index b38e962fa83e7..8338304ea2563 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2032,37 +2032,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) } EXPORT_SYMBOL_GPL(bio_associate_blkcg); -/** - * bio_associate_current - associate a bio with %current - * @bio: target bio - * - * Associate @bio with %current if it hasn't been associated yet. Block - * layer will treat @bio as if it were issued by %current no matter which - * task actually issues it. - * - * This function takes an extra reference of @task's io_context and blkcg - * which will be put when @bio is released. The caller must own @bio, - * ensure %current->io_context exists, and is responsible for synchronizing - * calls to this function. 
- */ -int bio_associate_current(struct bio *bio) -{ - struct io_context *ioc; - - if (bio->bi_css) - return -EBUSY; - - ioc = current->io_context; - if (!ioc) - return -ENOENT; - - get_io_context_active(ioc); - bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, io_cgrp_id); - return 0; -} -EXPORT_SYMBOL_GPL(bio_associate_current); - /** * bio_disassociate_task - undo bio_associate_current() * @bio: target bio diff --git a/include/linux/bio.h b/include/linux/bio.h index 275c91c995163..9c75f58f6a501 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -522,13 +522,11 @@ do { \ #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); -int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } -static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } static inline void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9d92153dd856f..0cfa8d2ef4d63 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -235,12 +235,6 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return task_blkcg(current); } -static inline struct cgroup_subsys_state * -task_get_blkcg_css(struct task_struct *task) -{ - return task_get_css(task, io_cgrp_id); -} - /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -735,12 +729,6 @@ struct blkcg_policy { #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) -static inline struct cgroup_subsys_state * -task_get_blkcg_css(struct task_struct *task) -{ - return NULL; -} - #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -- cgit v1.2.3 From 902ec5b6de0678ac2effba0faaafdd1c3e7fcf2e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:06 -0700 Subject: block: make blkcg aware of kthread stored original cgroup info bio_blkcg is the only API to get cgroup info for a bio right now. If bio_blkcg finds current task is a kthread and has original blkcg associated, it will use the css instead of associating the bio to current task. This makes it possible that kthread dispatches bios on behalf of other threads. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 0cfa8d2ef4d63..f57e54d645297 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -19,6 +19,7 @@ #include #include #include +#include /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) @@ -223,16 +224,16 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) return css ? 
container_of(css, struct blkcg, css) : NULL; } -static inline struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return css_to_blkcg(task_css(tsk, io_cgrp_id)); -} - static inline struct blkcg *bio_blkcg(struct bio *bio) { + struct cgroup_subsys_state *css; + if (bio && bio->bi_css) return css_to_blkcg(bio->bi_css); - return task_blkcg(current); + css = kthread_blkcg(); + if (css) + return css_to_blkcg(css); + return css_to_blkcg(task_css(current, io_cgrp_id)); } /** -- cgit v1.2.3 From d4478e92d6186ce37947a36994de407c27446266 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 25 Sep 2017 13:07:22 -0600 Subject: block/loop: make loop cgroup aware loop block device handles IO in a separate thread. The actual IO dispatched isn't cloned from the IO loop device received, so the dispatched IO loses the cgroup context. I'm ignoring buffer IO case now, which is quite complicated. Making the loop thread aware cgroup context doesn't really help. The loop device only writes to a single file. In current writeback cgroup implementation, the file can only belong to one cgroup. For direct IO case, we could workaround the issue in theory. For example, say we assign cgroup1 5M/s BW for loop device and cgroup2 10M/s. We can create a special cgroup for loop thread and assign at least 15M/s for the underlayer disk. In this way, we correctly throttle the two cgroups. But this is tricky to setup. This patch tries to address the issue. We record bio's css in loop command. When loop thread is handling the command, we then use the API provided in patch 1 to set the css for current task. The bio layer will use the css for new IO (from patch 3). Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 13 +++++++++++++ drivers/block/loop.h | 1 + 2 files changed, 14 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 85de673346955..fd4eff5f5b76a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -476,6 +476,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); + if (cmd->css) + css_put(cmd->css); cmd->ret = ret; lo_rw_aio_do_completion(cmd); } @@ -535,6 +537,8 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_filp = file; cmd->iocb.ki_complete = lo_rw_aio_complete; cmd->iocb.ki_flags = IOCB_DIRECT; + if (cmd->css) + kthread_associate_blkcg(cmd->css); if (rw == WRITE) ret = call_write_iter(file, &cmd->iocb, &iter); @@ -542,6 +546,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, ret = call_read_iter(file, &cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); + kthread_associate_blkcg(NULL); if (ret != -EIOCBQUEUED) cmd->iocb.ki_complete(&cmd->iocb, ret, 0); @@ -1686,6 +1691,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, break; } + /* always use the first bio's css */ +#ifdef CONFIG_CGROUPS + if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) { + cmd->css = cmd->rq->bio->bi_css; + css_get(cmd->css); + } else +#endif + cmd->css = NULL; kthread_queue_work(&lo->worker, &cmd->work); return BLK_STS_OK; diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 1f39567029938..0f45416e4fcfe 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -72,6 +72,7 @@ struct loop_cmd { long ret; struct kiocb iocb; struct bio_vec *bvec; + struct cgroup_subsys_state *css; }; /* Support for loadable transfer modules */ -- cgit v1.2.3 From 
9979d545c93653770984806e9bfcec1df53a9d3d Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Mon, 25 Sep 2017 19:10:07 +0200 Subject: block: cryptoloop - Fix build warning This patch fix the following build warning: drivers/block/cryptoloop.c:46:8: warning: variable 'cipher' set but not used [-Wunused-but-set-variable] Signed-off-by: Corentin Labbe Signed-off-by: Jens Axboe --- drivers/block/cryptoloop.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c index 74e03aa537ad7..7033a4beda669 100644 --- a/drivers/block/cryptoloop.c +++ b/drivers/block/cryptoloop.c @@ -43,7 +43,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info) int cipher_len; int mode_len; char cms[LO_NAME_SIZE]; /* cipher-mode string */ - char *cipher; char *mode; char *cmsp = cms; /* c-m string pointer */ struct crypto_skcipher *tfm; @@ -56,7 +55,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info) strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE); cms[LO_NAME_SIZE - 1] = 0; - cipher = cmsp; cipher_len = strcspn(cmsp, "-"); mode = cmsp + cipher_len; -- cgit v1.2.3 From 0b508bc926bdced678febee2a2b8cdba0a19e481 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Sep 2017 11:02:12 -0700 Subject: block: fix a build error The code is only for blkcg not for all cgroups Fixes: d4478e92d618 ("block/loop: make loop cgroup aware") Reported-by: kbuild test robot Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- include/linux/kthread.h | 2 +- kernel/kthread.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fd4eff5f5b76a..bc8e61506968a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1692,7 +1692,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, } /* always use the first bio's css */ -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) { cmd->css = cmd->rq->bio->bi_css; css_get(cmd->css); diff --git a/include/linux/kthread.h b/include/linux/kthread.h index bd4369c83dfba..fb201842c635d 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -199,7 +199,7 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); #else diff --git a/kernel/kthread.c b/kernel/kthread.c index b011ea08967fc..f87cd8b4eb2a1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -46,7 +46,7 @@ struct kthread { void *data; struct completion parked; struct completion exited; -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif }; @@ -83,7 +83,7 @@ void free_kthread_struct(struct task_struct *k) * or if kmalloc() in kthread() failed. 
*/ kthread = to_kthread(k); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread && kthread->blkcg_css); #endif kfree(kthread); @@ -224,7 +224,7 @@ static int kthread(void *_create) self->data = data; init_completion(&self->exited); init_completion(&self->parked); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP self->blkcg_css = NULL; #endif current->vfork_done = &self->exited; @@ -1166,7 +1166,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info -- cgit v1.2.3 From b3cffc3877a6c7edf17fd9c476f3e8d930bccec1 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 30 Sep 2017 09:49:21 +0800 Subject: null_blk: add "no_sched" module parameter add an option that disable io scheduler for null block device. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 8042c26ea9e6e..bf2c8ca3242aa 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -154,6 +154,10 @@ enum { NULL_Q_MQ = 2, }; +static int g_no_sched; +module_param_named(no_sched, g_no_sched, int, S_IRUGO); +MODULE_PARM_DESC(no_sched, "No io scheduler"); + static int g_submit_queues = 1; module_param_named(submit_queues, g_submit_queues, int, S_IRUGO); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); @@ -1754,6 +1758,8 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) set->numa_node = nullb ? nullb->dev->home_node : g_home_node; set->cmd_size = sizeof(struct nullb_cmd); set->flags = BLK_MQ_F_SHOULD_MERGE; + if (g_no_sched) + set->flags |= BLK_MQ_F_NO_SCHED; set->driver_data = NULL; if ((nullb && nullb->dev->blocking) || g_blocking) -- cgit v1.2.3 From 547248736ae54b73e713ee259a81ab3a637cb2f7 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 30 Sep 2017 15:01:48 +0800 Subject: blk-mq: remove unused function hctx_allow_merges since 9bddeb2a5b981 "blk-mq: make per-sw-queue bio merge as default .bio_merge" there is no caller for this function. Reviewed-by: Ming Lei Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 98a18609755e9..59687ed6561b9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1504,12 +1504,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, true); } -static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) -{ - return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !blk_queue_nomerges(hctx->queue); -} - static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) -- cgit v1.2.3 From 5385fa47d883dc7d51de622835427bc2558ed43c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 1 Oct 2017 01:26:21 -0600 Subject: blk-mq-tag: kill unused tag enums We don't have any notion of a tagging cache anymore, and haven't for a long time. Kill off the unused enums. 
Signed-off-by: Jens Axboe --- block/blk-mq-tag.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 5cb51e53cc035..5932a7ac7fc4e 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -43,14 +43,9 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, return sbq_wait_ptr(bt, &hctx->wait_index); } -enum { - BLK_MQ_TAG_CACHE_MIN = 1, - BLK_MQ_TAG_CACHE_MAX = 64, -}; - enum { BLK_MQ_TAG_FAIL = -1U, - BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN, + BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, }; -- cgit v1.2.3 From 7beb2f845b715cb98584cf630e9a9d5b05501166 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 02:08:24 -0600 Subject: blk-mq: wire up completion notifier for laptop mode For some reason, the laptop mode IO completion notified was never wired up for blk-mq. Ensure that we trigger the callback appropriately, to arm the laptop mode flush timer. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 59687ed6561b9..7f01d69879d66 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -476,6 +476,9 @@ void blk_mq_free_request(struct request *rq) if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) + laptop_io_completion(q->backing_dev_info); + wbt_done(q->rq_wb, &rq->issue_stat); clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); -- cgit v1.2.3 From 640ab98fb3629c0f8417b9b2532eca596495f3bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 05:40:16 -0600 Subject: buffer: have alloc_page_buffers() use __GFP_NOFAIL Instead of adding weird retry logic in that function, utilize __GFP_NOFAIL to ensure that the vm takes care of handling any potential retries appropriately. This means we don't have to call free_more_memory() from here. Reviewed-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/md/bitmap.c | 2 +- fs/buffer.c | 33 ++++++++++----------------------- fs/ntfs/aops.c | 2 +- fs/ntfs/mft.c | 2 +- include/linux/buffer_head.h | 2 +- 5 files changed, 14 insertions(+), 27 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index d2121637b4abc..4d8ed74efadf8 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -368,7 +368,7 @@ static int read_page(struct file *file, unsigned long index, pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); - bh = alloc_page_buffers(page, 1<i_blkbits, 0); + bh = alloc_page_buffers(page, 1<i_blkbits, false); if (!bh) { ret = -ENOMEM; goto out; diff --git a/fs/buffer.c b/fs/buffer.c index 170df856bdb99..1234ae343aef0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -861,16 +861,19 @@ int remove_inode_buffers(struct inode *inode) * which may not fail from ordinary buffer allocations. */ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry) + bool retry) { struct buffer_head *bh, *head; + gfp_t gfp = GFP_NOFS; long offset; -try_again: + if (retry) + gfp |= __GFP_NOFAIL; + head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { - bh = alloc_buffer_head(GFP_NOFS); + bh = alloc_buffer_head(gfp); if (!bh) goto no_grow; @@ -896,23 +899,7 @@ no_grow: } while (head); } - /* - * Return failure for non-async IO requests. 
Async IO requests - * are not allowed to fail, so we have to wait until buffer heads - * become available. But we don't want tasks sleeping with - * partially complete buffers, so all were released above. - */ - if (!retry) - return NULL; - - /* We're _really_ low on memory. Now we just - * wait for old buffer heads to become free due to - * finishing IO. Since this is an async request and - * the reserve list is empty, we're sure there are - * async buffer heads in use. - */ - free_more_memory(); - goto try_again; + return NULL; } EXPORT_SYMBOL_GPL(alloc_page_buffers); @@ -1021,7 +1008,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, /* * Allocate some buffers for this page */ - bh = alloc_page_buffers(page, size, 0); + bh = alloc_page_buffers(page, size, false); if (!bh) goto failed; @@ -1575,7 +1562,7 @@ void create_empty_buffers(struct page *page, { struct buffer_head *bh, *head, *tail; - head = alloc_page_buffers(page, blocksize, 1); + head = alloc_page_buffers(page, blocksize, true); bh = head; do { bh->b_state |= b_state; @@ -2638,7 +2625,7 @@ int nobh_write_begin(struct address_space *mapping, * Be careful: the buffer linked list is a NULL terminated one, rather * than the circular one we're used to. */ - head = alloc_page_buffers(page, blocksize, 0); + head = alloc_page_buffers(page, blocksize, false); if (!head) { ret = -ENOMEM; goto out_release; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index cc91856b5e2d9..3a2e509c77c57 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1739,7 +1739,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { spin_lock(&mapping->private_lock); if (unlikely(!page_has_buffers(page))) { spin_unlock(&mapping->private_lock); - bh = head = alloc_page_buffers(page, bh_size, 1); + bh = head = alloc_page_buffers(page, bh_size, true); spin_lock(&mapping->private_lock); if (likely(!page_has_buffers(page))) { struct buffer_head *tail; diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index b6f402194f02c..ee8392aee9f65 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -507,7 +507,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, if (unlikely(!page_has_buffers(page))) { struct buffer_head *tail; - bh = head = alloc_page_buffers(page, blocksize, 1); + bh = head = alloc_page_buffers(page, blocksize, true); do { set_buffer_uptodate(bh); tail = bh; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c8dae555eccf9..ae2d25f01b98e 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -156,7 +156,7 @@ void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); int try_to_free_buffers(struct page *); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry); + bool retry); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); -- cgit v1.2.3 From 94dc24c0c59a224a093f110060d01c2c620f275a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 05:45:36 -0600 Subject: buffer: grow_dev_page() should use __GFP_NOFAIL for all cases We currently use it for find_or_create_page(), which means that it cannot fail. Ensure we also pass in 'retry == true' to alloc_page_buffers(), which also ensure that it cannot fail. After this, there are no failure cases in grow_dev_page() that occur because of a failed memory allocation. 
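[Editor's note, not part of the series] Background on the flag used in this and the preceding buffer patch: __GFP_NOFAIL tells the allocator to keep retrying internally until the allocation succeeds, so a caller passing it never sees a NULL return and needs no retry loop of its own. A minimal sketch of the pattern follows; the helper name is made up for illustration only.

	#include <linux/buffer_head.h>
	#include <linux/gfp.h>

	/*
	 * Illustration only: with __GFP_NOFAIL the buffer-head allocation
	 * cannot fail, which is why the caller-side retry and
	 * free_more_memory() fallbacks can be dropped later in the series.
	 */
	static struct buffer_head *alloc_bh_nofail(void)
	{
		return alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
	}
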
Reviewed-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/buffer.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 1234ae343aef0..3b60cd8456db4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -988,8 +988,6 @@ grow_dev_page(struct block_device *bdev, sector_t block, gfp_mask |= __GFP_NOFAIL; page = find_or_create_page(inode->i_mapping, index, gfp_mask); - if (!page) - return ret; BUG_ON(!PageLocked(page)); @@ -1008,9 +1006,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, /* * Allocate some buffers for this page */ - bh = alloc_page_buffers(page, size, false); - if (!bh) - goto failed; + bh = alloc_page_buffers(page, size, true); /* * Link the page to the buffers and initialise them. Take the -- cgit v1.2.3 From bc48f001de12225b6430a243504aa60b5ae8a91a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 06:08:57 -0600 Subject: buffer: eliminate the need to call free_more_memory() in __getblk_slow() Since the previous commit removed any case where grow_buffers() would return failure due to memory allocations, we can safely remove the case where we have to call free_more_memory() in this function. Since this is also the last user of free_more_memory(), kill it off completely. Reviewed-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/buffer.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 3b60cd8456db4..bff571dc7bc3b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -252,27 +252,6 @@ out: return ret; } -/* - * Kick the writeback threads then try to free up some ZONE_NORMAL memory. - */ -static void free_more_memory(void) -{ - struct zoneref *z; - int nid; - - wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); - yield(); - - for_each_online_node(nid) { - - z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL); - if (z->zone) - try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, - GFP_NOFS, NULL); - } -} - /* * I/O completion handler for block_read_full_page() - pages * which come unlocked at the end of I/O. @@ -1086,8 +1065,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; - if (ret == 0) - free_more_memory(); } } -- cgit v1.2.3 From 9ba4b2dfafaa711b41cc2102b0e9a529f3981218 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Sep 2017 08:58:25 -0600 Subject: fs: kill 'nr_pages' argument from wakeup_flusher_threads() Everybody is passing in 0 now, let's get rid of the argument. Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 9 ++++----- fs/sync.c | 2 +- include/linux/writeback.h | 2 +- mm/vmscan.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 245c430a2e418..bb6148dc6d245 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1947,12 +1947,12 @@ void wb_workfn(struct work_struct *work) } /* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. + * Wakeup the flusher threads to start writeback of all currently dirty pages */ -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) +void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; + long nr_pages; /* * If we are expecting writeback progress we must submit plugged IO. 
@@ -1960,8 +1960,7 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) if (blk_needs_flush_plug(current)) blk_schedule_flush_plug(current); - if (!nr_pages) - nr_pages = get_nr_dirty_pages(); + nr_pages = get_nr_dirty_pages(); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { diff --git a/fs/sync.c b/fs/sync.c index a576aa2e6b090..09f96a18dd930 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -108,7 +108,7 @@ SYSCALL_DEFINE0(sync) { int nowait = 0, wait = 1; - wakeup_flusher_threads(0, WB_REASON_SYNC); + wakeup_flusher_threads(WB_REASON_SYNC); iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d5815794416c9..1f9c6db5e29a0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,7 +189,7 @@ bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); +void wakeup_flusher_threads(enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 13d711dd87766..42a7fdd52d877 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1867,7 +1867,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * also allow kswapd to start writing pages during reclaim. */ if (stat.nr_unqueued_dirty == nr_taken) { - wakeup_flusher_threads(0, WB_REASON_VMSCAN); + wakeup_flusher_threads(WB_REASON_VMSCAN); set_bit(PGDAT_DIRTY, &pgdat->flags); } -- cgit v1.2.3 From d31cd9d326f216fca9a602dddac254f668f5dcd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 13:28:18 -0600 Subject: writeback: switch wakeup_flusher_threads() to cyclic writeback We're writing back the full range of dirty pages on the devices, there's no point in making this special and not do normal range cyclic writeback. Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index bb6148dc6d245..65e6992d8719a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1971,7 +1971,7 @@ void wakeup_flusher_threads(enum wb_reason reason) list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - false, reason); + true, reason); } rcu_read_unlock(); } -- cgit v1.2.3 From 47410d88f665486bf91f02242ab5d5692b8887ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:25:03 -0600 Subject: writeback: remove 'range_cyclic' argument for wb_start_writeback() All the callers pass in 'true' for range_cyclic, so kill the argument. 
Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 +++--- include/linux/backing-dev.h | 2 +- mm/page-writeback.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 65e6992d8719a..fe555bce886c1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -934,7 +934,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, #endif /* CONFIG_CGROUP_WRITEBACK */ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - bool range_cyclic, enum wb_reason reason) + enum wb_reason reason) { struct wb_writeback_work *work; @@ -955,7 +955,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; - work->range_cyclic = range_cyclic; + work->range_cyclic = 1; work->reason = reason; work->auto_free = 1; @@ -1971,7 +1971,7 @@ void wakeup_flusher_threads(enum wb_reason reason) list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - true, reason); + reason); } rcu_read_unlock(); } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 854e1bdd0b2a4..0f63493de9e7d 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -39,7 +39,7 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) } void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - bool range_cyclic, enum wb_reason reason); + enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b9c5cbe8eba0..dede533551232 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1994,8 +1994,8 @@ void laptop_mode_timer_fn(unsigned long data) rcu_read_lock(); list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node) if (wb_has_dirty_io(wb)) - wb_start_writeback(wb, nr_pages, true, - WB_REASON_LAPTOP_TIMER); + wb_start_writeback(wb, nr_pages, + WB_REASON_LAPTOP_TIMER); rcu_read_unlock(); } -- cgit v1.2.3 From 595043e5f9ef1d8263bd9fda215cade489227491 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:26:59 -0600 Subject: writeback: provide a wakeup_flusher_threads_bdi() Similar to wakeup_flusher_threads(), except that we only wake up the flusher threads on the specified backing device. No functional changes in this patch. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 39 +++++++++++++++++++++++++++++---------- include/linux/writeback.h | 2 ++ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fe555bce886c1..9f39829459dc0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1946,6 +1946,33 @@ void wb_workfn(struct work_struct *work) current->flags &= ~PF_SWAPWRITE; } +/* + * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero, + * write back the whole world. 
+ */ +static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + long nr_pages, enum wb_reason reason) +{ + struct bdi_writeback *wb; + + if (!bdi_has_dirty_io(bdi)) + return; + + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) + wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), + reason); +} + +void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason) +{ + long nr_pages = get_nr_dirty_pages(); + + rcu_read_lock(); + __wakeup_flusher_threads_bdi(bdi, nr_pages, reason); + rcu_read_unlock(); +} + /* * Wakeup the flusher threads to start writeback of all currently dirty pages */ @@ -1963,16 +1990,8 @@ void wakeup_flusher_threads(enum wb_reason reason) nr_pages = get_nr_dirty_pages(); rcu_read_lock(); - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { - struct bdi_writeback *wb; - - if (!bdi_has_dirty_io(bdi)) - continue; - - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) - wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - reason); - } + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) + __wakeup_flusher_threads_bdi(bdi, nr_pages, reason); rcu_read_unlock(); } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1f9c6db5e29a0..9c0091678af48 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -190,6 +190,8 @@ bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); +void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ -- cgit v1.2.3 From 0ab29fd0accf6e5b6ef5dbe2e0335da6687ce60c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:28:02 -0600 Subject: writeback: pass in '0' for nr_pages writeback in laptop mode Laptop mode really wants to writeback the number of dirty pages and inodes. Instead of calculating this in the caller, just pass in 0 and let wakeup_flusher_threads() handle it. Use the new wakeup_flusher_threads_bdi() instead of rolling our own. 
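[Editor's note, not part of the series] Taken together, the writeback patches here reduce the flusher wakeup interface to two calls that each take only a wb_reason and always write back all currently dirty pages, either system-wide or for a single backing device. A sketch of the resulting call forms; the surrounding caller context (including q, a struct request_queue) is assumed for illustration.

	#include <linux/writeback.h>
	#include <linux/backing-dev.h>

	/* wake the flusher threads on every bdi, e.g. from sync(2) or vmscan */
	wakeup_flusher_threads(WB_REASON_VMSCAN);

	/* wake only one device's flushers, e.g. from the laptop-mode timer;
	 * q is the caller's struct request_queue (illustration) */
	wakeup_flusher_threads_bdi(q->backing_dev_info, WB_REASON_LAPTOP_TIMER);
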
Acked-by: Johannes Weiner Tested-by: Chris Mason Signed-off-by: Jens Axboe Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- mm/page-writeback.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dede533551232..8d1fc593bce8f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1980,23 +1980,8 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void laptop_mode_timer_fn(unsigned long data) { struct request_queue *q = (struct request_queue *)data; - int nr_pages = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); - struct bdi_writeback *wb; - /* - * We want to write everything out, not just down to the dirty - * threshold - */ - if (!bdi_has_dirty_io(q->backing_dev_info)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node) - if (wb_has_dirty_io(wb)) - wb_start_writeback(wb, nr_pages, - WB_REASON_LAPTOP_TIMER); - rcu_read_unlock(); + wakeup_flusher_threads_bdi(q->backing_dev_info, WB_REASON_LAPTOP_TIMER); } /* -- cgit v1.2.3 From 9dfb176fae57a1dea68531fd25e867037e4d9bac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:28:55 -0600 Subject: writeback: make wb_start_writeback() static We don't have any callers outside of fs-writeback.c anymore, make it private. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 4 ++-- include/linux/backing-dev.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9f39829459dc0..f1042061eaad7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -933,8 +933,8 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, #endif /* CONFIG_CGROUP_WRITEBACK */ -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason) +static void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason) { struct wb_writeback_work *work; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0f63493de9e7d..157e950a70dc1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -38,8 +38,6 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); } -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From e8e8a0c6c9bfc0b320671166dd795f413f636773 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:31:22 -0600 Subject: writeback: move nr_pages == 0 logic to one location Now that we have no external callers of wb_start_writeback(), we can shuffle the passing in of 'nr_pages'. Everybody passes in 0 at this point, so just kill the argument and move the dirty count retrieval to that function. 
Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f1042061eaad7..424577152eb57 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -933,8 +933,18 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, #endif /* CONFIG_CGROUP_WRITEBACK */ -static void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason) +/* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ +static unsigned long get_nr_dirty_pages(void) +{ + return global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS) + + get_nr_dirty_inodes(); +} + +static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) { struct wb_writeback_work *work; @@ -954,7 +964,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, } work->sync_mode = WB_SYNC_NONE; - work->nr_pages = nr_pages; + work->nr_pages = wb_split_bdi_pages(wb, get_nr_dirty_pages()); work->range_cyclic = 1; work->reason = reason; work->auto_free = 1; @@ -1814,17 +1824,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) return work; } -/* - * Add in the number of potentially dirty inodes, because each inode - * write can dirty pagecache in the underlying blockdev. - */ -static unsigned long get_nr_dirty_pages(void) -{ - return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + - get_nr_dirty_inodes(); -} - static long wb_check_background_flush(struct bdi_writeback *wb) { if (wb_over_bg_thresh(wb)) { @@ -1951,7 +1950,7 @@ void wb_workfn(struct work_struct *work) * write back the whole world. */ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, - long nr_pages, enum wb_reason reason) + enum wb_reason reason) { struct bdi_writeback *wb; @@ -1959,17 +1958,14 @@ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, return; list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) - wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - reason); + wb_start_writeback(wb, reason); } void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { - long nr_pages = get_nr_dirty_pages(); - rcu_read_lock(); - __wakeup_flusher_threads_bdi(bdi, nr_pages, reason); + __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } @@ -1979,7 +1975,6 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; - long nr_pages; /* * If we are expecting writeback progress we must submit plugged IO. 
@@ -1987,11 +1982,9 @@ void wakeup_flusher_threads(enum wb_reason reason) if (blk_needs_flush_plug(current)) blk_schedule_flush_plug(current); - nr_pages = get_nr_dirty_pages(); - rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - __wakeup_flusher_threads_bdi(bdi, nr_pages, reason); + __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } -- cgit v1.2.3 From aac8d41cd438f25bf3110fc6b98f1d16d7dbc169 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:31:55 -0600 Subject: writeback: only allow one inflight and pending full flush When someone calls wakeup_flusher_threads() or wakeup_flusher_threads_bdi(), they schedule writeback of all dirty pages in the system (or on that bdi). If we are tight on memory, we can get tons of these queued from kswapd/vmscan. This causes (at least) two problems: 1) We consume a ton of memory just allocating writeback work items. We've seen as much as 600 million of these writeback work items pending. That's a lot of memory to pointlessly hold hostage, while the box is under memory pressure. 2) We spend so much time processing these work items, that we introduce a softlockup in writeback processing. This is because each of the writeback work items don't end up doing any work (it's hard when you have millions of identical ones coming in to the flush machinery), so we just sit in a tight loop pulling work items and deleting/freeing them. Fix this by adding a 'start_all' bit to the writeback structure, and set that when someone attempts to flush all dirty pages. The bit is cleared when we start writeback on that work item. If the bit is already set when we attempt to queue !nr_pages writeback, then we simply ignore it. This provides us one full flush in flight, with one pending as well, and makes for more efficient handling of this type of writeback. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 25 +++++++++++++++++++++++++ include/linux/backing-dev-defs.h | 1 + 2 files changed, 26 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 424577152eb57..399619c975679 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,6 +53,7 @@ struct wb_writeback_work { unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ + unsigned int start_all:1; /* nr_pages == 0 (all) writeback */ enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -951,6 +952,20 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) if (!wb_has_dirty_io(wb)) return; + /* + * All callers of this function want to start writeback of all + * dirty pages. Places like vmscan can call this at a very + * high frequency, causing pointless allocations of tons of + * work items and keeping the flusher threads busy retrieving + * that work. Ensure that we only allow one of them pending and + * inflight at the time. It doesn't matter if we race a little + * bit on this, so use the faster separate test/set bit variants. 
+ */ + if (test_bit(WB_start_all, &wb->state)) + return; + + set_bit(WB_start_all, &wb->state); + /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback @@ -958,6 +973,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); if (!work) { + clear_bit(WB_start_all, &wb->state); trace_writeback_nowork(wb); wb_wakeup(wb); return; @@ -968,6 +984,7 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) work->range_cyclic = 1; work->reason = reason; work->auto_free = 1; + work->start_all = 1; wb_queue_work(wb, work); } @@ -1821,6 +1838,14 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) list_del_init(&work->list); } spin_unlock_bh(&wb->work_lock); + + /* + * Once we start processing a work item that had !nr_pages, + * clear the wb state bit for that so we can allow more. + */ + if (work && work->start_all) + clear_bit(WB_start_all, &wb->state); + return work; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 866c433e7d322..420de5c7c7f9a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -24,6 +24,7 @@ enum wb_state { WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ + WB_start_all, /* nr_pages == 0 (all) work pending */ }; enum wb_congested_state { -- cgit v1.2.3 From 4baa8bb13f41307f3eb62fe91f93a1a798ebef53 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:00 +0200 Subject: block, bfq: fix wrong init of saved start time for weight raising This commit fixes a bug that causes bfq to fail to guarantee a high responsiveness on some drives, if there is heavy random read+write I/O in the background. More precisely, such a failure allowed this bug to be found [1], but the bug may well cause other yet unreported anomalies. BFQ raises the weight of the bfq_queues associated with soft real-time applications, to privilege the I/O, and thus reduce latency, for these applications. This mechanism is named soft-real-time weight raising in BFQ. A soft real-time period may happen to be nested into an interactive weight raising period, i.e., it may happen that, when a bfq_queue switches to a soft real-time weight-raised state, the bfq_queue is already being weight-raised because deemed interactive too. In this case, BFQ saves in a special variable wr_start_at_switch_to_srt, the time instant when the interactive weight-raising period started for the bfq_queue, i.e., the time instant when BFQ started to deem the bfq_queue interactive. This value is then used to check whether the interactive weight-raising period would still be in progress when the soft real-time weight-raising period ends. If so, interactive weight raising is restored for the bfq_queue. This restore is useful, in particular, because it prevents bfq_queues from losing their interactive weight raising prematurely, as a consequence of spurious, short-lived soft real-time weight-raising periods caused by wrong detections as soft real-time. If, instead, a bfq_queue switches to soft-real-time weight raising while it *is not* already in an interactive weight-raising period, then the variable wr_start_at_switch_to_srt has no meaning during the following soft real-time weight-raising period. 
Unfortunately the handling of this case is wrong in BFQ: not only the variable is not flagged somehow as meaningless, but it is also set to the time when the switch to soft real-time weight-raising occurs. This may cause an interactive weight-raising period to be considered mistakenly as still in progress, and thus a spurious interactive weight-raising period to start for the bfq_queue, at the end of the soft-real-time weight-raising period. In particular the spurious interactive weight-raising period will be considered as still in progress, if the soft-real-time weight-raising period does not last very long. The bfq_queue will then be wrongly privileged and, if I/O bound, will unjustly steal bandwidth to truly interactive or soft real-time bfq_queues, harming responsiveness and low latency. This commit fixes this issue by just setting wr_start_at_switch_to_srt to minus infinity (farthest past time instant according to jiffies macros): when the soft-real-time weight-raising period ends, certainly no interactive weight-raising period will be considered as still in progress. [1] Background I/O Type: Random - Background I/O mix: Reads and writes - Application to start: LibreOffice Writer in http://www.phoronix.com/scan.php?page=news_item&px=Linux-4.13-IO-Laptop Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Tested-by: Mirko Montanari Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a4783da90ba88..c25955c25e033 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1202,6 +1202,24 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } +/* + * Return the farthest future time instant according to jiffies + * macros. + */ +static unsigned long bfq_greatest_from_now(void) +{ + return jiffies + MAX_JIFFY_OFFSET; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; +} + static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, struct bfq_queue *bfqq, unsigned int old_wr_coeff, @@ -1216,7 +1234,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { - bfqq->wr_start_at_switch_to_srt = jiffies; + /* + * No interactive weight raising in progress + * here: assign minus infinity to + * wr_start_at_switch_to_srt, to make sure + * that, at the end of the soft-real-time + * weight raising periods that is starting + * now, no interactive weight-raising period + * may be wrongly considered as still in + * progress (and thus actually started by + * mistake). + */ + bfqq->wr_start_at_switch_to_srt = + bfq_smallest_from_now(); bfqq->wr_coeff = bfqd->bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR; bfqq->wr_cur_max_time = @@ -2897,24 +2927,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } -/* - * Return the farthest future time instant according to jiffies - * macros. - */ -static unsigned long bfq_greatest_from_now(void) -{ - return jiffies + MAX_JIFFY_OFFSET; -} - -/* - * Return the farthest past time instant according to jiffies - * macros. 
- */ -static unsigned long bfq_smallest_from_now(void) -{ - return jiffies - MAX_JIFFY_OFFSET; -} - /** * bfq_bfqq_expire - expire a queue. * @bfqd: device owning the queue. -- cgit v1.2.3 From 3e2bdd6dff239afd8386e8758eee69ad61e5a3d6 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:01 +0200 Subject: block, bfq: check and switch back to interactive wr also on queue split As already explained in the message of commit "block, bfq: fix wrong init of saved start time for weight raising", if a soft real-time weight-raising period happens to be nested in a larger interactive weight-raising period, then BFQ restores the interactive weight raising at the end of the soft real-time weight raising. In particular, BFQ checks whether the latter has ended only on request dispatches. Unfortunately, the above scheme fails to restore interactive weight raising in the following corner case: if a bfq_queue, say Q, 1) Is merged with another bfq_queue while it is in a nested soft real-time weight-raising period. The weight-raising state of Q is then saved, and not considered any longer until a split occurs. 2) Is split from the other bfq_queue(s) at a time instant when its soft real-time weight raising is already finished. On the split, while resuming the previous, soft real-time weight-raised state of the bfq_queue Q, BFQ checks whether the current soft real-time weight-raising period is actually over. If so, BFQ switches weight raising off for Q, *without* checking whether the soft real-time period was actually nested in a non-yet-finished interactive weight-raising period. This commit addresses this issue by adding the above missing check in bfq_queue splits, and restoring interactive weight raising if needed. Signed-off-by: Paolo Valente Tested-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 87 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c25955c25e033..33b63bc4a64b6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -724,6 +724,44 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, } } +static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + /* + * Limit duration between 3 and 13 seconds. Tests show that + * higher values than 13 seconds often yield the opposite of + * the desired result, i.e., worsen responsiveness by letting + * non-interactive and non-soft-real-time applications + * preserve weight raising for a too long time interval. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. 
+ */ + if (dur > msecs_to_jiffies(13000)) + dur = msecs_to_jiffies(13000); + else if (dur < msecs_to_jiffies(3000)) + dur = msecs_to_jiffies(3000); + + return dur; +} + +/* switch back from soft real-time to interactive weight raising */ +static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, + struct bfq_data *bfqd) +{ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; +} + static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) @@ -750,10 +788,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time))) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr"); - - bfqq->wr_coeff = 1; + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + !bfq_bfqq_in_large_burst(bfqq) && + time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + switch_back_to_interactive_wr(bfqq, bfqd); + } else { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr"); + } } /* make sure weight will be updated, however we got here */ @@ -1173,35 +1217,6 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, return wr_or_deserves_wr; } -static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -{ - u64 dur; - - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - - /* - * Limit duration between 3 and 13 seconds. Tests show that - * higher values than 13 seconds often yield the opposite of - * the desired result, i.e., worsen responsiveness by letting - * non-interactive and non-soft-real-time applications - * preserve weight raising for a too long time interval. - * - * On the other end, lower values than 3 seconds make it - * difficult for most interactive tasks to complete their jobs - * before weight-raising finishes. - */ - if (dur > msecs_to_jiffies(13000)) - dur = msecs_to_jiffies(13000); - else if (dur < msecs_to_jiffies(3000)) - dur = msecs_to_jiffies(3000); - - return dur; -} - /* * Return the farthest future time instant according to jiffies * macros. @@ -3501,11 +3516,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_wr_duration(bfqd))) bfq_bfqq_end_wr(bfqq); else { - /* switch back to interactive wr */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - bfqq->last_wr_start_finish = - bfqq->wr_start_at_switch_to_srt; + switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } } -- cgit v1.2.3 From 894df937e06a56ed6f054a75a416aff84147c5a2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:02 +0200 Subject: block, bfq: let early-merged queues be weight-raised on split too A just-created bfq_queue, say Q, may happen to be merged with another bfq_queue on the very first invocation of the function __bfq_insert_request. In such a case, even if Q would clearly deserve interactive weight raising (as it has just been created), the function bfq_add_request does not make it to be invoked for Q, and thus to activate weight raising for Q. 
As a consequence, when the state of Q is saved for a possible future restore, after a split of Q from the other bfq_queue(s), such a state happens to be (unjustly) non-weight-raised. Then the bfq_queue will not enjoy any weight raising on the split, even if should still be in an interactive weight-raising period when the split occurs. This commit solves this problem as follows, for a just-created bfq_queue that is being early-merged: it stores directly, in the saved state of the bfq_queue, the weight-raising state that would have been assigned to the bfq_queue if not early-merged. Signed-off-by: Paolo Valente Tested-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 33b63bc4a64b6..115747fe43c88 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2061,10 +2061,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + if (unlikely(bfq_bfqq_just_created(bfqq) && + !bfq_bfqq_in_large_burst(bfqq))) { + /* + * bfqq being merged right after being created: bfqq + * would have deserved interactive weight raising, but + * did not make it to be set in a weight-raised state, + * because of this early merge. Store directly the + * weight-raising state that would have been assigned + * to bfqq, so that to avoid that bfqq unjustly fails + * to enjoy weight raising if split soon. + */ + bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); + bic->saved_last_wr_start_finish = jiffies; + } else { + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + } } static void @@ -4150,7 +4167,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) new_bfqq->allocated++; bfqq->allocated--; new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); /* * If the bic associated with the process * issuing this request still points to bfqq @@ -4162,6 +4178,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); + + bfq_clear_bfqq_just_created(bfqq); /* * rq is about to be enqueued into new_bfqq, * release rq reference on bfqq -- cgit v1.2.3 From 7cb04004fa371a626c1a5ebe6d977f70285759ed Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 21 Sep 2017 11:04:03 +0200 Subject: block, bfq: decrease burst size when queues in burst exit If many queues belonging to the same group happen to be created shortly after each other, then the concurrent processes associated with these queues have typically a common goal, and they get it done as soon as possible if not hampered by device idling. Examples are processes spawned by git grep, or by systemd during boot. 
As for device idling, this mechanism is currently necessary for weight raising to succeed in its goal: privileging I/O. In view of these facts, BFQ does not provide the above queues with either weight raising or device idling. On the other hand, a burst of queue creations may be caused also by the start-up of a complex application. In this case, these queues need usually to be served one after the other, and as quickly as possible, to maximise responsiveness. Therefore, in this case the best strategy is to weight-raise all the queues created during the burst, i.e., the exact opposite of the strategy for the above case. To distinguish between the two cases, BFQ uses an empirical burst-size threshold, found through extensive tests and monitoring of daily usage. Only large bursts, i.e., burst with a size above this threshold, are considered as generated by a high number of parallel processes. In this respect, upstart-based boot proved to be rather hard to detect as generating a large burst of queue creations, because with upstart most of the queues created in a burst exit *before* the next queues in the same burst are created. To address this issue, I changed the burst-detection mechanism so as to not decrease the size of the current burst even if one of the queues in the burst is eliminated. Unfortunately, this missing decrease causes false positives on very fast systems: on the start-up of a complex application, such as libreoffice writer, so many queues are created, served and exited shortly after each other, that a large burst of queue creations is wrongly detected as occurring. These false positives just disappear if the size of a burst is decreased when one of the queues in the burst exits. This commit restores the missing burst-size decrease, relying of the fact that upstart is apparently unlikely to be used on systems running this and future versions of the kernel. Signed-off-by: Paolo Valente Signed-off-by: Mauro Andreolini Signed-off-by: Angelo Ruocco Tested-by: Mirko Montanari Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 115747fe43c88..70f9177c4f5b2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3725,16 +3725,10 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; - if (bfq_bfqq_sync(bfqq)) - /* - * The fact that this queue is being destroyed does not - * invalidate the fact that this queue may have been - * activated during the current burst. As a consequence, - * although the queue does not exist anymore, and hence - * needs to be removed from the burst list if there, - * the burst size has not to be decremented. - */ + if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); + bfqq->bfqd->burst_size--; + } kmem_cache_free(bfq_pool, bfqq); #ifdef CONFIG_BFQ_GROUP_IOSCHED -- cgit v1.2.3 From 9c9883744dda1cc38339a448dd8435140537027e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Oct 2017 10:47:00 +0200 Subject: block: move __elv_next_request to blk-core.c No need to have this helper inline in a header. Also drop the __ prefix. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++++++++++++++++++++++++-- block/blk.h | 39 --------------------------------------- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 048be4aa60244..14f7674fa0b14 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2517,6 +2517,45 @@ void blk_account_io_start(struct request *rq, bool new_io) part_stat_unlock(); } +static struct request *elv_next_request(struct request_queue *q) +{ + struct request *rq; + struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); + + WARN_ON_ONCE(q->mq_ops); + + while (1) { + if (!list_empty(&q->queue_head)) { + rq = list_entry_rq(q->queue_head.next); + return rq; + } + + /* + * Flush request is running and flush request isn't queueable + * in the drive, we can hold the queue till flush request is + * finished. Even we don't do this, driver can't dispatch next + * requests and will requeue them. And this can improve + * throughput too. For example, we have request flush1, write1, + * flush 2. flush1 is dispatched, then queue is hold, write1 + * isn't inserted to queue. After flush1 is finished, flush2 + * will be dispatched. Since disk cache is already clean, + * flush2 will be finished very soon, so looks like flush2 is + * folded to flush1. + * Since the queue is hold, a flag is set to indicate the queue + * should be restarted later. Please see flush_end_io() for + * details. + */ + if (fq->flush_pending_idx != fq->flush_running_idx && + !queue_flush_queueable(q)) { + fq->flush_queue_delayed = 1; + return NULL; + } + if (unlikely(blk_queue_bypass(q)) || + !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) + return NULL; + } +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -2538,8 +2577,7 @@ struct request *blk_peek_request(struct request_queue *q) lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); - while ((rq = __elv_next_request(q)) != NULL) { - + while ((rq = elv_next_request(q)) != NULL) { rq = blk_pm_peek_request(q, rq); if (!rq) break; diff --git a/block/blk.h b/block/blk.h index fcb9775b997d2..fda5a4632aba3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -148,45 +148,6 @@ static inline void blk_clear_rq_complete(struct request *rq) void blk_insert_flush(struct request *rq); -static inline struct request *__elv_next_request(struct request_queue *q) -{ - struct request *rq; - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - - WARN_ON_ONCE(q->mq_ops); - - while (1) { - if (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - return rq; - } - - /* - * Flush request is running and flush request isn't queueable - * in the drive, we can hold the queue till flush request is - * finished. Even we don't do this, driver can't dispatch next - * requests and will requeue them. And this can improve - * throughput too. For example, we have request flush1, write1, - * flush 2. flush1 is dispatched, then queue is hold, write1 - * isn't inserted to queue. After flush1 is finished, flush2 - * will be dispatched. Since disk cache is already clean, - * flush2 will be finished very soon, so looks like flush2 is - * folded to flush1. - * Since the queue is hold, a flag is set to indicate the queue - * should be restarted later. Please see flush_end_io() for - * details. 
- */ - if (fq->flush_pending_idx != fq->flush_running_idx && - !queue_flush_queueable(q)) { - fq->flush_queue_delayed = 1; - return NULL; - } - if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) - return NULL; - } -} - static inline void elv_activate_rq(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; -- cgit v1.2.3 From 8ae4e4477d8f5cc7736816a0492f82934ca32ab7 Mon Sep 17 00:00:00 2001 From: Marc Olson Date: Wed, 6 Sep 2017 17:23:56 -0700 Subject: nvme: update timeout module parameter type The underlying blk_mq_tag_set, and request timeout parameters support an unsigned int. Extend the size of the nvme module parameters for io and admin commands to match. Signed-off-by: Marc Olson Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 ++++---- drivers/nvme/host/nvme.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bb2aad0786373..26c8913435b25 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -34,13 +34,13 @@ #define NVME_MINORS (1U << MINORBITS) -unsigned char admin_timeout = 60; -module_param(admin_timeout, byte, 0644); +unsigned int admin_timeout = 60; +module_param(admin_timeout, uint, 0644); MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); EXPORT_SYMBOL_GPL(admin_timeout); -unsigned char nvme_io_timeout = 30; -module_param_named(io_timeout, nvme_io_timeout, byte, 0644); +unsigned int nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, uint, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); EXPORT_SYMBOL_GPL(nvme_io_timeout); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index d3f3c44475157..df787f39f4c1e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -21,10 +21,10 @@ #include #include -extern unsigned char nvme_io_timeout; +extern unsigned int nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) -extern unsigned char admin_timeout; +extern unsigned int admin_timeout; #define ADMIN_TIMEOUT (admin_timeout * HZ) #define NVME_DEFAULT_KATO 5 -- cgit v1.2.3 From 786456325b9f05843ec722269a3ccd77c31b9c6d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Sep 2017 18:04:43 -0700 Subject: nvme: use menu Kconfig interface Add a menu interface for NVME host and target support so that it is presented to users more like other Kconfig symbols. This makes the Device Driver menu less cluttered (easier to read) and keeps all of these symbols grouped together. Signed-off-by: Randy Dunlap Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig index b7c78a5b1f7a7..04008e0bbe817 100644 --- a/drivers/nvme/Kconfig +++ b/drivers/nvme/Kconfig @@ -1,2 +1,6 @@ +menu "NVME Support" + source "drivers/nvme/host/Kconfig" source "drivers/nvme/target/Kconfig" + +endmenu -- cgit v1.2.3 From 3375e29c280894f8d5e65c6920630a42db069599 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 11 Sep 2017 16:14:50 -0700 Subject: nvmet: bump NVMET_NR_QUEUES to 128 Raise the max number of IO queues to 128. There are several hosts with more than 64 cpus/threads. 
Signed-off-by: James Smart Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/nvmet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 7b8e20adf7602..e342f02845c19 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -314,7 +314,7 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, u32 nvmet_get_log_page_len(struct nvme_command *cmd); #define NVMET_QUEUE_SIZE 1024 -#define NVMET_NR_QUEUES 64 +#define NVMET_NR_QUEUES 128 #define NVMET_MAX_CMD NVMET_QUEUE_SIZE #define NVMET_KAS 10 #define NVMET_DISC_KATO 120 -- cgit v1.2.3 From d1f1071f8151355ec149117238e9c5aa6404ca9f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 24 Sep 2017 16:15:55 +0300 Subject: nvme-fabrics: request transport module Help userspace to make sure transport module is loaded. Signed-off-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 555c976cc2ee7..8bca36a469245 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -841,6 +841,9 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) if (ret) goto out_free_opts; + + request_module("nvme-%s", opts->transport); + /* * Check the generic options first as we need a valid transport for * the lookup below. Then clear the generic flags so that transport -- cgit v1.2.3 From eaefd5abf6b095bfc55eb745bdf7c42cf66790eb Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 10:38:42 -0700 Subject: nvme-fc: add uevent for auto-connect To support auto-connecting to FC-NVME devices upon their dynamic appearance, add a uevent that can kick off connection scripts. uevent is posted against the fc_udev device. patch set tested with the following rule to kick an nvme-cli connect-all for the FC initiator and FC target ports. This is just an example for testing and not intended for real life use. ACTION=="change", SUBSYSTEM=="fc", ENV{FC_EVENT}=="nvmediscovery", \ ENV{NVMEFC_HOST_TRADDR}=="*", ENV{NVMEFC_TRADDR}=="*", \ RUN+="/bin/sh -c '/usr/local/sbin/nvme connect-all --transport=fc --host-traddr=$env{NVMEFC_HOST_TRADDR} --traddr=$env{NVMEFC_TRADDR} >> /tmp/nvme_fc.log'" I will post proposed udev/systemd scripts for possible kernel support. 
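For illustration only (not part of this patch): a handler launched by such a RUN+= rule sees the advertised variables in its environment, so the same connect-all invocation could equally be done from a small program. A minimal user-space sketch, assuming nvme-cli is installed, might look like this:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        /* variables exported with the udev event, per the example rule above */
        const char *host = getenv("NVMEFC_HOST_TRADDR");
        const char *tgt = getenv("NVMEFC_TRADDR");
        char cmd[256];

        if (!host || !tgt)
                return 1;
        snprintf(cmd, sizeof(cmd),
                 "nvme connect-all --transport=fc --host-traddr=%s --traddr=%s",
                 host, tgt);
        return system(cmd);
}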
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 49 ++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme-fc-driver.h | 2 ++ 2 files changed, 51 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index af075e9989449..f546eecb1f82b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -452,6 +452,36 @@ nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr) } EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport); +/* + * TRADDR strings, per FC-NVME are fixed format: + * "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters + * udev event will only differ by prefix of what field is + * being specified: + * "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters + * 19 + 43 + null_fudge = 64 characters + */ +#define FCNVME_TRADDR_LENGTH 64 + +static void +nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport, + struct nvme_fc_rport *rport) +{ + char hostaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_HOST_TRADDR=...*/ + char tgtaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_TRADDR=...*/ + char *envp[4] = { "FC_EVENT=nvmediscovery", hostaddr, tgtaddr, NULL }; + + if (!(rport->remoteport.port_role & FC_PORT_ROLE_NVME_DISCOVERY)) + return; + + snprintf(hostaddr, sizeof(hostaddr), + "NVMEFC_HOST_TRADDR=nn-0x%016llx:pn-0x%016llx", + lport->localport.node_name, lport->localport.port_name); + snprintf(tgtaddr, sizeof(tgtaddr), + "NVMEFC_TRADDR=nn-0x%016llx:pn-0x%016llx", + rport->remoteport.node_name, rport->remoteport.port_name); + kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp); +} + /** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME @@ -516,6 +546,8 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, list_add_tail(&newrec->endp_list, &lport->endp_list); spin_unlock_irqrestore(&nvme_fc_lock, flags); + nvme_fc_signal_discovery_scan(lport, newrec); + *portptr = &newrec->remoteport; return 0; @@ -634,6 +666,23 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) } EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); +/** + * nvme_fc_rescan_remoteport - transport entry point called by an + * LLDD to request a nvme device rescan. + * @remoteport: pointer to the (registered) remote port that is to be + * rescanned. + * + * Returns: N/A + */ +void +nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(remoteport); + + nvme_fc_signal_discovery_scan(rport->lport, rport); +} +EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); + /* *********************** FC-NVME DMA Handling **************************** */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index a726f96010d59..4ea03b9a5c8cb 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -446,6 +446,8 @@ int nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); +void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); + /* -- cgit v1.2.3 From 5f5685569ae8fccb0344373d823f2e4c59bb3d8e Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 10:38:41 -0700 Subject: nvme-fc: create fc class and transport device Added a new fc class and a device node for udev events under it. I expect the fc class will eventually be the location where the FC SCSI and FC NVME merge in the future. Therefore names are kept somewhat generic. 
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f546eecb1f82b..2dc9932e48182 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -213,6 +213,13 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt); +/* + * These items are short-term. They will eventually be moved into + * a generic FC class. See comments in module init. + */ +static struct class *fc_class; +static struct device *fc_udev_device; + /* *********************** FC-NVME Port Management ************************ */ @@ -3046,7 +3053,50 @@ static struct nvmf_transport_ops nvme_fc_transport = { static int __init nvme_fc_init_module(void) { - return nvmf_register_transport(&nvme_fc_transport); + int ret; + + /* + * NOTE: + * It is expected that in the future the kernel will combine + * the FC-isms that are currently under scsi and now being + * added to by NVME into a new standalone FC class. The SCSI + * and NVME protocols and their devices would be under this + * new FC class. + * + * As we need something to post FC-specific udev events to, + * specifically for nvme probe events, start by creating the + * new device class. When the new standalone FC class is + * put in place, this code will move to a more generic + * location for the class. + */ + fc_class = class_create(THIS_MODULE, "fc"); + if (IS_ERR(fc_class)) { + pr_err("couldn't register class fc\n"); + return PTR_ERR(fc_class); + } + + /* + * Create a device for the FC-centric udev events + */ + fc_udev_device = device_create(fc_class, NULL, MKDEV(0, 0), NULL, + "fc_udev_device"); + if (IS_ERR(fc_udev_device)) { + pr_err("couldn't create fc_udev device!\n"); + ret = PTR_ERR(fc_udev_device); + goto out_destroy_class; + } + + ret = nvmf_register_transport(&nvme_fc_transport); + if (ret) + goto out_destroy_device; + + return 0; + +out_destroy_device: + device_destroy(fc_class, MKDEV(0, 0)); +out_destroy_class: + class_destroy(fc_class); + return ret; } static void __exit nvme_fc_exit_module(void) @@ -3059,6 +3109,9 @@ static void __exit nvme_fc_exit_module(void) ida_destroy(&nvme_fc_local_port_cnt); ida_destroy(&nvme_fc_ctrl_cnt); + + device_destroy(fc_class, MKDEV(0, 0)); + class_destroy(fc_class); } module_init(nvme_fc_init_module); -- cgit v1.2.3 From a7af0af32171c17d881e3e58b0925c4a44fb5a42 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Sep 2017 10:00:22 +0200 Subject: blk-mq: attempt to fix atomic flag memory ordering Attempt to untangle the ordering in blk-mq. The patch introducing the single smp_mb__before_atomic() is obviously broken in that it doesn't clearly specify a pairing barrier and an obtained guarantee. The comment is further misleading in that it hints that the deadline store and the COMPLETE store also need to be ordered, but AFAICT there is no such dependency. However what does appear to be important is the clear happening _after_ the store, and that worked by pure accident. This clarifies blk_mq_start_request() -- we should not get there with STARTING set -- this simplifies the code and makes the barrier usage sane (the old code could be read to allow not having _any_ atomic after the barrier, in which case the barrier hasn't got anything to order). We then also introduce the missing pairing barrier for it. 
Also down-grade the barrier to smp_wmb(), this is cheaper for PowerPC/ARM and doesn't cost anything extra on x86. And it documents the STARTING vs COMPLETE ordering. Although I've not been entirely successful in reverse engineering the blk-mq state machine so there might still be more funnies around timeout vs requeue. If I got anything wrong, feel free to educate me by adding comments to clarify things ;-) Cc: Alan Stern Cc: Will Deacon Cc: Ming Lei Cc: Christoph Hellwig Cc: Andrea Parri Cc: Boqun Feng Cc: Bart Van Assche Cc: "Paul E. McKenney" Fixes: 538b75341835 ("blk-mq: request deadline must be visible before marking rq as started") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- block/blk-mq.c | 52 ++++++++++++++++++++++++++++++++++++++++------------ block/blk-timeout.c | 2 +- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7f01d69879d66..a7b46b754a7c8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -596,22 +596,32 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); - /* - * Ensure that ->deadline is visible before set the started - * flag and clear the completed flag. - */ - smp_mb__before_atomic(); + WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); /* * Mark us as started and clear complete. Complete might have been * set if requeue raced with timeout, which then marked it as * complete. So be sure to clear complete again when we start * the request, otherwise we'll ignore the completion event. + * + * Ensure that ->deadline is visible before we set STARTED, such that + * blk_mq_check_expired() is guaranteed to observe our ->deadline when + * it observes STARTED. */ - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + smp_wmb(); + set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + /* + * Coherence order guarantees these consecutive stores to a + * single variable propagate in the specified order. Thus the + * clear_bit() is ordered _after_ the set bit. See + * blk_mq_check_expired(). + * + * (the bits must be part of the same byte for this to be + * true). + */ clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + } if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -781,10 +791,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; + unsigned long deadline; if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; + /* + * Ensures that if we see STARTED we must also see our + * up-to-date deadline, see blk_mq_start_request(). + */ + smp_rmb(); + + deadline = READ_ONCE(rq->deadline); + /* * The rq being checked may have been freed and reallocated * out already here, we avoid this race by checking rq->deadline @@ -798,11 +817,20 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * and clearing the flag in blk_mq_start_request(), so * this rq won't be timed out too. */ - if (time_after_eq(jiffies, rq->deadline)) { - if (!blk_mark_rq_complete(rq)) + if (time_after_eq(jiffies, deadline)) { + if (!blk_mark_rq_complete(rq)) { + /* + * Again coherence order ensures that consecutive reads + * from the same variable must be in that order. This + * ensures that if we see COMPLETE clear, we must then + * see STARTED set and we'll ignore this timeout. 
+ * + * (There's also the MB implied by the test_and_clear()) + */ blk_mq_rq_timed_out(rq, reserved); - } else if (!data->next_set || time_after(data->next, rq->deadline)) { - data->next = rq->deadline; + } + } else if (!data->next_set || time_after(data->next, deadline)) { + data->next = deadline; data->next_set = 1; } } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 17ec83bb09002..e3e9c9771d361 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -211,7 +211,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - req->deadline = jiffies + req->timeout; + WRITE_ONCE(req->deadline, jiffies + req->timeout); /* * Only the non-mq case needs to add the request to a protected list. -- cgit v1.2.3 From fc13457f74dcf054b0d17efb7b94b46fdf17f412 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Oct 2017 11:22:24 -0600 Subject: blk-mq: document the need to have STARTED and COMPLETED share a byte For memory ordering guarantees on stores, we need to ensure that these two bits share the same byte of storage in the unsigned long. Add a comment as to why, and a BUILD_BUG_ON() to ensure that we don't violate this requirement. Suggested-by: Boqun Feng Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++++ block/blk.h | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index a7b46b754a7c8..076cbab9c3e0f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2923,6 +2923,12 @@ EXPORT_SYMBOL_GPL(blk_mq_poll); static int __init blk_mq_init(void) { + /* + * See comment in block/blk.h rq_atomic_flags enum + */ + BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != + (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; diff --git a/block/blk.h b/block/blk.h index fda5a4632aba3..6ac43dfd68a7d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -122,8 +122,15 @@ void blk_account_io_done(struct request *req); * Internal atomic flags for request handling */ enum rq_atomic_flags { + /* + * Keep these two bits first - not because we depend on the + * value of them, but we do depend on them being in the same + * byte of storage to ensure ordering on writes. Keeping them + * first will achieve that nicely. + */ REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, + REQ_ATOM_POLL_SLEPT, }; -- cgit v1.2.3 From 85009b4f5f0399669a44f07cb9a5622c0e71d419 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 02:09:06 -0600 Subject: writeback: eliminate work item allocation in bd_start_writeback() Handle start-all writeback like we do periodic or kupdate style writeback - by marking the bdi_writeback as needing a full flush, and simply waking the thread. This eliminates the need to allocate and queue a specific work item just for this purpose. After this change, we truly only ever have one of them running at any point in time. We mark the need to start all flushes, and the writeback thread will clear it once it has processed the request. 
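The coalescing described above boils down to a single "start all" bit: the trigger path sets it (and only the first setter wakes the worker), and the worker clears it after acting on it. A minimal user-space sketch of that pattern follows; the names are illustrative only and are not the kernel symbols:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool start_all;           /* analogue of WB_start_all in wb->state */

/* trigger side: only the first caller actually wakes the worker */
static bool request_start_all(void)
{
        if (atomic_load(&start_all) ||
            atomic_exchange(&start_all, true))
                return false;           /* a full flush is already pending */
        printf("wake worker\n");        /* stands in for wb_wakeup(wb) */
        return true;
}

/* worker side: do the work, then clear the flag so new requests are seen */
static void worker_check_start_all(void)
{
        if (!atomic_load(&start_all))
                return;
        printf("write back everything\n");
        atomic_store(&start_all, false);
}

int main(void)
{
        request_start_all();            /* first request wins */
        request_start_all();            /* coalesced, no extra work item */
        worker_check_start_all();
        return 0;
}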
Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 71 +++++++++++++++++++--------------------- include/linux/backing-dev-defs.h | 23 +++++++++++++ include/linux/writeback.h | 22 ------------- include/trace/events/writeback.h | 1 - 4 files changed, 57 insertions(+), 60 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 399619c975679..9e24d604c59c6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,7 +53,6 @@ struct wb_writeback_work { unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ - unsigned int start_all:1; /* nr_pages == 0 (all) writeback */ enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -947,8 +946,6 @@ static unsigned long get_nr_dirty_pages(void) static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) { - struct wb_writeback_work *work; - if (!wb_has_dirty_io(wb)) return; @@ -958,35 +955,14 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) * high frequency, causing pointless allocations of tons of * work items and keeping the flusher threads busy retrieving * that work. Ensure that we only allow one of them pending and - * inflight at the time. It doesn't matter if we race a little - * bit on this, so use the faster separate test/set bit variants. + * inflight at the time. */ - if (test_bit(WB_start_all, &wb->state)) + if (test_bit(WB_start_all, &wb->state) || + test_and_set_bit(WB_start_all, &wb->state)) return; - set_bit(WB_start_all, &wb->state); - - /* - * This is WB_SYNC_NONE writeback, so if allocation fails just - * wakeup the thread for old dirty data writeback - */ - work = kzalloc(sizeof(*work), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (!work) { - clear_bit(WB_start_all, &wb->state); - trace_writeback_nowork(wb); - wb_wakeup(wb); - return; - } - - work->sync_mode = WB_SYNC_NONE; - work->nr_pages = wb_split_bdi_pages(wb, get_nr_dirty_pages()); - work->range_cyclic = 1; - work->reason = reason; - work->auto_free = 1; - work->start_all = 1; - - wb_queue_work(wb, work); + wb->start_all_reason = reason; + wb_wakeup(wb); } /** @@ -1838,14 +1814,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) list_del_init(&work->list); } spin_unlock_bh(&wb->work_lock); - - /* - * Once we start processing a work item that had !nr_pages, - * clear the wb state bit for that so we can allow more. 
- */ - if (work && work->start_all) - clear_bit(WB_start_all, &wb->state); - return work; } @@ -1901,6 +1869,30 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; } +static long wb_check_start_all(struct bdi_writeback *wb) +{ + long nr_pages; + + if (!test_bit(WB_start_all, &wb->state)) + return 0; + + nr_pages = get_nr_dirty_pages(); + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = wb_split_bdi_pages(wb, nr_pages), + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + .reason = wb->start_all_reason, + }; + + nr_pages = wb_writeback(wb, &work); + } + + clear_bit(WB_start_all, &wb->state); + return nr_pages; +} + + /* * Retrieve work items and do the writeback they describe */ @@ -1916,6 +1908,11 @@ static long wb_do_writeback(struct bdi_writeback *wb) finish_writeback_work(wb, work); } + /* + * Check for a flush-everything request + */ + wrote += wb_check_start_all(wb); + /* * Check for periodic writeback, kupdated() style */ diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 420de5c7c7f9a..b7c7be6f5986a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -44,6 +44,28 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +/* + * why some writeback work was initiated + */ +enum wb_reason { + WB_REASON_BACKGROUND, + WB_REASON_VMSCAN, + WB_REASON_SYNC, + WB_REASON_PERIODIC, + WB_REASON_LAPTOP_TIMER, + WB_REASON_FREE_MORE_MEM, + WB_REASON_FS_FREE_SPACE, + /* + * There is no bdi forker thread any more and works are done + * by emergency worker, however, this is TPs userland visible + * and we'll be exposing exactly the same information, + * so it has a mismatch name. + */ + WB_REASON_FORKER_THREAD, + + WB_REASON_MAX, +}; + /* * For cgroup writeback, multiple wb's may map to the same blkcg. Those * wb's can operate mostly independently but should share the congested @@ -116,6 +138,7 @@ struct bdi_writeback { struct fprop_local_percpu completions; int dirty_exceeded; + enum wb_reason start_all_reason; spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9c0091678af48..dd1d2c23f7431 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -41,28 +41,6 @@ enum writeback_sync_modes { WB_SYNC_ALL, /* Wait on every mapping */ }; -/* - * why some writeback work was initiated - */ -enum wb_reason { - WB_REASON_BACKGROUND, - WB_REASON_VMSCAN, - WB_REASON_SYNC, - WB_REASON_PERIODIC, - WB_REASON_LAPTOP_TIMER, - WB_REASON_FREE_MORE_MEM, - WB_REASON_FS_FREE_SPACE, - /* - * There is no bdi forker thread any more and works are done - * by emergency worker, however, this is TPs userland visible - * and we'll be exposing exactly the same information, - * so it has a mismatch name. - */ - WB_REASON_FORKER_THREAD, - - WB_REASON_MAX, -}; - /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. 
They are always initialised diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 9b57f014d79d0..19a0ea08e0989 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -286,7 +286,6 @@ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) -DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, -- cgit v1.2.3 From b35bd0d9f8a8ea17aae40893e18274d191a2d2c5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 23:39:05 -0600 Subject: sysctl: remove /proc/sys/vm/nr_pdflush_threads This tunable has been obsolete since 2.6.32, and writes to the file have been failing and complaining in dmesg since then: nr_pdflush_threads exported in /proc is scheduled for removal That was 8 years ago. Remove the file ABI obsolete notice, and the sysfs file. Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads | 5 ----- kernel/sysctl.c | 5 ----- 2 files changed, 10 deletions(-) delete mode 100644 Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads diff --git a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads deleted file mode 100644 index b0b0eeb20fe34..0000000000000 --- a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads +++ /dev/null @@ -1,5 +0,0 @@ -What: /proc/sys/vm/nr_pdflush_threads -Date: June 2012 -Contact: Wanpeng Li -Description: Since pdflush is replaced by per-BDI flusher, the interface of old pdflush - exported in /proc/sys/vm/ should be removed. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157f..a5dd8d82c2538 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1344,11 +1344,6 @@ static struct ctl_table vm_table[] = { .proc_handler = dirtytime_interval_handler, .extra1 = &zero, }, - { - .procname = "nr_pdflush_threads", - .mode = 0444 /* read-only */, - .proc_handler = pdflush_proc_obsolete, - }, { .procname = "swappiness", .data = &vm_swappiness, -- cgit v1.2.3 From 469d0ef06debe29ec60350b9976a520c4ad5a1e8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 26 Sep 2017 21:50:45 -0700 Subject: nvme-fc: move remote port get/put/free location move nvme_fc_rport_get/put and rport free to higher in the file to avoid adding prototypes to resolve references in upcoming code additions Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 78 +++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2dc9932e48182..b8e0822127c1a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -489,6 +489,45 @@ nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport, kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp); } +static void +nvme_fc_free_rport(struct kref *ref) +{ + struct nvme_fc_rport *rport = + container_of(ref, struct nvme_fc_rport, ref); + struct nvme_fc_lport *lport = + localport_to_lport(rport->remoteport.localport); + unsigned long flags; + + WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED); + WARN_ON(!list_empty(&rport->ctrl_list)); + + /* remove from lport list */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_del(&rport->endp_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + /* let the LLDD know we've finished tearing it down */ + 
lport->ops->remoteport_delete(&rport->remoteport); + + ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); + + kfree(rport); + + nvme_fc_lport_put(lport); +} + +static void +nvme_fc_rport_put(struct nvme_fc_rport *rport) +{ + kref_put(&rport->ref, nvme_fc_free_rport); +} + +static int +nvme_fc_rport_get(struct nvme_fc_rport *rport) +{ + return kref_get_unless_zero(&rport->ref); +} + /** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME @@ -568,45 +607,6 @@ out_reghost_failed: } EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport); -static void -nvme_fc_free_rport(struct kref *ref) -{ - struct nvme_fc_rport *rport = - container_of(ref, struct nvme_fc_rport, ref); - struct nvme_fc_lport *lport = - localport_to_lport(rport->remoteport.localport); - unsigned long flags; - - WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED); - WARN_ON(!list_empty(&rport->ctrl_list)); - - /* remove from lport list */ - spin_lock_irqsave(&nvme_fc_lock, flags); - list_del(&rport->endp_list); - spin_unlock_irqrestore(&nvme_fc_lock, flags); - - /* let the LLDD know we've finished tearing it down */ - lport->ops->remoteport_delete(&rport->remoteport); - - ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); - - kfree(rport); - - nvme_fc_lport_put(lport); -} - -static void -nvme_fc_rport_put(struct nvme_fc_rport *rport) -{ - kref_put(&rport->ref, nvme_fc_free_rport); -} - -static int -nvme_fc_rport_get(struct nvme_fc_rport *rport) -{ - return kref_get_unless_zero(&rport->ref); -} - static int nvme_fc_abort_lsops(struct nvme_fc_rport *rport) { -- cgit v1.2.3 From 5fdee2127faa77c9c91862ad5e001dfab7013e92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 21:22:52 +0200 Subject: block: remove QUEUE_FLAG_STACKABLE We already have a queue_is_rq_based helper to check if a request_queue is request based, so we can remove the flag for it. 
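For reference, the existing helper mentioned above is roughly the following in kernels of this vintage (shown only for context, it is not touched by this patch):

/* a request_queue is request based if it has a legacy request_fn or is blk-mq */
static inline bool queue_is_rq_based(struct request_queue *q)
{
        return q->request_fn || q->mq_ops;
}

That is, it answers the same question QUEUE_FLAG_STACKABLE was being used for, directly from the queue's dispatch hooks.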
Acked-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/elevator.c | 2 +- drivers/md/dm-rq.c | 2 +- drivers/md/dm-table.c | 15 +-------------- drivers/md/dm.c | 11 ----------- include/linux/blkdev.h | 5 ----- 6 files changed, 3 insertions(+), 33 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 980e730956433..7f4a1ba532afc 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -54,7 +54,6 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), - QUEUE_FLAG_NAME(STACKABLE), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(DISCARD), diff --git a/block/elevator.c b/block/elevator.c index 153926a909011..7ae50eb2732bb 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1118,7 +1118,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!blk_queue_stackable(q)) + if (!queue_is_rq_based(q)) return sprintf(name, "none\n"); if (!q->elevator) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index eadfcfd106fff..9d32f25489c27 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return blk_queue_stackable(md->queue); + return queue_is_rq_based(md->queue); } static void dm_old_start_queue(struct request_queue *q) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ef7b8f201f73a..75281828f2cbc 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1000,7 +1000,7 @@ verify_rq_based: list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - if (!blk_queue_stackable(q)) { + if (!queue_is_rq_based(q)) { DMERR("table load rejected: including" " non-request-stackable devices"); return -EINVAL; @@ -1847,19 +1847,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, */ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); - - /* - * QUEUE_FLAG_STACKABLE must be set after all queue settings are - * visible to other CPUs because, once the flag is set, incoming bios - * are processed by request-based dm, which refers to the queue - * settings. - * Until the flag set, bios are passed to bio-based dm and queued to - * md->deferred where queue settings are not needed yet. - * Those bios are passed to request-based dm at the resume time. - */ - smp_mb(); - if (dm_table_request_based(t)) - queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6e54145969c5c..8d07ad61221c6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1612,17 +1612,6 @@ static void dm_wq_work(struct work_struct *work); void dm_init_md_queue(struct mapped_device *md) { - /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device may not have been decided yet. - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. 
- */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - /* * Initialize data that will only be used by a non-blk-mq DM queue * - must do so here (in alloc_dev callchain) before queue is used diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 02fa42d24b52f..9fb71fc7d0e85 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -609,7 +609,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ @@ -633,12 +632,10 @@ struct request_queue { #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_POLL)) @@ -722,8 +719,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_stackable(q) \ - test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) -- cgit v1.2.3 From 775d3a35dc3e13de55ec0e061c59e36faa7dd7f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Oct 2017 08:15:15 -0600 Subject: backing-dev: kill unused pdflush_proc_obsolete() After commit b35bd0d9f8a8, pdflush_proc_obsolete() is no longer used. Kill the function and declaration. 
Reported-by: Rakesh Pandit Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 -- mm/backing-dev.c | 20 -------------------- 2 files changed, 22 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 157e950a70dc1..872afa41abc2d 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -172,8 +172,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); -int pdflush_proc_obsolete(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e19606bb41a0b..74b52dfd5852d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1072,23 +1072,3 @@ out: return ret; } EXPORT_SYMBOL(wait_iff_congested); - -int pdflush_proc_obsolete(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - char kbuf[] = "0\n"; - - if (*ppos || *lenp < sizeof(kbuf)) { - *lenp = 0; - return 0; - } - - if (copy_to_user(buffer, kbuf, sizeof(kbuf))) - return -EFAULT; - pr_warn_once("%s exported in /proc is scheduled for removal\n", - table->procname); - - *lenp = 2; - *ppos += *lenp; - return 2; -} -- cgit v1.2.3 From 4b14a5c5d57f4fd6929db3427ba4d7c3775b4680 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Thu, 5 Oct 2017 14:09:20 -0400 Subject: block: remove unnecessary NULL checks in bioset_integrity_free() mempool_destroy() already checks for a NULL value being passed in, this eliminates duplicate checks. This was caught by running make coccicheck M=block/ on linus' tree on commit 77ede3a014a32746002f7889211f0cecf4803163 (current head as of this patch). Reviewed-by: Kyle Fortin Acked-by: Martin K. Petersen Signed-off-by: Tim Hansen Signed-off-by: Jens Axboe --- block/bio-integrity.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5df32907ff3bf..23b42e8aa03e7 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -485,11 +485,8 @@ EXPORT_SYMBOL(bioset_integrity_create); void bioset_integrity_free(struct bio_set *bs) { - if (bs->bio_integrity_pool) - mempool_destroy(bs->bio_integrity_pool); - - if (bs->bvec_integrity_pool) - mempool_destroy(bs->bvec_integrity_pool); + mempool_destroy(bs->bio_integrity_pool); + mempool_destroy(bs->bvec_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); -- cgit v1.2.3 From 4078def82f352cf5007691635da290a109511bc5 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Fri, 6 Oct 2017 14:45:13 -0400 Subject: block/bio: Remove null checks before mempool_destroy in bioset_free This patch removes redundant checks for null values on bio_pool and bvec_pool. Found using make coccicheck M=block/ on linux-net tree on the next-20170929 tag. 
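The guards are redundant because mempool_destroy(), like kfree(), is defined to be a no-op when passed NULL. A small stand-alone analogue of that convention (plain C, not kernel code) shows why callers can drop the check:

#include <stdlib.h>

struct pool { void *slab; };

/* destroy helpers that tolerate NULL let callers drop the guard entirely */
static void pool_destroy(struct pool *p)
{
        if (!p)                 /* mirrors the NULL check inside the destroy helper */
                return;
        free(p->slab);
        free(p);
}

int main(void)
{
        struct pool *maybe_null = NULL;

        /* before: if (maybe_null) pool_destroy(maybe_null); */
        pool_destroy(maybe_null);       /* after: safe with or without the check */
        return 0;
}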
Signed-off-by: Tim Hansen Signed-off-by: Jens Axboe --- block/bio.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index 8338304ea2563..bf0dbe8f78f8e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1928,11 +1928,8 @@ void bioset_free(struct bio_set *bs) if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); - if (bs->bio_pool) - mempool_destroy(bs->bio_pool); - - if (bs->bvec_pool) - mempool_destroy(bs->bvec_pool); + mempool_destroy(bs->bio_pool); + mempool_destroy(bs->bvec_pool); bioset_integrity_free(bs); bio_put_slab(bs); -- cgit v1.2.3 From 94af584692091347baea4d810b9fc6e0f5483d42 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 10 Oct 2017 05:44:13 +0800 Subject: writeback: schedule periodic writeback with sysctl After disabling periodic writeback by writing 0 to dirty_writeback_centisecs, the handler wb_workfn() will not be entered again until the dirty background limit is reached, a sync syscall is executed, not enough free memory is available, or vmscan is triggered. So periodic writeback cannot be re-enabled by writing a non-zero value to dirty_writeback_centisecs. As it can be disabled via sysctl, it should be possible to enable it via sysctl as well. Reviewed-by: Jan Kara Signed-off-by: Yafang Shao Signed-off-by: Jens Axboe --- mm/page-writeback.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8d1fc593bce8f..622a18c114caa 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1972,8 +1972,14 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); - return 0; + unsigned int old_interval = dirty_writeback_interval; + int ret; + + ret = proc_dointvec(table, write, buffer, length, ppos); + if (!ret && !old_interval && dirty_writeback_interval) + wakeup_flusher_threads(WB_REASON_PERIODIC); + + return ret; } #ifdef CONFIG_BLOCK -- cgit v1.2.3 From b5dc5d4d1f4ff9032eb6c21a3c571a1317dc9289 Mon Sep 17 00:00:00 2001 From: Luca Miccio Date: Mon, 9 Oct 2017 16:27:21 +0200 Subject: block,bfq: Disable writeback throttling Similarly to CFQ, BFQ has its own write-throttling heuristics, and it is better not to combine them with further write-throttling heuristics of a different nature. So this commit disables write-back throttling for a device if BFQ is used as the I/O scheduler for that device.
Signed-off-by: Luca Miccio Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 ++- block/blk-wbt.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 70f9177c4f5b2..261f98695910f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -108,6 +108,7 @@ #include "blk-mq-tag.h" #include "blk-mq-sched.h" #include "bfq-iosched.h" +#include "blk-wbt.h" #define BFQ_BFQQ_FNS(name) \ void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ @@ -4810,7 +4811,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - + wbt_disable_default(q); return 0; out_free: diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6a9a0f03a67bd..e59d59c11ebbb 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -654,7 +654,7 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) } /* - * Disable wbt, if enabled by default. Only called from CFQ. + * Disable wbt, if enabled by default. */ void wbt_disable_default(struct request_queue *q) { -- cgit v1.2.3 From 99fead8d38e5302b1be9403d4de815ce9174a3df Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 9 Oct 2017 13:11:23 +0200 Subject: block, bfq: fix unbalanced decrements of burst size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit "block, bfq: decrease burst size when queues in burst exit" introduced the decrement of burst_size on the removal of a bfq_queue from the burst list. Unfortunately, this decrement can happen to be performed even when burst size is already equal to 0, because of unbalanced decrements. A description follows of the cause of these unbalanced decrements, namely a wrong assumption, and of the way how this wrong assumption leads to unbalanced decrements. The wrong assumption is that a bfq_queue can exit only if the process associated with the bfq_queue has exited. This is false, because a bfq_queue, say Q, may exit also as a consequence of a merge with another bfq_queue. In this case, Q exits because the I/O of its associated process has been redirected to another bfq_queue. The decrement unbalance occurs because Q may then be re-created after a split, and added back to the current burst list, *without* incrementing burst_size. burst_size is not incremented because Q is not a new bfq_queue added to the burst list, but a bfq_queue only temporarily removed from the list, and, before the commit "bfq-sq, bfq-mq: decrease burst size when queues in burst exit", burst_size was not decremented when Q was removed. This commit addresses this issue by just checking whether the exiting bfq_queue is a merged bfq_queue, and, in that case, not decrementing burst_size. Unfortunately, this still leaves room for unbalanced decrements, in the following rarer case: on a split, the bfq_queue happens to be inserted into a different burst list than that it was removed from when merged. If this happens, the number of elements in the new burst list becomes higher than burst_size (by one). When the bfq_queue then exits, it is of course not in a merged state any longer, thus burst_size is decremented, which results in an unbalanced decrement. To handle this sporadic, unlucky case in a simple way, this commit also checks that burst_size is larger than 0 before decrementing it. 
Finally, this commit removes an useless, extra check: the check that the bfq_queue is sync, performed before checking whether the bfq_queue is in the burst list. This extra check is redundant, because only sync bfq_queues can be inserted into the burst list. Fixes: 7cb04004fa37 ("block, bfq: decrease burst size when queues in burst exit") Reported-by: Philip Müller Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Tested-by: Philip Müller Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 261f98695910f..889a8549d97f3 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3726,9 +3726,36 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; - if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { + if (!hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); - bfqq->bfqd->burst_size--; + /* + * Decrement also burst size after the removal, if the + * process associated with bfqq is exiting, and thus + * does not contribute to the burst any longer. This + * decrement helps filter out false positives of large + * bursts, when some short-lived process (often due to + * the execution of commands by some service) happens + * to start and exit while a complex application is + * starting, and thus spawning several processes that + * do I/O (and that *must not* be treated as a large + * burst, see comments on bfq_handle_burst). + * + * In particular, the decrement is performed only if: + * 1) bfqq is not a merged queue, because, if it is, + * then this free of bfqq is not triggered by the exit + * of the process bfqq is associated with, but exactly + * by the fact that bfqq has just been merged. + * 2) burst_size is greater than 0, to handle + * unbalanced decrements. Unbalanced decrements may + * happen in te following case: bfqq is inserted into + * the current burst list--without incrementing + * bust_size--because of a split, but the current + * burst list is not the burst list bfqq belonged to + * (see comments on the case of a split in + * bfq_set_request). + */ + if (bfqq->bic && bfqq->bfqd->burst_size > 0) + bfqq->bfqd->burst_size--; } kmem_cache_free(bfq_pool, bfqq); @@ -4460,6 +4487,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, else { bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) + /* + * If bfqq was in the current + * burst list before being + * merged, then we have to add + * it back. And we do not need + * to increase burst_size, as + * we did not decrement + * burst_size when we removed + * bfqq from the burst list as + * a consequence of a merge + * (see comments in + * bfq_put_queue). In this + * respect, it would be rather + * costly to know whether the + * current burst list is still + * the same burst list from + * which bfqq was removed on + * the merge. To avoid this + * cost, if bfqq was in a + * burst list, then we add + * bfqq to the current burst + * list without any further + * check. This can cause + * inappropriate insertions, + * but rarely enough to not + * harm the detection of large + * bursts significantly. 
+ */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } -- cgit v1.2.3 From 8264c3214f28b52b399d9e03bfa7feec275a0d71 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Mon, 9 Oct 2017 13:34:41 +0300 Subject: writeback: merge try_to_writeback_inodes_sb_nr() into caller Since commit 925a6efb8ff0c ("Btrfs: stop using try_to_writeback_inodes_sb_nr to flush delalloc") this function hasn't been used outside so stop exporting it. In addition we merge it into try_to_writeback_inodes_sb() which is the only caller. Also change return type of try_to_writeback_inodes_sb to void as the only user ext4 doesn't care. Reviewed-by: Jan Kara Signed-off-by: Rakesh Pandit Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 30 ++++++------------------------ include/linux/writeback.h | 4 +--- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9e24d604c59c6..08f5debd07d10 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2376,37 +2376,19 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) EXPORT_SYMBOL(writeback_inodes_sb); /** - * try_to_writeback_inodes_sb_nr - try to start writeback if none underway + * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock - * @nr: the number of pages to write - * @reason: the reason of writeback + * @reason: reason why some writeback work was initiated * - * Invoke writeback_inodes_sb_nr if no writeback is currently underway. - * Returns 1 if writeback was started, 0 if not. + * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. */ -bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, - enum wb_reason reason) +void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { if (!down_read_trylock(&sb->s_umount)) - return false; + return; - __writeback_inodes_sb_nr(sb, nr, reason, true); + __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); up_read(&sb->s_umount); - return true; -} -EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); - -/** - * try_to_writeback_inodes_sb - try to start writeback if none underway - * @sb: the superblock - * @reason: reason why some writeback work was initiated - * - * Implement by try_to_writeback_inodes_sb_nr() - * Returns 1 if writeback was started, 0 if not. 
- */ -bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) -{ - return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(try_to_writeback_inodes_sb); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index dd1d2c23f7431..e15ec14085ade 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -163,9 +163,7 @@ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); -bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); -bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, - enum wb_reason reason); +void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, -- cgit v1.2.3 From 58a9edce0aa912640abe47d3fc039e6230ef848b Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Tue, 10 Oct 2017 22:53:46 +0800 Subject: blkcg: check pol->cpd_free_fn before free cpd check pol->cpd_free_fn() instead of pol->cpd_alloc_fn() when free cpd. Reviewed-by: Johannes Thumshirn Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d3f56baee9366..e7ec676043b1c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1452,7 +1452,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) return 0; err_free_cpds: - if (pol->cpd_alloc_fn) { + if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); @@ -1492,7 +1492,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_alloc_fn) { + if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); -- cgit v1.2.3 From 53cfdc10a95d03fbc82970d682a32696d19ef886 Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Tue, 10 Oct 2017 11:13:32 +0800 Subject: blk-throttle: fix null pointer dereference while throttling writeback IOs A null pointer dereference can occur when blkcg is removed manually with writeback IOs inflight. This is caused by the following case: Writeback kworker submit the bio and set bio->bi_cg_private to tg in blk_throtl_assoc_bio. Then we remove the block cgroup manually, the blkg and tg would be freed if there is no request inflight. When the submitted bio come back, blk_throtl_bio_endio() fetch the tg which was already freed. Fix this by increasing the refcount of blkg in funcion blk_throtl_assoc_bio() so that the blkg will not be freed until the bio_endio called. 
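The fix follows the standard rule for asynchronous completions: any object the completion handler will dereference must hold a reference for the entire lifetime of the in-flight bio, taken at submission and dropped on every exit from the endio path. A minimal sketch of that pairing follows; struct tracked_obj and its helpers are hypothetical stand-ins for the blkg/throtl_grp handling in the patch, not the patch itself.

	/* Hypothetical example type; the blkg plays this role in the patch below. */
	struct tracked_obj {
		struct kref ref;
	};

	static void tracked_release(struct kref *ref)
	{
		kfree(container_of(ref, struct tracked_obj, ref));
	}

	static void tracked_endio(struct bio *bio)
	{
		struct tracked_obj *obj = bio->bi_private;

		/* Safe to touch obj: the reference taken at submission keeps it alive. */
		kref_put(&obj->ref, tracked_release);	/* drop on every completion path */
	}

	static void tracked_submit(struct bio *bio, struct tracked_obj *obj)
	{
		kref_get(&obj->ref);		/* reference owned by the in-flight bio */
		bio->bi_private = obj;
		bio->bi_end_io = tracked_endio;
		submit_bio(bio);
	}
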
Reviewed-by: Shaohua Li Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- block/blk-throttle.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0fea76aa0f3ff..fe49c465ec860 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2112,8 +2112,12 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (bio->bi_css) + if (bio->bi_css) { + if (bio->bi_cg_private) + blkg_put(tg_to_blkg(bio->bi_cg_private)); bio->bi_cg_private = tg; + blkg_get(tg_to_blkg(tg)); + } blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); #endif } @@ -2283,8 +2287,10 @@ void blk_throtl_bio_endio(struct bio *bio) start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; finish_time = __blk_stat_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) + if (!start_time || finish_time <= start_time) { + blkg_put(tg_to_blkg(tg)); return; + } lat = finish_time - start_time; /* this is only for bio based driver */ @@ -2314,6 +2320,8 @@ void blk_throtl_bio_endio(struct bio *bio) tg->bio_cnt /= 2; tg->bad_bio_cnt /= 2; } + + blkg_put(tg_to_blkg(tg)); } #endif -- cgit v1.2.3 From eca8b53a6769e60d6d8240d71202d73b0af81901 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:55:59 -0700 Subject: blk-stat: delete useless code Fix two issues: - the per-cpu stat flush is unnecessary, nobody uses per-cpu stat except sum it to global stat. We can do the calculation there. The flush just wastes cpu time. - some fields are signed int/s64. I don't see the point. Reviewed-by: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-stat.c | 45 +++++++-------------------------------------- include/linux/blk_types.h | 5 ++--- 2 files changed, 9 insertions(+), 41 deletions(-) diff --git a/block/blk-stat.c b/block/blk-stat.c index c52356d90fe38..3a2f3c96f3672 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -11,8 +11,6 @@ #include "blk-mq.h" #include "blk.h" -#define BLK_RQ_STAT_BATCH 64 - struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; @@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; - stat->batch = stat->nr_batch = 0; -} - -static void blk_stat_flush_batch(struct blk_rq_stat *stat) -{ - const s32 nr_batch = READ_ONCE(stat->nr_batch); - const s32 nr_samples = READ_ONCE(stat->nr_samples); - - if (!nr_batch) - return; - if (!nr_samples) - stat->mean = div64_s64(stat->batch, nr_batch); - else { - stat->mean = div64_s64((stat->mean * nr_samples) + - stat->batch, - nr_batch + nr_samples); - } - - stat->nr_samples += nr_batch; - stat->nr_batch = stat->batch = 0; + stat->batch = 0; } +/* src is a per-cpu stat, mean isn't initialized */ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { - blk_stat_flush_batch(src); - if (!src->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); - if (!dst->nr_samples) - dst->mean = src->mean; - else { - dst->mean = div64_s64((src->mean * src->nr_samples) + - (dst->mean * dst->nr_samples), - dst->nr_samples + src->nr_samples); - } + dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, + dst->nr_samples + src->nr_samples); + dst->nr_samples += src->nr_samples; } @@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 
value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); - - if (stat->batch + value < stat->batch || - stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) - blk_stat_flush_batch(stat); - stat->batch += value; - stat->nr_batch++; + stat->nr_samples++; } void blk_stat_add(struct request *rq) @@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq) struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket; - s64 now, value; + u64 now, value; now = __blk_stat_time(ktime_to_ns(ktime_get())); if (now < blk_stat_time(&rq->issue_stat)) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a2d2aa709cef4..3385c89f402e2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -329,11 +329,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) } struct blk_rq_stat { - s64 mean; + u64 mean; u64 min; u64 max; - s32 nr_samples; - s32 nr_batch; + u32 nr_samples; u64 batch; }; -- cgit v1.2.3 From 85acb3ba2f925a0ec6928c1967c3adefa00682f4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:56:00 -0700 Subject: block: set request_list for request Legacy queue sets request's request_list, mq doesn't. This makes mq does the same thing, so we can find cgroup of a request. Note, we really only use blkg field of request_list, it's pointless to allocate mempool for request_list in mq case. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 14f7674fa0b14..e8e149ccc86b7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -718,7 +718,7 @@ static void free_request_size(void *element, void *data) int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { - if (unlikely(rl->rq_pool)) + if (unlikely(rl->rq_pool) || q->mq_ops) return 0; rl->q = q; diff --git a/block/blk-mq.c b/block/blk-mq.c index 076cbab9c3e0f..40cba1b1978f2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -481,6 +481,9 @@ void blk_mq_free_request(struct request *rq) wbt_done(q->rq_wb, &rq->issue_stat); + if (blk_rq_rl(rq)) + blk_put_rl(blk_rq_rl(rq)); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); if (rq->tag != -1) @@ -1532,6 +1535,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { blk_init_request_from_bio(rq, bio); + blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); + blk_account_io_start(rq, true); } -- cgit v1.2.3 From 7f66721a7d5bf6251f9c49aada9200c61ddcecc5 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Thu, 12 Oct 2017 19:58:10 +0300 Subject: fs/block_dev: remove vfs_msg() interface Replaced by pr_err usage in commit ef51042472f5 ("block, dax: move "select DAX" from BLOCK to FS_DAX") Signed-off-by: Rakesh Pandit Acked-by: Ross Zwisler Signed-off-by: Jens Axboe --- fs/block_dev.c | 12 ------------ include/linux/blkdev.h | 11 ----------- 2 files changed, 23 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 93d088ffc05c6..07ddccd178017 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -54,18 +54,6 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); -void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 
-{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); - va_end(args); -} - static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9fb71fc7d0e85..72637028f3c9f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -917,17 +917,6 @@ static inline void rq_flush_dcache_pages(struct request *rq) } #endif -#ifdef CONFIG_PRINTK -#define vfs_msg(sb, level, fmt, ...) \ - __vfs_msg(sb, level, fmt, ##__VA_ARGS__) -#else -#define vfs_msg(sb, level, fmt, ...) \ -do { \ - no_printk(fmt, ##__VA_ARGS__); \ - __vfs_msg(sb, "", " "); \ -} while (0) -#endif - extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); -- cgit v1.2.3 From 47bc227deedbcf3ac214a2d922c28dfa5e403f09 Mon Sep 17 00:00:00 2001 From: Christos Gkekas Date: Thu, 12 Oct 2017 23:15:54 +0100 Subject: mtip32xx: Clean up unused variables Remove variables that are set but never used. Signed-off-by: Christos Gkekas Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 4a3cfc7940dec..b8af7352a18f5 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -887,12 +887,9 @@ static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) static bool mtip_pause_ncq(struct mtip_port *port, struct host_to_dev_fis *fis) { - struct host_to_dev_fis *reply; unsigned long task_file_data; - reply = port->rxfis + RX_FIS_D2H_REG; task_file_data = readl(port->mmio+PORT_TFDATA); - if ((task_file_data & 1)) return false; @@ -1020,7 +1017,6 @@ static int mtip_exec_internal_command(struct mtip_port *port, .opts = opts }; int rv = 0; - unsigned long start; /* Make sure the buffer is 8 byte aligned. This is asic specific. */ if (buffer & 0x00000007) { @@ -1057,7 +1053,6 @@ static int mtip_exec_internal_command(struct mtip_port *port, /* Copy the command to the command table */ memcpy(int_cmd->command, fis, fis_len*4); - start = jiffies; rq->timeout = timeout; /* insert request and run queue */ @@ -3015,7 +3010,6 @@ static int mtip_hw_init(struct driver_data *dd) { int i; int rv; - unsigned int num_command_slots; unsigned long timeout, timetaken; dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; @@ -3025,7 +3019,6 @@ static int mtip_hw_init(struct driver_data *dd) rv = -EIO; goto out1; } - num_command_slots = dd->slot_groups * 32; hba_setup(dd); -- cgit v1.2.3 From 900148296b78c61aa8c443dc594c0da968c3be53 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:50 +0200 Subject: lightnvm: prevent target type module removal when in use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If target type module e.g. pblk here is unloaded (rmmod) while module is in use (after creating target) system crashes. We fix this by using module API refcnt. 
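Holding a reference on the providing module for as long as a created target exists is what turns the crash into a clean "module is in use" failure from rmmod. A rough sketch of the pattern follows; tgt_create()/tgt_destroy() are illustrative stand-ins for nvm_create_tgt()/__nvm_remove_target(), and the patch itself uses __module_get() on the create path, whereas try_module_get() is the variant to use when the module might already be unloading.

	/* Sketch: pin the target-type module while an instance created by it exists. */
	static int tgt_create(struct nvm_tgt_type *tt)
	{
		if (!try_module_get(tt->owner))
			return -ENODEV;		/* module is being unloaded */
		/* ... allocate and register the target; on failure: module_put(tt->owner) ... */
		return 0;
	}

	static void tgt_destroy(struct nvm_tgt_type *tt)
	{
		/* Tear the target down first, then release the module reference. */
		module_put(tt->owner);
	}
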
Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 4 ++++ drivers/lightnvm/pblk-init.c | 1 + include/linux/lightnvm.h | 1 + 3 files changed, 6 insertions(+) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ddae430b6eae8..60e163be5a89a 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -316,6 +317,8 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) list_add_tail(&t->list, &dev->targets); mutex_unlock(&dev->mlock); + __module_get(tt->owner); + return 0; err_sysfs: if (tt->exit) @@ -351,6 +354,7 @@ static void __nvm_remove_target(struct nvm_target *t) nvm_remove_tgt_dev(t->dev, 1); put_disk(tdisk); + module_put(t->type->owner); list_del(&t->list); kfree(t); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 1b0f61233c216..6df65d14a2c52 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -1044,6 +1044,7 @@ static struct nvm_tgt_type tt_pblk = { .sysfs_init = pblk_sysfs_init, .sysfs_exit = pblk_sysfs_exit, + .owner = THIS_MODULE, }; static int __init pblk_module_init(void) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7dfa56ebbc6d0..7b80ac911d26e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -460,6 +460,7 @@ struct nvm_tgt_type { /* For internal use */ struct list_head list; + struct module *owner; }; extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); -- cgit v1.2.3 From bb6aa6f08268bbce4e0185b18cab9e04505d6695 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:51 +0200 Subject: lightnvm: prevent bd removal if busy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a virtual block device is formatted and mounted after creating with "nvme lnvm create... -t pblk", a removal from "nvm lnvm remove" would result in this: 446416.309757] bdi-block not registered [446416.309773] ------------[ cut here ]------------ [446416.309780] WARNING: CPU: 3 PID: 4319 at fs/fs-writeback.c:2159 __mark_inode_dirty+0x268/0x340 Ideally removal should return -EBUSY as block device is mounted after formatting. This patch tries to address this checking if whole device or any partition of it already mounted or not before removal. Whole device is checked using "bd_super" member of block device. This member is always set once block device has been mounted using a filesystem. Another member "bd_part_count" takes care of checking any if any partitions are under use. "bd_part_count" is only updated under locks when partitions are opened or closed (first open and last release). This at least does take care sending -EBUSY if removal is being attempted while whole block device or any partition is mounted. 
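Stripped of error handling, the busy test takes a reference on the whole-disk block_device, inspects the two fields described above, and always drops the reference again. A condensed sketch (the patch below additionally fails the removal with -ENOMEM when bdget_disk() returns NULL):

	struct block_device *bdev = bdget_disk(t->disk, 0);
	bool busy = bdev && (bdev->bd_super || bdev->bd_part_count);

	if (bdev)
		bdput(bdev);
	if (busy)
		return -EBUSY;	/* mounted filesystem or open partition */
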
Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 60e163be5a89a..c490711cf0f4a 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -373,6 +373,7 @@ static void __nvm_remove_target(struct nvm_target *t) static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) { struct nvm_target *t; + struct block_device *bdev; mutex_lock(&dev->mlock); t = nvm_find_target(dev, remove->tgtname); @@ -380,6 +381,19 @@ static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) mutex_unlock(&dev->mlock); return 1; } + bdev = bdget_disk(t->disk, 0); + if (!bdev) { + pr_err("nvm: removal failed, allocating bd failed\n"); + mutex_unlock(&dev->mlock); + return -ENOMEM; + } + if (bdev->bd_super || bdev->bd_part_count) { + pr_err("nvm: removal failed, block device busy\n"); + bdput(bdev); + mutex_unlock(&dev->mlock); + return -EBUSY; + } + bdput(bdev); __nvm_remove_target(t); mutex_unlock(&dev->mlock); -- cgit v1.2.3 From 88d31ea2676696ad0802a361c8b824f0762fa34c Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:52 +0200 Subject: lightnvm: protect target type list with correct locks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nvm_tgt_types list was protected by wrong lock for NVM_INFO ioctl call and can race with addition or removal of target types. Also unregistering target type was not protected correctly. Fixes: 5cd907853 ("lightnvm: remove nested lock conflict with mm") Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index c490711cf0f4a..ee2b6d7719901 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -589,9 +589,9 @@ void nvm_unregister_tgt_type(struct nvm_tgt_type *tt) if (!tt) return; - down_write(&nvm_lock); + down_write(&nvm_tgtt_lock); list_del(&tt->list); - up_write(&nvm_lock); + up_write(&nvm_tgtt_lock); } EXPORT_SYMBOL(nvm_unregister_tgt_type); @@ -1195,7 +1195,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) info->version[1] = NVM_VERSION_MINOR; info->version[2] = NVM_VERSION_PATCH; - down_write(&nvm_lock); + down_write(&nvm_tgtt_lock); list_for_each_entry(tt, &nvm_tgt_types, list) { struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; @@ -1208,7 +1208,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) } info->tgtsize = tgt_iter; - up_write(&nvm_lock); + up_write(&nvm_tgtt_lock); if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { kfree(info); -- cgit v1.2.3 From a96d50fa0c8d3e399c49a0a90ddbbaabf8a46bb3 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:53 +0200 Subject: lightnvm: remove already calculated nr_chnls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove repeated calculation for number of channels while creating a target device. 
Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ee2b6d7719901..798964f511cd0 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -139,7 +139,6 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, int prev_nr_luns; int i, j; - nr_chnls = nr_luns / dev->geo.luns_per_chnl; nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1; dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); -- cgit v1.2.3 From c9d84b350f9b253872fc24f4dcfea166c884ee15 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:54 +0200 Subject: lightnvm: pblk: fix error path in pblk_lines_alloc_metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use appropriate memory free calls based on allocation type used and also fix number of times free is called if kmalloc fails. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 6df65d14a2c52..05665a7e648ca 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -630,7 +630,10 @@ static int pblk_lines_alloc_metadata(struct pblk *pblk) fail_free_emeta: while (--i >= 0) { - vfree(l_mg->eline_meta[i]->buf); + if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META) + vfree(l_mg->eline_meta[i]->buf); + else + kfree(l_mg->eline_meta[i]->buf); kfree(l_mg->eline_meta[i]); } -- cgit v1.2.3 From 32c662c58a9b9d0c99e713a14ca323a9a91c73a0 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:55 +0200 Subject: lightnvm: include NVM Express driver if OCSSD is selected for build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because NVM needs BLK_DEV_NVME, select it automatically if we mark NVM in config file before building kernel. Also append PCI to depends as select doesn't automatically add dependencies. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index ead61a93cb4eb..2a953efec4e19 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -4,7 +4,8 @@ menuconfig NVM bool "Open-Channel SSD target support" - depends on BLOCK && HAS_DMA + depends on BLOCK && HAS_DMA && PCI + select BLK_DEV_NVME help Say Y here to get to enable Open-channel SSDs. -- cgit v1.2.3 From e57903fd972a398b7140d0bc055714e13a0e58c5 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:56 +0200 Subject: lightnvm: pblk: protect line bitmap while submitting meta io MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems pblk_dealloc_page would race against pblk_alloc_pages for line bitmap for sector allocation.The chances are very low but might as well protect the bitmap properly. 
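The invariant behind the fix is simply that every walk-and-modify of line->map_bitmap has to run under the same line->lock that the allocation path takes; sketched below in condensed form (locking only, the bit manipulation is as in the patch):

	spin_lock(&line->lock);
	addr = find_next_zero_bit(line->map_bitmap, pblk->lm.sec_per_line,
				  line->cur_sec);
	/* ... clear the claimed bits and rewind line->cur_sec ... */
	spin_unlock(&line->lock);
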
Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 81501644fb158..b53bb00a99188 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -486,12 +486,14 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) u64 addr; int i; + spin_lock(&line->lock); addr = find_next_zero_bit(line->map_bitmap, pblk->lm.sec_per_line, line->cur_sec); line->cur_sec = addr - nr_secs; for (i = 0; i < nr_secs; i++, line->cur_sec--) WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); + spin_unlock(&line->lock); } u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -- cgit v1.2.3 From 4e76af53e132bff9e2b94f018457fadabf5ab419 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:57 +0200 Subject: lightnvm: pblk: fix message if L2P MAP is in device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This usually happens if we are developing with qemu and ll2pmode has default value. Improve description. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 05665a7e648ca..8c85779e9635e 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -914,7 +914,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, int ret; if (dev->identity.dom & NVM_RSP_L2P) { - pr_err("pblk: device-side L2P table not supported. (%x)\n", + pr_err("pblk: host-side L2P table not supported. (%x)\n", dev->identity.dom); return ERR_PTR(-EINVAL); } -- cgit v1.2.3 From c5493845b7b303315118fb4ab96654bf7cb897f0 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:58 +0200 Subject: lightnvm: pblk: improve error message if down_timeout fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two pr_err messages are useless as they don't differentiate error code. 
Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index b53bb00a99188..027c42bb1ab99 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1691,16 +1691,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, #endif ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); - if (ret) { - switch (ret) { - case -ETIME: - pr_err("pblk: lun semaphore timed out\n"); - break; - case -EINTR: - pr_err("pblk: lun semaphore timed out\n"); - break; - } - } + if (ret == -ETIME || ret == -EINTR) + pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret); } void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) -- cgit v1.2.3 From c79819bc0877e4cbed8013b1abc9697e8805b21b Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:59 +0200 Subject: lightnvm: pblk: print incompatible line version correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correct it by converting little endian to cpu endian and also define a macro for line version so that maintenance is easy. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 +- drivers/lightnvm/pblk-recovery.c | 4 ++-- drivers/lightnvm/pblk.h | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 027c42bb1ab99..8536d38ef97e3 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -978,7 +978,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); smeta_buf->header.id = cpu_to_le32(line->id); smeta_buf->header.type = cpu_to_le16(line->type); - smeta_buf->header.version = cpu_to_le16(1); + smeta_buf->header.version = SMETA_VERSION; /* Start metadata */ smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index cb556e06673e7..caf1242795757 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -900,9 +900,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) continue; - if (le16_to_cpu(smeta_buf->header.version) != 1) { + if (smeta_buf->header.version != SMETA_VERSION) { pr_err("pblk: found incompatible line version %u\n", - smeta_buf->header.version); + le16_to_cpu(smeta_buf->header.version)); return ERR_PTR(-EINVAL); } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 67e623bd5c2df..9ece409993fec 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -310,6 +310,7 @@ enum { }; #define PBLK_MAGIC 0x70626c6b /*pblk*/ +#define SMETA_VERSION cpu_to_le16(1) struct line_header { __le32 crc; -- cgit v1.2.3 From 32825ebb06fafeff463ed23e9d0dea459ebd30fe Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:00 +0200 Subject: lightnvm: pblk: reuse pblk_gc_should_kick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a trivial change which reuses pblk_gc_should_kick instead of repeating it again in pblk_rl_free_lines_inc. 
Signed-off-by: Rakesh Pandit Made it apply to the common case. Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 -- drivers/lightnvm/pblk-rl.c | 33 +++++++++------------------------ drivers/lightnvm/pblk.h | 1 - 3 files changed, 9 insertions(+), 27 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 8536d38ef97e3..a68c6ae536e59 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1591,8 +1591,6 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) spin_unlock(&line->lock); spin_unlock(&l_mg->gc_lock); - - pblk_gc_should_kick(pblk); } void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 2e6a5361baf0e..9565c3bc4d0bb 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -96,9 +96,11 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) * * Only the total number of free blocks is used to configure the rate limiter. */ -static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) +static void pblk_rl_update_rates(struct pblk_rl *rl) { + struct pblk *pblk = container_of(rl, struct pblk, rl); unsigned long free_blocks = pblk_rl_nr_free_blks(rl); + int max = rl->rb_budget; if (free_blocks >= rl->high) { rl->rb_user_max = max; @@ -124,23 +126,18 @@ static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) rl->rb_state = PBLK_RL_LOW; } - return rl->rb_state; + if (rl->rb_state == (PBLK_RL_MID | PBLK_RL_LOW)) + pblk_gc_should_start(pblk); + else + pblk_gc_should_stop(pblk); } void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) { - struct pblk *pblk = container_of(rl, struct pblk, rl); int blk_in_line = atomic_read(&line->blk_in_line); - int ret; atomic_add(blk_in_line, &rl->free_blocks); - /* Rates will not change that often - no need to lock update */ - ret = pblk_rl_update_rates(rl, rl->rb_budget); - - if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) - pblk_gc_should_start(pblk); - else - pblk_gc_should_stop(pblk); + pblk_rl_update_rates(rl); } void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) @@ -148,19 +145,7 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) int blk_in_line = atomic_read(&line->blk_in_line); atomic_sub(blk_in_line, &rl->free_blocks); -} - -void pblk_gc_should_kick(struct pblk *pblk) -{ - struct pblk_rl *rl = &pblk->rl; - int ret; - - /* Rates will not change that often - no need to lock update */ - ret = pblk_rl_update_rates(rl, rl->rb_budget); - if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) - pblk_gc_should_start(pblk); - else - pblk_gc_should_stop(pblk); + pblk_rl_update_rates(rl); } int pblk_rl_high_thrs(struct pblk_rl *rl) diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 9ece409993fec..3a07c5b61a0cd 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -824,7 +824,6 @@ int pblk_gc_init(struct pblk *pblk); void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); -void pblk_gc_should_kick(struct pblk *pblk); void pblk_gc_kick(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); -- cgit v1.2.3 From a1121176ff757e3c073490a69608ea0b18a00ec1 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:01 +0200 Subject: lightnvm: pblk: initialize debug stat counter 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initialize the stat counter for garbage collected reads. Fixes: a4bd217b43268 ("lightnvm: physical block device (pblk) target") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 8c85779e9635e..83445115a922e 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -947,6 +947,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, atomic_long_set(&pblk->recov_writes, 0); atomic_long_set(&pblk->recov_writes, 0); atomic_long_set(&pblk->recov_gc_writes, 0); + atomic_long_set(&pblk->recov_gc_reads, 0); #endif atomic_long_set(&pblk->read_failed, 0); -- cgit v1.2.3 From 7d327a9ed6c4dca341ebf99012e0a6b80a3050e6 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:02 +0200 Subject: lightnvm: pblk: use right flag for GC allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The data buffer for the GC path allocates virtual memory through vmalloc. When this change was introduced, a flag signaling kmalloc'ed memory was wrongly introduced. Use the right flag when creating a bio from this buffer. Fixes: de54e703a422 ("lightnvm: pblk: use vmalloc for GC data buffer") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d682e89e64935..ee8efb55b3307 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -499,7 +499,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, data_len = (*secs_to_gc) * geo->sec_size; bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len, - PBLK_KMALLOC_META, GFP_KERNEL); + PBLK_VMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); goto err_free_dma; @@ -519,7 +519,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, if (ret) { bio_endio(bio); pr_err("pblk: GC read request failed\n"); - goto err_free_dma; + goto err_free_bio; } if (!wait_for_completion_io_timeout(&wait, @@ -541,10 +541,13 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, atomic_long_sub(*secs_to_gc, &pblk->inflight_reads); #endif + bio_put(bio); out: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return NVM_IO_OK; +err_free_bio: + bio_put(bio); err_free_dma: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return NVM_IO_ERR; -- cgit v1.2.3 From cd8ddbf7a5e206fe6995ab0aee245d597dd6a7f2 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:03 +0200 Subject: lightnvm: pblk: free padded entries in write buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a REQ_FLUSH reaches pblk, the bio cannot be directly completed. Instead, data on the write buffer is flushed and the bio is completed on the completion pah. This might require some sectors to be padded in order to guarantee a successful write. This patch fixes a memory leak on the padded pages. A consequence of this bad free was that internal bios not containing data (only a flush) were not being completed. 
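The leak is an unbalanced pair: pad pages are appended to the bio at submission time but were not released on the write-completion path. A hedged sketch of the intended pairing, using the driver's own helpers (pad pages occupy the bio slots from nr_valid up to nr_valid + nr_padded - 1):

	/* Submission side: append nr_padded blank pages after the nr_valid user pages. */
	pblk_bio_add_pages(pblk, bio, GFP_KERNEL, nr_padded);

	/* Completion side: free exactly those pages, starting at the first padded index. */
	if (c_ctx->nr_padded)
		pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, c_ctx->nr_padded);
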
Fixes: a4bd217b4326 ("lightnvm: physical block device (pblk) target") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 1 - drivers/lightnvm/pblk-write.c | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index a68c6ae536e59..9299a5a75a187 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -190,7 +190,6 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, WARN_ON(off + nr_pages != bio->bi_vcnt); - bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE); for (i = off; i < nr_pages + off; i++) { bv = bio->bi_io_vec[i]; mempool_free(bv.bv_page, pblk->page_pool); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 3ad9e56d24734..d89ac573f8d83 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -33,6 +33,10 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, bio_endio(original_bio); } + if (c_ctx->nr_padded) + pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, + c_ctx->nr_padded); + #ifdef CONFIG_NVM_DEBUG atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes); #endif @@ -521,7 +525,8 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) struct bio *bio = rqd->bio; if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, bio, rqd->nr_ppas, c_ctx->nr_padded); + pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid, + c_ctx->nr_padded); } static int pblk_submit_write(struct pblk *pblk) -- cgit v1.2.3 From e0e12a707f02fcde1b77a2417d9fb0ae1ce3b003 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:04 +0200 Subject: lightnvm: pblk: fix write I/O sync stat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix stat counter to collect the right number of I/Os being synced on the completion path. Fixes: 0880a9aa2d91f ("lightnvm: pblk: delete redundant buffer pointer") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index d89ac573f8d83..d82ca8bd83908 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -38,7 +38,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, c_ctx->nr_padded); #ifdef CONFIG_NVM_DEBUG - atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes); + atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); #endif ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); -- cgit v1.2.3 From da67e68fb9d37fb9072b20cc75d4337a73bc01b4 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:05 +0200 Subject: lightnvm: pblk: avoid deadlock on low LUN config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On low LUN configurations, make sure not to send bios that are bigger than the buffer size. 
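The deadlock scenario is that a bio larger than the entire write buffer can never be absorbed in one go, so the submitter would wait forever for space that cannot become available all at once; the guard therefore caps user I/O at a limit derived from the buffer budget and splits anything bigger. Condensed sketch (rb_max_io is set to half the budget at init time in the patch below):

	if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
		blk_queue_split(q, &bio);	/* split rather than stall on buffer space */
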
Fixes: a4bd217b4326 ("lightnvm: physical block device (pblk) target") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 +- drivers/lightnvm/pblk-rl.c | 6 ++++++ drivers/lightnvm/pblk.h | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 83445115a922e..eee4eeb47d077 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -46,7 +46,7 @@ static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, * user I/Os. Unless stalled, the rate limiter leaves at least 256KB * available for user I/O. */ - if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl))) + if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl)) blk_queue_split(q, &bio); return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 9565c3bc4d0bb..0896439a91b0f 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -163,6 +163,11 @@ int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) return rl->rb_user_max; } +int pblk_rl_max_io(struct pblk_rl *rl) +{ + return rl->rb_max_io; +} + static void pblk_rl_u_timer(unsigned long data) { struct pblk_rl *rl = (struct pblk_rl *)data; @@ -199,6 +204,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) /* To start with, all buffer is available to user I/O writers */ rl->rb_budget = budget; rl->rb_user_max = budget; + rl->rb_max_io = budget >> 1; rl->rb_gc_max = 0; rl->rb_state = PBLK_RL_HIGH; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 3a07c5b61a0cd..b592e5194b0f2 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -267,6 +267,7 @@ struct pblk_rl { int rb_gc_max; /* Max buffer entries available for GC I/O */ int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ int rb_state; /* Rate-limiter current state */ + int rb_max_io; /* Maximum size for an I/O giving the config */ atomic_t rb_user_cnt; /* User I/O buffer counter */ atomic_t rb_gc_cnt; /* GC I/O buffer counter */ @@ -844,6 +845,7 @@ int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); +int pblk_rl_max_io(struct pblk_rl *rl); void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left); -- cgit v1.2.3 From bd432417681a224d9fa4a9d43be7d4edc82135b2 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:06 +0200 Subject: lightnvm: pblk: fix min size for page mempool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk uses an internal page mempool for allocating pages on internal bios. The main two users of this memory pool are partial reads (reads with some sectors in cache and some on media) and padded writes, which need to add dummy pages to an existing bio already containing valid data (and with a large enough bioset allocated). In both cases, the maximum number of pages per bio is defined by the maximum number of physical sectors supported by the underlying device. 
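The general rule is that a mempool only guarantees forward progress if its reserved element count covers the single largest consumer, so the reserve has to track the biggest bio the device will ever accept rather than an arbitrary constant. A minimal sketch of that sizing rule, using the helpers the patch below switches to:

	/* Reserve one page per sector of the largest internal bio the device reports. */
	int max_pages = nvm_max_phys_sects(dev);

	pool = mempool_create_page_pool(max_pages, 0);
	if (!pool)
		return -ENOMEM;
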
This patch fixes a bad mempool allocation, where the min_nr of elements on the pool was fixed (to 16), which is lower than the maximum number of sectors supported by NVMe (as of the time for this patch). Instead, use the maximum number of allowed sectors reported by the device. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 6 +++--- drivers/lightnvm/pblk-init.c | 15 ++++++++------- drivers/lightnvm/pblk-read.c | 2 +- drivers/lightnvm/pblk.h | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 9299a5a75a187..f5fbb9a46784c 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -192,7 +192,7 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, for (i = off; i < nr_pages + off; i++) { bv = bio->bi_io_vec[i]; - mempool_free(bv.bv_page, pblk->page_pool); + mempool_free(bv.bv_page, pblk->page_bio_pool); } } @@ -204,14 +204,14 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, int i, ret; for (i = 0; i < nr_pages; i++) { - page = mempool_alloc(pblk->page_pool, flags); + page = mempool_alloc(pblk->page_bio_pool, flags); if (!page) goto err; ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); if (ret != PBLK_EXPOSED_PAGE_SIZE) { pr_err("pblk: could not add page to bio\n"); - mempool_free(page, pblk->page_pool); + mempool_free(page, pblk->page_bio_pool); goto err; } } diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index eee4eeb47d077..7b1f29c713381 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -132,7 +132,6 @@ static int pblk_rwb_init(struct pblk *pblk) } /* Minimum pages needed within a lun */ -#define PAGE_POOL_SIZE 16 #define ADDR_POOL_SIZE 64 static int pblk_set_ppaf(struct pblk *pblk) @@ -247,14 +246,16 @@ static int pblk_core_init(struct pblk *pblk) if (pblk_init_global_caches(pblk)) return -ENOMEM; - pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0); - if (!pblk->page_pool) + /* internal bios can be at most the sectors signaled by the device. 
*/ + pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev), + 0); + if (!pblk->page_bio_pool) return -ENOMEM; pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE, pblk_blk_ws_cache); if (!pblk->line_ws_pool) - goto free_page_pool; + goto free_page_bio_pool; pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); if (!pblk->rec_pool) @@ -309,8 +310,8 @@ free_rec_pool: mempool_destroy(pblk->rec_pool); free_blk_ws_pool: mempool_destroy(pblk->line_ws_pool); -free_page_pool: - mempool_destroy(pblk->page_pool); +free_page_bio_pool: + mempool_destroy(pblk->page_bio_pool); return -ENOMEM; } @@ -322,7 +323,7 @@ static void pblk_core_free(struct pblk *pblk) if (pblk->bb_wq) destroy_workqueue(pblk->bb_wq); - mempool_destroy(pblk->page_pool); + mempool_destroy(pblk->page_bio_pool); mempool_destroy(pblk->line_ws_pool); mempool_destroy(pblk->rec_pool); mempool_destroy(pblk->g_rq_pool); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index ee8efb55b3307..402c732f0970a 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -238,7 +238,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, kunmap_atomic(src_p); kunmap_atomic(dst_p); - mempool_free(src_bv.bv_page, pblk->page_pool); + mempool_free(src_bv.bv_page, pblk->page_bio_pool); hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1); } while (hole < nr_secs); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index b592e5194b0f2..229f6020ad8ad 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -620,7 +620,7 @@ struct pblk { struct list_head compl_list; - mempool_t *page_pool; + mempool_t *page_bio_pool; mempool_t *line_ws_pool; mempool_t *rec_pool; mempool_t *g_rq_pool; -- cgit v1.2.3 From b84ae4a8b883b96b95fff0e3979ff2c65bbf96b0 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:07 +0200 Subject: lightnvm: pblk: simplify work_queue mempool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In pblk, we have a mempool to allocate a generic structure that we pass along workqueues. This is heavily used in the GC path in order to have enough inflight reads and fully utilize the GC bandwidth. However, the current GC path copies data to the host memory and puts it back into the write buffer. This requires a vmalloc allocation for the data and a memory copy. Thus, guaranteeing the allocation by using a mempool for the structure in itself does not give us much. Until we implement support for vector copy to avoid moving data through the host, just allocate the workqueue structure using kmalloc. This allows us to have a much smaller mempool. 
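The rule of thumb applied here: a mempool earns its permanently reserved memory only on paths that must make forward progress even when allocations fail (typically writeback under memory pressure); a GC read that can simply be retried later does not qualify, so a plain allocation with normal error handling is enough. Hedged sketch of the simplified GC-side allocation:

	struct pblk_line_ws *gc_rq_ws = kmalloc(sizeof(*gc_rq_ws), GFP_KERNEL);

	if (!gc_rq_ws)
		goto fail_free_gc_rq;	/* unwind earlier allocations; GC can revisit the line later */
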
Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 13 +++++++------ drivers/lightnvm/pblk-gc.c | 32 ++++++++++++++++---------------- drivers/lightnvm/pblk-init.c | 32 ++++++++++++++++---------------- drivers/lightnvm/pblk-write.c | 4 ++-- drivers/lightnvm/pblk.h | 11 ++++++----- 5 files changed, 47 insertions(+), 45 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index f5fbb9a46784c..b92532211866d 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -33,7 +33,8 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", line->id, pos); - pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, pblk->bb_wq); + pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, + GFP_ATOMIC, pblk->bb_wq); } static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) @@ -1623,7 +1624,7 @@ void pblk_line_close_ws(struct work_struct *work) struct pblk_line *line = line_ws->line; pblk_line_close(pblk, line); - mempool_free(line_ws, pblk->line_ws_pool); + mempool_free(line_ws, pblk->gen_ws_pool); } void pblk_line_mark_bb(struct work_struct *work) @@ -1648,16 +1649,16 @@ void pblk_line_mark_bb(struct work_struct *work) } kfree(ppa); - mempool_free(line_ws, pblk->line_ws_pool); + mempool_free(line_ws, pblk->gen_ws_pool); } -void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), +void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, + void (*work)(struct work_struct *), gfp_t gfp_mask, struct workqueue_struct *wq) { struct pblk_line_ws *line_ws; - line_ws = mempool_alloc(pblk->line_ws_pool, GFP_ATOMIC); + line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask); if (!line_ws) return; diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 6090d28f7995a..f163829ecca82 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -136,12 +136,12 @@ static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) static void pblk_gc_line_ws(struct work_struct *work) { - struct pblk_line_ws *line_rq_ws = container_of(work, + struct pblk_line_ws *gc_rq_ws = container_of(work, struct pblk_line_ws, ws); - struct pblk *pblk = line_rq_ws->pblk; + struct pblk *pblk = gc_rq_ws->pblk; struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line = line_rq_ws->line; - struct pblk_gc_rq *gc_rq = line_rq_ws->priv; + struct pblk_line *line = gc_rq_ws->line; + struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; up(&gc->gc_sem); @@ -151,7 +151,7 @@ static void pblk_gc_line_ws(struct work_struct *work) gc_rq->nr_secs); } - mempool_free(line_rq_ws, pblk->line_ws_pool); + kfree(gc_rq_ws); } static void pblk_gc_line_prepare_ws(struct work_struct *work) @@ -164,7 +164,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) struct pblk_line_meta *lm = &pblk->lm; struct pblk_gc *gc = &pblk->gc; struct line_emeta *emeta_buf; - struct pblk_line_ws *line_rq_ws; + struct pblk_line_ws *gc_rq_ws; struct pblk_gc_rq *gc_rq; __le64 *lba_list; int sec_left, nr_secs, bit; @@ -223,19 +223,19 @@ next_rq: gc_rq->nr_secs = nr_secs; gc_rq->line = line; - line_rq_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); - if (!line_rq_ws) + gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); + if (!gc_rq_ws) goto fail_free_gc_rq; - line_rq_ws->pblk = pblk; - line_rq_ws->line = line; - line_rq_ws->priv = 
gc_rq; + gc_rq_ws->pblk = pblk; + gc_rq_ws->line = line; + gc_rq_ws->priv = gc_rq; down(&gc->gc_sem); kref_get(&line->ref); - INIT_WORK(&line_rq_ws->ws, pblk_gc_line_ws); - queue_work(gc->gc_line_reader_wq, &line_rq_ws->ws); + INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); + queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws); sec_left -= nr_secs; if (sec_left > 0) @@ -243,7 +243,7 @@ next_rq: out: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); - mempool_free(line_ws, pblk->line_ws_pool); + kfree(line_ws); kref_put(&line->ref, pblk_line_put); atomic_dec(&gc->inflight_gc); @@ -256,7 +256,7 @@ fail_free_emeta: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); pblk_put_line_back(pblk, line); kref_put(&line->ref, pblk_line_put); - mempool_free(line_ws, pblk->line_ws_pool); + kfree(line_ws); atomic_dec(&gc->inflight_gc); pr_err("pblk: Failed to GC line %d\n", line->id); @@ -269,7 +269,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); - line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); + line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); if (!line_ws) return -ENOMEM; diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 7b1f29c713381..3405522535802 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -20,7 +20,7 @@ #include "pblk.h" -static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, +static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, *pblk_w_rq_cache, *pblk_line_meta_cache; static DECLARE_RWSEM(pblk_lock); struct bio_set *pblk_bio_set; @@ -184,9 +184,9 @@ static int pblk_init_global_caches(struct pblk *pblk) char cache_name[PBLK_CACHE_NAME_LEN]; down_write(&pblk_lock); - pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws", + pblk_ws_cache = kmem_cache_create("pblk_blk_ws", sizeof(struct pblk_line_ws), 0, 0, NULL); - if (!pblk_blk_ws_cache) { + if (!pblk_ws_cache) { up_write(&pblk_lock); return -ENOMEM; } @@ -194,7 +194,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_rec_cache = kmem_cache_create("pblk_rec", sizeof(struct pblk_rec_ctx), 0, 0, NULL); if (!pblk_rec_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); up_write(&pblk_lock); return -ENOMEM; } @@ -202,7 +202,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, 0, 0, NULL); if (!pblk_g_rq_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); up_write(&pblk_lock); return -ENOMEM; @@ -211,7 +211,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, 0, 0, NULL); if (!pblk_w_rq_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); up_write(&pblk_lock); @@ -223,7 +223,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_line_meta_cache = kmem_cache_create(cache_name, pblk->lm.sec_bitmap_len, 0, 0, NULL); if (!pblk_line_meta_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); @@ -246,20 +246,20 @@ static int pblk_core_init(struct pblk *pblk) if (pblk_init_global_caches(pblk)) return -ENOMEM; - /* internal bios can be at most the 
sectors signaled by the device. */ + /* Internal bios can be at most the sectors signaled by the device. */ pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev), 0); if (!pblk->page_bio_pool) return -ENOMEM; - pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE, - pblk_blk_ws_cache); - if (!pblk->line_ws_pool) + pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE, + pblk_ws_cache); + if (!pblk->gen_ws_pool) goto free_page_bio_pool; pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); if (!pblk->rec_pool) - goto free_blk_ws_pool; + goto free_gen_ws_pool; pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE, pblk_g_rq_cache); @@ -308,8 +308,8 @@ free_g_rq_pool: mempool_destroy(pblk->g_rq_pool); free_rec_pool: mempool_destroy(pblk->rec_pool); -free_blk_ws_pool: - mempool_destroy(pblk->line_ws_pool); +free_gen_ws_pool: + mempool_destroy(pblk->gen_ws_pool); free_page_bio_pool: mempool_destroy(pblk->page_bio_pool); return -ENOMEM; @@ -324,13 +324,13 @@ static void pblk_core_free(struct pblk *pblk) destroy_workqueue(pblk->bb_wq); mempool_destroy(pblk->page_bio_pool); - mempool_destroy(pblk->line_ws_pool); + mempool_destroy(pblk->gen_ws_pool); mempool_destroy(pblk->rec_pool); mempool_destroy(pblk->g_rq_pool); mempool_destroy(pblk->w_rq_pool); mempool_destroy(pblk->line_meta_pool); - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index d82ca8bd83908..c73b17bca06b0 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -198,8 +198,8 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); if (sync == emeta->nr_entries) - pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws, - pblk->close_wq); + pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, + GFP_ATOMIC, pblk->close_wq); bio_put(rqd->bio); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 229f6020ad8ad..efaa781abb06c 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -40,7 +40,6 @@ #define PBLK_MAX_REQ_ADDRS (64) #define PBLK_MAX_REQ_ADDRS_PW (6) -#define PBLK_WS_POOL_SIZE (128) #define PBLK_META_POOL_SIZE (128) #define PBLK_READ_REQ_POOL_SIZE (1024) @@ -61,6 +60,8 @@ #define ERASE 2 /* READ = 0, WRITE = 1 */ +#define PBLK_GEN_WS_POOL_SIZE (2) + enum { /* IO Types */ PBLK_IOTYPE_USER = 1 << 0, @@ -621,7 +622,7 @@ struct pblk { struct list_head compl_list; mempool_t *page_bio_pool; - mempool_t *line_ws_pool; + mempool_t *gen_ws_pool; mempool_t *rec_pool; mempool_t *g_rq_pool; mempool_t *w_rq_pool; @@ -725,9 +726,9 @@ void pblk_line_close_meta_sync(struct pblk *pblk); void pblk_line_close_ws(struct work_struct *work); void pblk_pipeline_stop(struct pblk *pblk); void pblk_line_mark_bb(struct work_struct *work); -void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), - struct workqueue_struct *wq); +void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, + void (*work)(struct work_struct *), gfp_t gfp_mask, + struct workqueue_struct *wq); u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line); int 
pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, -- cgit v1.2.3 From 0d880398cb6254ab3e110e2a8a659da65a56ffee Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:08 +0200 Subject: lightnvm: pblk: decouple read/erase mempools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since read and erase paths offer different guarantees for inflight I/Os, separate the mempools to set the right min_nr for each on creation. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 8 ++++---- drivers/lightnvm/pblk-init.c | 22 +++++++++++++++------- drivers/lightnvm/pblk.h | 5 +++-- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index b92532211866d..0c22e5ccdfdd9 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -64,7 +64,7 @@ static void pblk_end_io_erase(struct nvm_rq *rqd) struct pblk *pblk = rqd->private; __pblk_end_io_erase(pblk, rqd); - mempool_free(rqd, pblk->g_rq_pool); + mempool_free(rqd, pblk->e_rq_pool); } void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, @@ -161,7 +161,7 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; } else { - pool = pblk->g_rq_pool; + pool = pblk->r_rq_pool; rq_size = pblk_g_rq_size; } @@ -178,7 +178,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) if (rw == WRITE) pool = pblk->w_rq_pool; else - pool = pblk->g_rq_pool; + pool = pblk->r_rq_pool; mempool_free(rqd, pool); } @@ -1479,7 +1479,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) struct nvm_rq *rqd; int err; - rqd = mempool_alloc(pblk->g_rq_pool, GFP_KERNEL); + rqd = mempool_alloc(pblk->e_rq_pool, GFP_KERNEL); memset(rqd, 0, pblk_g_rq_size); pblk_setup_e_rq(pblk, rqd, ppa); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 3405522535802..2f8d3f9ffbafc 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -261,15 +261,20 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->rec_pool) goto free_gen_ws_pool; - pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE, + pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns, pblk_g_rq_cache); - if (!pblk->g_rq_pool) + if (!pblk->r_rq_pool) goto free_rec_pool; - pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns * 2, + pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns, + pblk_g_rq_cache); + if (!pblk->e_rq_pool) + goto free_r_rq_pool; + + pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns, pblk_w_rq_cache); if (!pblk->w_rq_pool) - goto free_g_rq_pool; + goto free_e_rq_pool; pblk->line_meta_pool = mempool_create_slab_pool(PBLK_META_POOL_SIZE, @@ -304,8 +309,10 @@ free_line_meta_pool: mempool_destroy(pblk->line_meta_pool); free_w_rq_pool: mempool_destroy(pblk->w_rq_pool); -free_g_rq_pool: - mempool_destroy(pblk->g_rq_pool); +free_e_rq_pool: + mempool_destroy(pblk->e_rq_pool); +free_r_rq_pool: + mempool_destroy(pblk->r_rq_pool); free_rec_pool: mempool_destroy(pblk->rec_pool); free_gen_ws_pool: @@ -326,7 +333,8 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->page_bio_pool); mempool_destroy(pblk->gen_ws_pool); mempool_destroy(pblk->rec_pool); - mempool_destroy(pblk->g_rq_pool); + mempool_destroy(pblk->r_rq_pool); + mempool_destroy(pblk->e_rq_pool); 
mempool_destroy(pblk->w_rq_pool); mempool_destroy(pblk->line_meta_pool); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index efaa781abb06c..419e1b7328e40 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -41,7 +41,6 @@ #define PBLK_MAX_REQ_ADDRS_PW (6) #define PBLK_META_POOL_SIZE (128) -#define PBLK_READ_REQ_POOL_SIZE (1024) #define PBLK_NR_CLOSE_JOBS (4) @@ -60,6 +59,7 @@ #define ERASE 2 /* READ = 0, WRITE = 1 */ +/* Static pool sizes */ #define PBLK_GEN_WS_POOL_SIZE (2) enum { @@ -624,8 +624,9 @@ struct pblk { mempool_t *page_bio_pool; mempool_t *gen_ws_pool; mempool_t *rec_pool; - mempool_t *g_rq_pool; + mempool_t *r_rq_pool; mempool_t *w_rq_pool; + mempool_t *e_rq_pool; mempool_t *line_meta_pool; struct workqueue_struct *close_wq; -- cgit v1.2.3 From e72ec1d31bcb6dffe325418c6d96f2fcab7c2654 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:09 +0200 Subject: lightnvm: pblk: do not use a mempool for line bitmaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk holds two sector bitmaps: one to keep track of the mapped sectors while the line is active and another one to keep track of the invalid sectors. The latter is kept during the whole live of the line, until it is recycled. Since we cannot guarantee forward progress for the mempool in this case, get rid of the mempool and simply allocate memory through kmalloc. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 26 ++++++++++---------------- drivers/lightnvm/pblk-init.c | 29 ++--------------------------- drivers/lightnvm/pblk-recovery.c | 2 +- drivers/lightnvm/pblk-write.c | 4 +--- drivers/lightnvm/pblk.h | 3 --- 5 files changed, 14 insertions(+), 50 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0c22e5ccdfdd9..215aadb84c6e0 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1095,25 +1095,21 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) struct pblk_line_meta *lm = &pblk->lm; int blk_in_line = atomic_read(&line->blk_in_line); - line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); + line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC); if (!line->map_bitmap) return -ENOMEM; - memset(line->map_bitmap, 0, lm->sec_bitmap_len); - /* invalid_bitmap is special since it is used when line is closed. 
No - * need to zeroized; it will be initialized using bb info form - * map_bitmap - */ - line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); + /* will be initialized using bb info from map_bitmap */ + line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_ATOMIC); if (!line->invalid_bitmap) { - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); return -ENOMEM; } spin_lock(&line->lock); if (line->state != PBLK_LINESTATE_FREE) { - mempool_free(line->invalid_bitmap, pblk->line_meta_pool); - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); + kfree(line->invalid_bitmap); spin_unlock(&line->lock); WARN(1, "pblk: corrupted line %d, state %d\n", line->id, line->state); @@ -1165,7 +1161,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) { - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); line->map_bitmap = NULL; line->smeta = NULL; line->emeta = NULL; @@ -1440,10 +1436,8 @@ retry_setup: void pblk_line_free(struct pblk *pblk, struct pblk_line *line) { - if (line->map_bitmap) - mempool_free(line->map_bitmap, pblk->line_meta_pool); - if (line->invalid_bitmap) - mempool_free(line->invalid_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); + kfree(line->invalid_bitmap); *line->vsc = cpu_to_le32(EMPTY_ENTRY); @@ -1584,7 +1578,7 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) list_add_tail(&line->list, move_list); - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); line->map_bitmap = NULL; line->smeta = NULL; line->emeta = NULL; diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 2f8d3f9ffbafc..4d719782f65b4 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -21,7 +21,7 @@ #include "pblk.h" static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, - *pblk_w_rq_cache, *pblk_line_meta_cache; + *pblk_w_rq_cache; static DECLARE_RWSEM(pblk_lock); struct bio_set *pblk_bio_set; @@ -181,8 +181,6 @@ static int pblk_set_ppaf(struct pblk *pblk) static int pblk_init_global_caches(struct pblk *pblk) { - char cache_name[PBLK_CACHE_NAME_LEN]; - down_write(&pblk_lock); pblk_ws_cache = kmem_cache_create("pblk_blk_ws", sizeof(struct pblk_line_ws), 0, 0, NULL); @@ -217,19 +215,6 @@ static int pblk_init_global_caches(struct pblk *pblk) up_write(&pblk_lock); return -ENOMEM; } - - snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s", - pblk->disk->disk_name); - pblk_line_meta_cache = kmem_cache_create(cache_name, - pblk->lm.sec_bitmap_len, 0, 0, NULL); - if (!pblk_line_meta_cache) { - kmem_cache_destroy(pblk_ws_cache); - kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_g_rq_cache); - kmem_cache_destroy(pblk_w_rq_cache); - up_write(&pblk_lock); - return -ENOMEM; - } up_write(&pblk_lock); return 0; @@ -276,16 +261,10 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->w_rq_pool) goto free_e_rq_pool; - pblk->line_meta_pool = - mempool_create_slab_pool(PBLK_META_POOL_SIZE, - pblk_line_meta_cache); - if (!pblk->line_meta_pool) - goto free_w_rq_pool; - pblk->close_wq = alloc_workqueue("pblk-close-wq", WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); if (!pblk->close_wq) - goto free_line_meta_pool; + goto free_w_rq_pool; pblk->bb_wq = alloc_workqueue("pblk-bb-wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); @@ -305,8 +284,6 @@ free_bb_wq: destroy_workqueue(pblk->bb_wq); free_close_wq: 
destroy_workqueue(pblk->close_wq); -free_line_meta_pool: - mempool_destroy(pblk->line_meta_pool); free_w_rq_pool: mempool_destroy(pblk->w_rq_pool); free_e_rq_pool: @@ -336,13 +313,11 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->r_rq_pool); mempool_destroy(pblk->e_rq_pool); mempool_destroy(pblk->w_rq_pool); - mempool_destroy(pblk->line_meta_pool); kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); - kmem_cache_destroy(pblk_line_meta_cache); } static void pblk_luns_free(struct pblk *pblk) diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index caf1242795757..de5270712be77 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -987,7 +987,7 @@ next: list_move_tail(&line->list, move_list); spin_unlock(&l_mg->gc_lock); - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); line->map_bitmap = NULL; line->smeta = NULL; line->emeta = NULL; diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index c73b17bca06b0..26c2b83451497 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -411,8 +411,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) if (emeta->mem >= lm->emeta_len[0]) { spin_lock(&l_mg->close_lock); list_del(&meta_line->list); - WARN(!bitmap_full(meta_line->map_bitmap, lm->sec_per_line), - "pblk: corrupt meta line %d\n", meta_line->id); spin_unlock(&l_mg->close_lock); } @@ -456,7 +454,7 @@ retry: return 0; } meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); - if (bitmap_full(meta_line->map_bitmap, lm->sec_per_line)) + if (meta_line->emeta->mem >= lm->emeta_len[0]) goto retry; spin_unlock(&l_mg->close_lock); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 419e1b7328e40..60edcda0fc7f7 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -40,8 +40,6 @@ #define PBLK_MAX_REQ_ADDRS (64) #define PBLK_MAX_REQ_ADDRS_PW (6) -#define PBLK_META_POOL_SIZE (128) - #define PBLK_NR_CLOSE_JOBS (4) #define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) @@ -627,7 +625,6 @@ struct pblk { mempool_t *r_rq_pool; mempool_t *w_rq_pool; mempool_t *e_rq_pool; - mempool_t *line_meta_pool; struct workqueue_struct *close_wq; struct workqueue_struct *bb_wq; -- cgit v1.2.3 From 2942f50fa389a62865572452dce6214a8aed69dc Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:10 +0200 Subject: lightnvm: pblk: remove checks on mempool alloc. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of the mempool audit on pblk, remove unnecessary mempool allocation checks on mempools. 
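For illustration only, a minimal sketch of the pattern these hunks drop (not taken verbatim from the diff): mempool_alloc() called with a sleeping gfp mask such as GFP_KERNEL waits for an element from the pool's pre-allocated reserve and therefore cannot return NULL, so the error branches are dead code.

        /*
         * Sketch, assuming a slab-backed mempool sized at init time:
         * a GFP_KERNEL allocation sleeps until an element is available
         * and never returns NULL, so no "if (!rqd)" branch is needed.
         */
        struct nvm_rq *rqd;

        rqd = mempool_alloc(pblk->w_rq_pool, GFP_KERNEL);
        memset(rqd, 0, pblk_w_rq_size);

The same reasoning applies to bio_alloc() with GFP_KERNEL, which is backed by a bio_set and does not fail for sleeping allocations, so the bio allocation checks can be removed as well.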
Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 4 ---- drivers/lightnvm/pblk-read.c | 8 -------- drivers/lightnvm/pblk-recovery.c | 35 +++++++---------------------------- drivers/lightnvm/pblk-write.c | 24 +++++------------------- 4 files changed, 12 insertions(+), 59 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 215aadb84c6e0..0da58869006b0 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -206,8 +206,6 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, for (i = 0; i < nr_pages; i++) { page = mempool_alloc(pblk->page_bio_pool, flags); - if (!page) - goto err; ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); if (ret != PBLK_EXPOSED_PAGE_SIZE) { @@ -1653,8 +1651,6 @@ void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, struct pblk_line_ws *line_ws; line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask); - if (!line_ws) - return; line_ws->pblk = pblk; line_ws->line = line; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 402c732f0970a..d2b6e2a7d7d50 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -168,10 +168,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, DECLARE_COMPLETION_ONSTACK(wait); new_bio = bio_alloc(GFP_KERNEL, nr_holes); - if (!new_bio) { - pr_err("pblk: could not alloc read bio\n"); - return NVM_IO_ERR; - } if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) goto err; @@ -321,10 +317,6 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) bitmap_zero(&read_bitmap, nr_secs); rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) { - pr_err_ratelimited("pblk: not able to alloc rqd"); - return NVM_IO_ERR; - } rqd->opcode = NVM_OP_PREAD; rqd->bio = bio; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index de5270712be77..6b6b4183b41e7 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -34,10 +34,6 @@ void pblk_submit_rec(struct work_struct *work) max_secs); bio = bio_alloc(GFP_KERNEL, nr_rec_secs); - if (!bio) { - pr_err("pblk: not able to create recovery bio\n"); - return; - } bio->bi_iter.bi_sector = 0; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -85,11 +81,6 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; rec_rqd = pblk_alloc_rqd(pblk, WRITE); - if (IS_ERR(rec_rqd)) { - pr_err("pblk: could not create recovery req.\n"); - return -ENOMEM; - } - rec_ctx = nvm_rq_to_pdu(rec_rqd); /* Copy completion bitmap, but exclude the first X completed entries */ @@ -404,22 +395,18 @@ next_pad_rq: ppa_list = (void *)(meta_list) + pblk_dma_meta_size; dma_ppa_list = dma_meta_list + pblk_dma_meta_size; - rqd = pblk_alloc_rqd(pblk, WRITE); - if (IS_ERR(rqd)) { - ret = PTR_ERR(rqd); - goto fail_free_meta; - } - bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, PBLK_VMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { ret = PTR_ERR(bio); - goto fail_free_rqd; + goto fail_free_meta; } bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + rqd = pblk_alloc_rqd(pblk, WRITE); + rqd->bio = bio; rqd->opcode = NVM_OP_PWRITE; rqd->flags = pblk_set_progr_mode(pblk, WRITE); @@ -490,8 +477,6 @@ free_rq: fail_free_bio: bio_put(bio); -fail_free_rqd: - pblk_free_rqd(pblk, rqd, WRITE); fail_free_meta: 
nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); fail_free_pad: @@ -785,15 +770,9 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) dma_addr_t dma_ppa_list, dma_meta_list; int done, ret = 0; - rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) - return PTR_ERR(rqd); - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); - if (!meta_list) { - ret = -ENOMEM; - goto free_rqd; - } + if (!meta_list) + return -ENOMEM; ppa_list = (void *)(meta_list) + pblk_dma_meta_size; dma_ppa_list = dma_meta_list + pblk_dma_meta_size; @@ -804,6 +783,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) goto free_meta_list; } + rqd = pblk_alloc_rqd(pblk, READ); + p.ppa_list = ppa_list; p.meta_list = meta_list; p.rqd = rqd; @@ -832,8 +813,6 @@ out: kfree(data); free_meta_list: nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); -free_rqd: - pblk_free_rqd(pblk, rqd, READ); return ret; } diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 26c2b83451497..0fb8f26a6311a 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -111,10 +111,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) ppa_list = &rqd->ppa_addr; recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC); - if (!recovery) { - pr_err("pblk: could not allocate recovery context\n"); - return; - } + INIT_LIST_HEAD(&recovery->failed); bit = -1; @@ -375,10 +372,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) int ret; rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) { - pr_err("pblk: cannot allocate write req.\n"); - return PTR_ERR(rqd); - } + m_ctx = nvm_rq_to_pdu(rqd); m_ctx->private = meta_line; @@ -546,19 +540,12 @@ static int pblk_submit_write(struct pblk *pblk) if (!secs_to_flush && secs_avail < pblk->min_write_pgs) return 1; - rqd = pblk_alloc_rqd(pblk, WRITE); - if (IS_ERR(rqd)) { - pr_err("pblk: cannot allocate write req.\n"); - return 1; - } - bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs); - if (!bio) { - pr_err("pblk: cannot allocate write bio\n"); - goto fail_free_rqd; - } + bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + rqd = pblk_alloc_rqd(pblk, WRITE); rqd->bio = bio; secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); @@ -589,7 +576,6 @@ fail_free_bio: pblk_free_write_rqd(pblk, rqd); fail_put_bio: bio_put(bio); -fail_free_rqd: pblk_free_rqd(pblk, rqd, WRITE); return 1; -- cgit v1.2.3 From 3627896a4b12ea6bb9e0ff77724a24f53726db2d Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:11 +0200 Subject: lightnvm: pblk: use constant for GC max inflight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a constant to set the maximum number of inflight GC requests allowed. 
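Condensed from the two hunks that follow (a sketch, not the full diff), the shape of the change is that a single named constant now bounds both the GC semaphore and the GC write queue depth, so the two limits cannot drift apart:

        #define PBLK_GC_RQ_QD 128       /* Queue depth for inflight GC requests */

        /* at GC init time */
        sema_init(&gc->gc_sem, PBLK_GC_RQ_QD);

        /* on the GC write path */
        if (gc->w_entries >= PBLK_GC_RQ_QD) {
                spin_unlock(&gc->w_lock);
                pblk_gc_writer_kick(&pblk->gc);
                usleep_range(128, 256);
                goto retry;
        }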
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 4 ++-- drivers/lightnvm/pblk.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index f163829ecca82..c21b2077432a0 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -93,7 +93,7 @@ static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) retry: spin_lock(&gc->w_lock); - if (gc->w_entries >= PBLK_GC_W_QD) { + if (gc->w_entries >= PBLK_GC_RQ_QD) { spin_unlock(&gc->w_lock); pblk_gc_writer_kick(&pblk->gc); usleep_range(128, 256); @@ -602,7 +602,7 @@ int pblk_gc_init(struct pblk *pblk) spin_lock_init(&gc->w_lock); spin_lock_init(&gc->r_lock); - sema_init(&gc->gc_sem, 128); + sema_init(&gc->gc_sem, PBLK_GC_RQ_QD); INIT_LIST_HEAD(&gc->w_list); INIT_LIST_HEAD(&gc->r_list); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 60edcda0fc7f7..baa6a633990fa 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -816,7 +816,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, * pblk gc */ #define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ -#define PBLK_GC_W_QD 128 /* Queue depth for inflight GC write I/Os */ +#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ #define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ #define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ -- cgit v1.2.3 From 9f6cb13bb40bd9067498e908a3272aba998c0309 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:12 +0200 Subject: lightnvm: pblk: normalize ppa namings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Normalize the way we name ppa variables to improve code readability. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 48 +++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0da58869006b0..b6d7c66601491 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1746,7 +1746,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) { - struct ppa_addr l2p_ppa; + struct ppa_addr ppa_l2p; /* logic error: lba out-of-bounds. 
Ignore update */ if (!(lba < pblk->rl.nr_secs)) { @@ -1755,10 +1755,10 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) } spin_lock(&pblk->trans_lock); - l2p_ppa = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); - if (!pblk_addr_in_cache(l2p_ppa) && !pblk_ppa_empty(l2p_ppa)) - pblk_map_invalidate(pblk, l2p_ppa); + if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)) + pblk_map_invalidate(pblk, ppa_l2p); pblk_trans_map_set(pblk, lba, ppa); spin_unlock(&pblk->trans_lock); @@ -1775,16 +1775,16 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) pblk_update_map(pblk, lba, ppa); } -int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, +int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, struct pblk_line *gc_line) { - struct ppa_addr l2p_ppa; + struct ppa_addr ppa_l2p; int ret = 1; #ifdef CONFIG_NVM_DEBUG /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(!pblk_addr_in_cache(ppa)); - BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); + BUG_ON(!pblk_addr_in_cache(ppa_new)); + BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); #endif /* logic error: lba out-of-bounds. Ignore update */ @@ -1794,36 +1794,38 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, } spin_lock(&pblk->trans_lock); - l2p_ppa = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); /* Prevent updated entries to be overwritten by GC */ - if (pblk_addr_in_cache(l2p_ppa) || pblk_ppa_empty(l2p_ppa) || - pblk_tgt_ppa_to_line(l2p_ppa) != gc_line->id) { + if (pblk_addr_in_cache(ppa_l2p) || pblk_ppa_empty(ppa_l2p) || + pblk_tgt_ppa_to_line(ppa_l2p) != gc_line->id) { + ret = 0; goto out; } - pblk_trans_map_set(pblk, lba, ppa); + pblk_trans_map_set(pblk, lba, ppa_new); out: spin_unlock(&pblk->trans_lock); return ret; } -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, - struct ppa_addr entry_line) +void pblk_update_map_dev(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache) { - struct ppa_addr l2p_line; + struct ppa_addr ppa_l2p; #ifdef CONFIG_NVM_DEBUG /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa)); + BUG_ON(pblk_addr_in_cache(ppa_mapped)); #endif /* Invalidate and discard padded entries */ if (lba == ADDR_EMPTY) { #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->padded_wb); #endif - pblk_map_invalidate(pblk, ppa); + if (!pblk_ppa_empty(ppa_mapped)) + pblk_map_invalidate(pblk, ppa_mapped); return; } @@ -1834,22 +1836,22 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, } spin_lock(&pblk->trans_lock); - l2p_line = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); /* Do not update L2P if the cacheline has been updated. 
In this case, * the mapped ppa must be invalidated */ - if (l2p_line.ppa != entry_line.ppa) { - if (!pblk_ppa_empty(ppa)) - pblk_map_invalidate(pblk, ppa); + if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) { + if (!pblk_ppa_empty(ppa_mapped)) + pblk_map_invalidate(pblk, ppa_mapped); goto out; } #ifdef CONFIG_NVM_DEBUG - WARN_ON(!pblk_addr_in_cache(l2p_line) && !pblk_ppa_empty(l2p_line)); + WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); #endif - pblk_trans_map_set(pblk, lba, ppa); + pblk_trans_map_set(pblk, lba, ppa_mapped); out: spin_unlock(&pblk->trans_lock); } -- cgit v1.2.3 From 84454e6de56bb5c8629c41ed09aaf5750ff56f5f Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:13 +0200 Subject: lightnvm: pblk: refactor read lba sanity check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor lba sanity check on read path to avoid code duplication. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d2b6e2a7d7d50..eaaf9d55ba97d 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -39,21 +39,14 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, } static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned long *read_bitmap) + sector_t blba, unsigned long *read_bitmap) { struct bio *bio = rqd->bio; struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; - sector_t blba = pblk_get_lba(bio); int nr_secs = rqd->nr_ppas; bool advanced_bio = false; int i, j = 0; - /* logic error: lba out-of-bounds. Ignore read request */ - if (blba + nr_secs >= pblk->rl.nr_secs) { - WARN(1, "pblk: read lbas out of bounds\n"); - return; - } - pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs); for (i = 0; i < nr_secs; i++) { @@ -259,17 +252,10 @@ err: } static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned long *read_bitmap) + sector_t lba, unsigned long *read_bitmap) { struct bio *bio = rqd->bio; struct ppa_addr ppa; - sector_t lba = pblk_get_lba(bio); - - /* logic error: lba out-of-bounds. Ignore read request */ - if (lba >= pblk->rl.nr_secs) { - WARN(1, "pblk: read lba out of bounds\n"); - return; - } pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); @@ -305,14 +291,19 @@ retry: int pblk_submit_read(struct pblk *pblk, struct bio *bio) { struct nvm_tgt_dev *dev = pblk->dev; + sector_t blba = pblk_get_lba(bio); unsigned int nr_secs = pblk_get_secs(bio); struct nvm_rq *rqd; unsigned long read_bitmap; /* Max 64 ppas per request */ unsigned int bio_init_idx; int ret = NVM_IO_ERR; - if (nr_secs > PBLK_MAX_REQ_ADDRS) + /* logic error: lba out-of-bounds. 
Ignore read request */ + if (blba >= pblk->rl.nr_secs || nr_secs > PBLK_MAX_REQ_ADDRS) { + WARN(1, "pblk: read lba out of bounds (lba:%llu, nr:%d)\n", + (unsigned long long)blba, nr_secs); return NVM_IO_ERR; + } bitmap_zero(&read_bitmap, nr_secs); @@ -340,9 +331,9 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; - pblk_read_ppalist_rq(pblk, rqd, &read_bitmap); + pblk_read_ppalist_rq(pblk, rqd, blba, &read_bitmap); } else { - pblk_read_rq(pblk, rqd, &read_bitmap); + pblk_read_rq(pblk, rqd, blba, &read_bitmap); } bio_get(bio); -- cgit v1.2.3 From d340121eb770de3b02bfc73c5f2b00f5345090c2 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:14 +0200 Subject: lightnvm: pblk: simplify data validity check on GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a line is selected for recycling by the garbage collector (GC), the line state changes and the invalid bitmap is frozen, preventing invalidations from happening. Throughout the GC, the L2P map is checked to verify that not data being recycled has been updated. The last check is done before the new map is being stored on the L2P table. Though this algorithm works, it requires a number of corner cases to be checked each time the L2P table is being updated. This complicates readability and is error prone in case that the recycling algorithm is modified. Instead, this patch makes the invalid bitmap accessible even when the line is being recycled. When recycled data is being remapped, it is enough to check the invalid bitmap for the line before updating the L2P table. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-cache.c | 20 +++++------ drivers/lightnvm/pblk-core.c | 29 +++++++--------- drivers/lightnvm/pblk-gc.c | 58 +++++++++++++++++-------------- drivers/lightnvm/pblk-rb.c | 6 ++-- drivers/lightnvm/pblk-read.c | 79 +++++++++++++++++++++++-------------------- drivers/lightnvm/pblk.h | 23 ++++--------- 6 files changed, 109 insertions(+), 106 deletions(-) diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 024a8fc93069e..1d6b8e3585f1e 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -73,12 +73,11 @@ out: * On GC the incoming lbas are not necessarily sequential. Also, some of the * lbas might not be valid entries, which are marked as empty by the GC thread */ -int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, - unsigned int nr_entries, unsigned int nr_rec_entries, - struct pblk_line *gc_line, unsigned long flags) +int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq) { struct pblk_w_ctx w_ctx; unsigned int bpos, pos; + void *data = gc_rq->data; int i, valid_entries; /* Update the write buffer head (mem) with the entries that we can @@ -86,28 +85,29 @@ int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, * rollback from here on. 
*/ retry: - if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) { + if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) { io_schedule(); goto retry; } - w_ctx.flags = flags; + w_ctx.flags = PBLK_IOTYPE_GC; pblk_ppa_set_empty(&w_ctx.ppa); - for (i = 0, valid_entries = 0; i < nr_entries; i++) { - if (lba_list[i] == ADDR_EMPTY) + for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) { + if (gc_rq->lba_list[i] == ADDR_EMPTY) continue; - w_ctx.lba = lba_list[i]; + w_ctx.lba = gc_rq->lba_list[i]; pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); - pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos); + pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line, + gc_rq->paddr_list[i], pos); data += PBLK_EXPOSED_PAGE_SIZE; valid_entries++; } - WARN_ONCE(nr_rec_entries != valid_entries, + WARN_ONCE(gc_rq->secs_to_gc != valid_entries, "pblk: inconsistent GC write\n"); #ifdef CONFIG_NVM_DEBUG diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index b6d7c66601491..6dd4866e579c0 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -78,11 +78,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, * that newer updates are not overwritten. */ spin_lock(&line->lock); - if (line->state == PBLK_LINESTATE_GC || - line->state == PBLK_LINESTATE_FREE) { - spin_unlock(&line->lock); - return; - } + WARN_ON(line->state == PBLK_LINESTATE_FREE); if (test_and_set_bit(paddr, line->invalid_bitmap)) { WARN_ONCE(1, "pblk: double invalidate\n"); @@ -99,8 +95,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, spin_lock(&l_mg->gc_lock); spin_lock(&line->lock); /* Prevent moving a line that has just been chosen for GC */ - if (line->state == PBLK_LINESTATE_GC || - line->state == PBLK_LINESTATE_FREE) { + if (line->state == PBLK_LINESTATE_GC) { spin_unlock(&line->lock); spin_unlock(&l_mg->gc_lock); return; @@ -1766,6 +1761,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) { + #ifdef CONFIG_NVM_DEBUG /* Callers must ensure that the ppa points to a cache address */ BUG_ON(!pblk_addr_in_cache(ppa)); @@ -1776,9 +1772,9 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) } int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, - struct pblk_line *gc_line) + struct pblk_line *gc_line, u64 paddr_gc) { - struct ppa_addr ppa_l2p; + struct ppa_addr ppa_l2p, ppa_gc; int ret = 1; #ifdef CONFIG_NVM_DEBUG @@ -1795,10 +1791,13 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, spin_lock(&pblk->trans_lock); ppa_l2p = pblk_trans_map_get(pblk, lba); + ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id); - /* Prevent updated entries to be overwritten by GC */ - if (pblk_addr_in_cache(ppa_l2p) || pblk_ppa_empty(ppa_l2p) || - pblk_tgt_ppa_to_line(ppa_l2p) != gc_line->id) { + if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) { + spin_lock(&gc_line->lock); + WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap), + "pblk: corrupted GC update"); + spin_unlock(&gc_line->lock); ret = 0; goto out; @@ -1870,15 +1869,13 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, u64 *lba_list, int nr_secs) { - sector_t lba; + u64 lba; int i; spin_lock(&pblk->trans_lock); for (i = 0; i < nr_secs; i++) { lba = lba_list[i]; - if (lba == ADDR_EMPTY) { - ppas[i].ppa = 
ADDR_EMPTY; - } else { + if (lba != ADDR_EMPTY) { /* logic error: lba out-of-bounds. Ignore update */ if (!(lba < pblk->rl.nr_secs)) { WARN(1, "pblk: corrupted L2P map request\n"); diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index c21b2077432a0..7ad0cfe58a21d 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -20,7 +20,8 @@ static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) { - vfree(gc_rq->data); + if (gc_rq->data) + vfree(gc_rq->data); kfree(gc_rq); } @@ -41,10 +42,7 @@ static int pblk_gc_write(struct pblk *pblk) spin_unlock(&gc->w_lock); list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { - pblk_write_gc_to_cache(pblk, gc_rq->data, gc_rq->lba_list, - gc_rq->nr_secs, gc_rq->secs_to_gc, - gc_rq->line, PBLK_IOTYPE_GC); - + pblk_write_gc_to_cache(pblk, gc_rq); list_del(&gc_rq->list); kref_put(&gc_rq->line->ref, pblk_line_put); pblk_gc_free_gc_rq(gc_rq); @@ -69,27 +67,23 @@ static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) struct pblk_gc *gc = &pblk->gc; struct pblk_line *line = gc_rq->line; void *data; - unsigned int secs_to_gc; int ret = 0; data = vmalloc(gc_rq->nr_secs * geo->sec_size); if (!data) { ret = -ENOMEM; - goto out; + goto fail; } - /* Read from GC victim block */ - if (pblk_submit_read_gc(pblk, gc_rq->lba_list, data, gc_rq->nr_secs, - &secs_to_gc, line)) { - ret = -EFAULT; - goto free_data; - } + gc_rq->data = data; - if (!secs_to_gc) - goto free_rq; + /* Read from GC victim block */ + ret = pblk_submit_read_gc(pblk, gc_rq); + if (ret) + goto fail; - gc_rq->data = data; - gc_rq->secs_to_gc = secs_to_gc; + if (!gc_rq->secs_to_gc) + goto fail; retry: spin_lock(&gc->w_lock); @@ -107,11 +101,8 @@ retry: return 0; -free_rq: - kfree(gc_rq); -free_data: - vfree(data); -out: +fail: + pblk_gc_free_gc_rq(gc_rq); kref_put(&line->ref, pblk_line_put); return ret; } @@ -167,14 +158,21 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) struct pblk_line_ws *gc_rq_ws; struct pblk_gc_rq *gc_rq; __le64 *lba_list; + unsigned long *invalid_bitmap; int sec_left, nr_secs, bit; int ret; + invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!invalid_bitmap) { + pr_err("pblk: could not allocate GC invalid bitmap\n"); + goto fail_free_ws; + } + emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type, GFP_KERNEL); if (!emeta_buf) { pr_err("pblk: cannot use GC emeta\n"); - return; + goto fail_free_bitmap; } ret = pblk_line_read_emeta(pblk, line, emeta_buf); @@ -193,7 +191,11 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) goto fail_free_emeta; } + spin_lock(&line->lock); + bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line); sec_left = pblk_line_vsc(line); + spin_unlock(&line->lock); + if (sec_left < 0) { pr_err("pblk: corrupted GC line (%d)\n", line->id); goto fail_free_emeta; @@ -207,11 +209,12 @@ next_rq: nr_secs = 0; do { - bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line, + bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line, bit + 1); if (bit > line->emeta_ssec) break; + gc_rq->paddr_list[nr_secs] = bit; gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); } while (nr_secs < pblk->max_write_pgs); @@ -244,6 +247,7 @@ next_rq: out: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); kfree(line_ws); + kfree(invalid_bitmap); kref_put(&line->ref, pblk_line_put); atomic_dec(&gc->inflight_gc); @@ -254,9 +258,13 @@ fail_free_gc_rq: kfree(gc_rq); fail_free_emeta: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); +fail_free_bitmap: + 
kfree(invalid_bitmap); +fail_free_ws: + kfree(line_ws); + pblk_put_line_back(pblk, line); kref_put(&line->ref, pblk_line_put); - kfree(line_ws); atomic_dec(&gc->inflight_gc); pr_err("pblk: Failed to GC line %d\n", line->id); diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 9bc32578a766e..74c768ce09ef0 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -325,8 +325,8 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, } void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, - unsigned int ring_pos) + struct pblk_w_ctx w_ctx, struct pblk_line *line, + u64 paddr, unsigned int ring_pos) { struct pblk *pblk = container_of(rb, struct pblk, rwb); struct pblk_rb_entry *entry; @@ -341,7 +341,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, __pblk_rb_write_entry(rb, data, w_ctx, entry); - if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line)) + if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr)) entry->w_ctx.lba = ADDR_EMPTY; flags = w_ctx.flags | PBLK_WRITTEN_DATA; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index eaaf9d55ba97d..c28d6509312e2 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -388,34 +388,40 @@ fail_rqd_free: static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_line *line, u64 *lba_list, - unsigned int nr_secs) + u64 *paddr_list_gc, unsigned int nr_secs) { - struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; + struct ppa_addr ppa_list_l2p[PBLK_MAX_REQ_ADDRS]; + struct ppa_addr ppa_gc; int valid_secs = 0; int i; - pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs); + pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs); for (i = 0; i < nr_secs; i++) { - if (pblk_addr_in_cache(ppas[i]) || ppas[i].g.blk != line->id || - pblk_ppa_empty(ppas[i])) { - lba_list[i] = ADDR_EMPTY; + if (lba_list[i] == ADDR_EMPTY) + continue; + + ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id); + if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) { + paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY; continue; } - rqd->ppa_list[valid_secs++] = ppas[i]; + rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; } #ifdef CONFIG_NVM_DEBUG atomic_long_add(valid_secs, &pblk->inflight_reads); #endif + return valid_secs; } static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, sector_t lba) + struct pblk_line *line, sector_t lba, + u64 paddr_gc) { - struct ppa_addr ppa; + struct ppa_addr ppa_l2p, ppa_gc; int valid_secs = 0; if (lba == ADDR_EMPTY) @@ -428,15 +434,14 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, } spin_lock(&pblk->trans_lock); - ppa = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); spin_unlock(&pblk->trans_lock); - /* Ignore updated values until the moment */ - if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id || - pblk_ppa_empty(ppa)) + ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id); + if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) goto out; - rqd->ppa_addr = ppa; + rqd->ppa_addr = ppa_l2p; valid_secs = 1; #ifdef CONFIG_NVM_DEBUG @@ -447,15 +452,14 @@ out: return valid_secs; } -int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, - unsigned int nr_secs, unsigned int *secs_to_gc, - struct pblk_line *line) +int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct bio *bio; struct 
nvm_rq rqd; - int ret, data_len; + int data_len; + int ret = NVM_IO_OK; DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); @@ -463,25 +467,29 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &rqd.dma_meta_list); if (!rqd.meta_list) - return NVM_IO_ERR; + return -ENOMEM; - if (nr_secs > 1) { + if (gc_rq->nr_secs > 1) { rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; - *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list, - nr_secs); - if (*secs_to_gc == 1) + gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line, + gc_rq->lba_list, + gc_rq->paddr_list, + gc_rq->nr_secs); + if (gc_rq->secs_to_gc == 1) rqd.ppa_addr = rqd.ppa_list[0]; } else { - *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]); + gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line, + gc_rq->lba_list[0], + gc_rq->paddr_list[0]); } - if (!(*secs_to_gc)) + if (!(gc_rq->secs_to_gc)) goto out; - data_len = (*secs_to_gc) * geo->sec_size; - bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len, + data_len = (gc_rq->secs_to_gc) * geo->sec_size; + bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, PBLK_VMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); @@ -494,13 +502,12 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, rqd.opcode = NVM_OP_PREAD; rqd.end_io = pblk_end_io_sync; rqd.private = &wait; - rqd.nr_ppas = *secs_to_gc; + rqd.nr_ppas = gc_rq->secs_to_gc; rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd.bio = bio; - ret = pblk_submit_read_io(pblk, &rqd); - if (ret) { - bio_endio(bio); + if (pblk_submit_read_io(pblk, &rqd)) { + ret = -EIO; pr_err("pblk: GC read request failed\n"); goto err_free_bio; } @@ -519,19 +526,19 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, } #ifdef CONFIG_NVM_DEBUG - atomic_long_add(*secs_to_gc, &pblk->sync_reads); - atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads); - atomic_long_sub(*secs_to_gc, &pblk->inflight_reads); + atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); + atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); + atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); #endif bio_put(bio); out: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); - return NVM_IO_OK; + return ret; err_free_bio: bio_put(bio); err_free_dma: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); - return NVM_IO_ERR; + return ret; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index baa6a633990fa..876b50f97234a 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -206,6 +206,7 @@ struct pblk_lun { struct pblk_gc_rq { struct pblk_line *line; void *data; + u64 paddr_list[PBLK_MAX_REQ_ADDRS]; u64 lba_list[PBLK_MAX_REQ_ADDRS]; int nr_secs; int secs_to_gc; @@ -658,8 +659,8 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, struct pblk_w_ctx w_ctx, unsigned int pos); void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, - unsigned int pos); + struct pblk_w_ctx w_ctx, struct pblk_line *line, + u64 paddr, unsigned int pos); struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); void pblk_rb_flush(struct pblk_rb *rb); @@ -761,7 +762,7 @@ void 
pblk_update_map_cache(struct pblk *pblk, sector_t lba, void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, struct ppa_addr entry_line); int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, - struct pblk_line *gc_line); + struct pblk_line *gc_line, u64 paddr); void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, u64 *lba_list, int nr_secs); void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, @@ -772,9 +773,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, */ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags); -int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, - unsigned int nr_entries, unsigned int nr_rec_entries, - struct pblk_line *gc_line, unsigned long flags); +int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); /* * pblk map @@ -798,9 +797,7 @@ void pblk_write_should_kick(struct pblk *pblk); */ extern struct bio_set *pblk_bio_set; int pblk_submit_read(struct pblk *pblk, struct bio *bio); -int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, - unsigned int nr_secs, unsigned int *secs_to_gc, - struct pblk_line *line); +int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); /* * pblk recovery */ @@ -893,13 +890,7 @@ static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) static inline int pblk_line_vsc(struct pblk_line *line) { - int vsc; - - spin_lock(&line->lock); - vsc = le32_to_cpu(*line->vsc); - spin_unlock(&line->lock); - - return vsc; + return le32_to_cpu(*line->vsc); } #define NVM_MEM_PAGE_WRITE (8) -- cgit v1.2.3 From 2a19b10d423c6dc47449e905ed3a8eabb49c48a0 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:15 +0200 Subject: lightnvm: pblk: refactor read path on GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify the part of the garbage collector where data is read from the line being recycled and moved into an internal queue before being copied to the memory buffer. This allows to get rid of a dedicated function, which introduces an unnecessary dependency on the code. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 94 +++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 55 deletions(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 7ad0cfe58a21d..7b103bce58bf7 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -56,57 +56,6 @@ static void pblk_gc_writer_kick(struct pblk_gc *gc) wake_up_process(gc->gc_writer_ts); } -/* - * Responsible for managing all memory related to a gc request. 
Also in case of - * failure - */ -static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line = gc_rq->line; - void *data; - int ret = 0; - - data = vmalloc(gc_rq->nr_secs * geo->sec_size); - if (!data) { - ret = -ENOMEM; - goto fail; - } - - gc_rq->data = data; - - /* Read from GC victim block */ - ret = pblk_submit_read_gc(pblk, gc_rq); - if (ret) - goto fail; - - if (!gc_rq->secs_to_gc) - goto fail; - -retry: - spin_lock(&gc->w_lock); - if (gc->w_entries >= PBLK_GC_RQ_QD) { - spin_unlock(&gc->w_lock); - pblk_gc_writer_kick(&pblk->gc); - usleep_range(128, 256); - goto retry; - } - gc->w_entries++; - list_add_tail(&gc_rq->list, &gc->w_list); - spin_unlock(&gc->w_lock); - - pblk_gc_writer_kick(&pblk->gc); - - return 0; - -fail: - pblk_gc_free_gc_rq(gc_rq); - kref_put(&line->ref, pblk_line_put); - return ret; -} - static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -130,18 +79,53 @@ static void pblk_gc_line_ws(struct work_struct *work) struct pblk_line_ws *gc_rq_ws = container_of(work, struct pblk_line_ws, ws); struct pblk *pblk = gc_rq_ws->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; struct pblk_gc *gc = &pblk->gc; struct pblk_line *line = gc_rq_ws->line; struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; + int ret; up(&gc->gc_sem); - if (pblk_gc_move_valid_secs(pblk, gc_rq)) { - pr_err("pblk: could not GC all sectors: line:%d (%d/%d)\n", - line->id, *line->vsc, - gc_rq->nr_secs); + gc_rq->data = vmalloc(gc_rq->nr_secs * geo->sec_size); + if (!gc_rq->data) { + pr_err("pblk: could not GC line:%d (%d/%d)\n", + line->id, *line->vsc, gc_rq->nr_secs); + goto out; + } + + /* Read from GC victim block */ + ret = pblk_submit_read_gc(pblk, gc_rq); + if (ret) { + pr_err("pblk: failed GC read in line:%d (err:%d)\n", + line->id, ret); + goto out; } + if (!gc_rq->secs_to_gc) + goto out; + +retry: + spin_lock(&gc->w_lock); + if (gc->w_entries >= PBLK_GC_RQ_QD) { + spin_unlock(&gc->w_lock); + pblk_gc_writer_kick(&pblk->gc); + usleep_range(128, 256); + goto retry; + } + gc->w_entries++; + list_add_tail(&gc_rq->list, &gc->w_list); + spin_unlock(&gc->w_lock); + + pblk_gc_writer_kick(&pblk->gc); + + kfree(gc_rq_ws); + return; + +out: + pblk_gc_free_gc_rq(gc_rq); + kref_put(&line->ref, pblk_line_put); kfree(gc_rq_ws); } -- cgit v1.2.3 From 55e836d401601e7903b36db015ce899dc11085ab Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:16 +0200 Subject: lightnvm: pblk: put bio on bio completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify put bio by doing it on bio end_io instead of manually putting it on the completion path. 
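The core of the change, condensed from the pblk-core.c hunk below (a sketch, not the complete diff): bios built by pblk_bio_map_addr() now carry an end_io callback that drops the last reference, so the emeta, GC read and recovery completion paths no longer call bio_put() by hand.

        static void pblk_bio_map_addr_endio(struct bio *bio)
        {
                bio_put(bio);   /* release the bio once the I/O completes */
        }

        /* in pblk_bio_map_addr(), after the data pages have been added */
        bio->bi_end_io = pblk_bio_map_addr_endio;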
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 10 +++++++--- drivers/lightnvm/pblk-read.c | 1 - drivers/lightnvm/pblk-recovery.c | 1 - drivers/lightnvm/pblk-write.c | 8 +------- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 6dd4866e579c0..3d27d24baa0b2 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -417,6 +417,11 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) return nvm_submit_io(dev, rqd); } +static void pblk_bio_map_addr_endio(struct bio *bio) +{ + bio_put(bio); +} + struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, int alloc_type, gfp_t gfp_mask) @@ -453,6 +458,8 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, kaddr += PAGE_SIZE; } + + bio->bi_end_io = pblk_bio_map_addr_endio; out: return bio; } @@ -671,9 +678,6 @@ next_rq: atomic_dec(&pblk->inflight_io); reinit_completion(&wait); - if (likely(pblk->l_mg.emeta_alloc_type == PBLK_VMALLOC_META)) - bio_put(bio); - if (rqd.error) { if (dir == WRITE) pblk_log_write_err(pblk, &rqd); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index c28d6509312e2..e7141b1aaded4 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -531,7 +531,6 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); #endif - bio_put(bio); out: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return ret; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 6b6b4183b41e7..e59270e60b58e 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -333,7 +333,6 @@ static void pblk_end_io_recov(struct nvm_rq *rqd) pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); - bio_put(rqd->bio); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, WRITE); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 0fb8f26a6311a..0c0481cf9f5d2 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -188,17 +188,12 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) pblk_log_write_err(pblk, rqd); pr_err("pblk: metadata I/O failed. 
Line %d\n", line->id); } -#ifdef CONFIG_NVM_DEBUG - else - WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); -#endif sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); if (sync == emeta->nr_entries) pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, GFP_ATOMIC, pblk->close_wq); - bio_put(rqd->bio); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, READ); @@ -427,8 +422,7 @@ fail_rollback: nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); fail_free_bio: - if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META)) - bio_put(bio); + bio_put(bio); fail_free_rqd: pblk_free_rqd(pblk, rqd, READ); return ret; -- cgit v1.2.3 From 6ca2f71f3e3d94d188000b420ce0529b07f3ce95 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:17 +0200 Subject: lightnvm: pblk: simplify path on REQ_PREFLUSH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On REQ_PREFLUSH, directly tag the I/O context flags to signal a flush in the write to cache path, instead of finding the correct entry context and imposing a memory barrier. This simplifies the code and might potentially prevent race conditions when adding functionality to the write path. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-cache.c | 4 +++- drivers/lightnvm/pblk-rb.c | 8 +------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 1d6b8e3585f1e..0d227ef7d1b95 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -43,8 +43,10 @@ retry: if (unlikely(!bio_has_data(bio))) goto out; - w_ctx.flags = flags; pblk_ppa_set_empty(&w_ctx.ppa); + w_ctx.flags = flags; + if (bio->bi_opf & REQ_PREFLUSH) + w_ctx.flags |= PBLK_FLUSH_ENTRY; for (i = 0; i < nr_entries; i++) { void *data = bio_data(bio); diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 74c768ce09ef0..05e6b2e9221df 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -355,7 +355,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, { struct pblk_rb_entry *entry; unsigned int subm, sync_point; - int flags; subm = READ_ONCE(rb->subm); @@ -369,12 +368,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); entry = &rb->entries[sync_point]; - flags = READ_ONCE(entry->w_ctx.flags); - flags |= PBLK_FLUSH_ENTRY; - - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - /* Protect syncs */ smp_store_release(&rb->sync_point, sync_point); @@ -454,6 +447,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, /* Protect from read count */ smp_store_release(&rb->mem, mem); + return 1; } -- cgit v1.2.3 From 875d94f3a4838f2243334e5ce55ac8153f9bbf5b Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:18 +0200 Subject: lightnvm: pblk: allocate bio size more accurately MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wait until we know the exact number of ppas to be sent to the device, before allocating the bio. 
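Condensed from the pblk-write.c hunk below (sketch only): the allocation is reordered so that the bio is sized to secs_to_sync, which is only known after the sync calculation, instead of the worst case pblk->max_write_pgs.

        secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
        if (secs_to_sync > pblk->max_write_pgs) {
                pr_err("pblk: bad buffer sync calculation\n");
                return 1;
        }

        /* the exact number of ppas is now known */
        bio = bio_alloc(GFP_KERNEL, secs_to_sync);
        bio->bi_iter.bi_sector = 0;     /* internal bio */
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);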
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rb.c | 5 +++-- drivers/lightnvm/pblk-write.c | 20 ++++++++++---------- drivers/lightnvm/pblk.h | 4 ++-- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 05e6b2e9221df..1173e2380137d 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -552,12 +552,13 @@ out: * persist data on the write buffer to the media. */ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - struct bio *bio, unsigned int pos, - unsigned int nr_entries, unsigned int count) + unsigned int pos, unsigned int nr_entries, + unsigned int count) { struct pblk *pblk = container_of(rb, struct pblk, rwb); struct request_queue *q = pblk->dev->q; struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct bio *bio = rqd->bio; struct pblk_rb_entry *entry; struct page *page; unsigned int pad = 0, to_read = nr_entries; diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 0c0481cf9f5d2..140a26edd1d3d 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -534,24 +534,24 @@ static int pblk_submit_write(struct pblk *pblk) if (!secs_to_flush && secs_avail < pblk->min_write_pgs) return 1; - bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs); - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - rqd = pblk_alloc_rqd(pblk, WRITE); - rqd->bio = bio; - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); if (secs_to_sync > pblk->max_write_pgs) { pr_err("pblk: bad buffer sync calculation\n"); - goto fail_put_bio; + return 1; } secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync; pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); - if (pblk_rb_read_to_bio(&pblk->rwb, rqd, bio, pos, secs_to_sync, + bio = bio_alloc(GFP_KERNEL, secs_to_sync); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + rqd = pblk_alloc_rqd(pblk, WRITE); + rqd->bio = bio; + + if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, secs_avail)) { pr_err("pblk: corrupted write bio\n"); goto fail_put_bio; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 876b50f97234a..9f162057d4971 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -666,8 +666,8 @@ void pblk_rb_flush(struct pblk_rb *rb); void pblk_rb_sync_l2p(struct pblk_rb *rb); unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - struct bio *bio, unsigned int pos, - unsigned int nr_entries, unsigned int count); + unsigned int pos, unsigned int nr_entries, + unsigned int count); unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, struct list_head *list, unsigned int max); -- cgit v1.2.3 From e2cddf2082e700218b898b1c899f6a1c2130074a Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:19 +0200 Subject: lightnvm: pblk: improve naming for internal req. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each request type sent to the LightNVM subsystem requires different metadata. Until now, we have tailored this metadata based on write, read and erase commands. However, pblk uses different metadata for internal writes that do not hit the write buffer. Instead of abusing the metadata for reads, create a new request type - internal write to improve code readability. 
In the process, create internal values for each I/O type instead of abusing the READ/WRITE macros, as suggested by Christoph. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 32 ++++++++++++++++---------------- drivers/lightnvm/pblk-read.c | 6 +++--- drivers/lightnvm/pblk-recovery.c | 12 ++++++------ drivers/lightnvm/pblk-write.c | 16 ++++++++-------- drivers/lightnvm/pblk.h | 11 ++++++++--- 5 files changed, 41 insertions(+), 36 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 3d27d24baa0b2..a492964abea84 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -152,7 +152,7 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) struct nvm_rq *rqd; int rq_size; - if (rw == WRITE) { + if (rw == PBLK_WRITE) { pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; } else { @@ -170,7 +170,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) { mempool_t *pool; - if (rw == WRITE) + if (rw == PBLK_WRITE) pool = pblk->w_rq_pool; else pool = pblk->r_rq_pool; @@ -569,10 +569,10 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, int ret; DECLARE_COMPLETION_ONSTACK(wait); - if (dir == WRITE) { + if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; cmd_op = NVM_OP_PWRITE; - } else if (dir == READ) { + } else if (dir == PBLK_READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; } else @@ -612,10 +612,10 @@ next_rq: rqd.end_io = pblk_end_io_sync; rqd.private = &wait; - if (dir == WRITE) { + if (dir == PBLK_WRITE) { struct pblk_sec_meta *meta_list = rqd.meta_list; - rqd.flags = pblk_set_progr_mode(pblk, WRITE); + rqd.flags = pblk_set_progr_mode(pblk, PBLK_WRITE); for (i = 0; i < rqd.nr_ppas; ) { spin_lock(&line->lock); paddr = __pblk_alloc_page(pblk, line, min); @@ -679,7 +679,7 @@ next_rq: reinit_completion(&wait); if (rqd.error) { - if (dir == WRITE) + if (dir == PBLK_WRITE) pblk_log_write_err(pblk, &rqd); else pblk_log_read_err(pblk, &rqd); @@ -722,12 +722,12 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, int flags; DECLARE_COMPLETION_ONSTACK(wait); - if (dir == WRITE) { + if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; cmd_op = NVM_OP_PWRITE; - flags = pblk_set_progr_mode(pblk, WRITE); + flags = pblk_set_progr_mode(pblk, PBLK_WRITE); lba_list = emeta_to_lbas(pblk, line->emeta->buf); - } else if (dir == READ) { + } else if (dir == PBLK_READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -765,7 +765,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - if (dir == WRITE) { + if (dir == PBLK_WRITE) { __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); meta_list[i].lba = lba_list[paddr] = addr_empty; @@ -791,7 +791,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, atomic_dec(&pblk->inflight_io); if (rqd.error) { - if (dir == WRITE) + if (dir == PBLK_WRITE) pblk_log_write_err(pblk, &rqd); else pblk_log_read_err(pblk, &rqd); @@ -807,14 +807,14 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line) { u64 bpaddr = pblk_line_smeta_start(pblk, line); - return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ); + return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ); } int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, void *emeta_buf) { return pblk_line_submit_emeta_io(pblk, 
line, emeta_buf, - line->emeta_ssec, READ); + line->emeta_ssec, PBLK_READ); } static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, @@ -823,7 +823,7 @@ static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, rqd->opcode = NVM_OP_ERASE; rqd->ppa_addr = ppa; rqd->nr_ppas = 1; - rqd->flags = pblk_set_progr_mode(pblk, ERASE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_ERASE); rqd->bio = NULL; } @@ -1045,7 +1045,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, line->smeta_ssec = off; line->cur_sec = off + lm->smeta_sec; - if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) { + if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) { pr_debug("pblk: line smeta I/O failed. Retry\n"); return 1; } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index e7141b1aaded4..4b1722fbe5a0c 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -142,7 +142,7 @@ static void pblk_end_io_read(struct nvm_rq *rqd) atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); #endif - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_READ); atomic_dec(&pblk->inflight_io); } @@ -307,7 +307,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) bitmap_zero(&read_bitmap, nr_secs); - rqd = pblk_alloc_rqd(pblk, READ); + rqd = pblk_alloc_rqd(pblk, PBLK_READ); rqd->opcode = NVM_OP_PREAD; rqd->bio = bio; @@ -382,7 +382,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) return NVM_IO_OK; fail_rqd_free: - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_READ); return ret; } diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index e59270e60b58e..19f2fb1a9e4b2 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -67,7 +67,7 @@ void pblk_submit_rec(struct work_struct *work) err: bio_put(bio); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); } int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, @@ -80,7 +80,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, struct pblk_c_ctx *rec_ctx; int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; - rec_rqd = pblk_alloc_rqd(pblk, WRITE); + rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); rec_ctx = nvm_rq_to_pdu(rec_rqd); /* Copy completion bitmap, but exclude the first X completed entries */ @@ -334,7 +334,7 @@ static void pblk_end_io_recov(struct nvm_rq *rqd) pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); kref_put(&pad_rq->ref, pblk_recov_complete); @@ -404,11 +404,11 @@ next_pad_rq: bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd = pblk_alloc_rqd(pblk, WRITE); + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); rqd->bio = bio; rqd->opcode = NVM_OP_PWRITE; - rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; rqd->ppa_list = ppa_list; @@ -782,7 +782,7 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) goto free_meta_list; } - rqd = pblk_alloc_rqd(pblk, READ); + rqd = pblk_alloc_rqd(pblk, PBLK_READ); p.ppa_list = ppa_list; p.meta_list = meta_list; diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 140a26edd1d3d..c1b8b83e149d9 
100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -46,7 +46,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); return ret; } @@ -195,7 +195,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) GFP_ATOMIC, pblk->close_wq); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); } @@ -209,7 +209,7 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, /* Setup write request */ rqd->opcode = NVM_OP_PWRITE; rqd->nr_ppas = nr_secs; - rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); rqd->private = pblk; rqd->end_io = end_io; @@ -275,7 +275,7 @@ int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0); rqd->ppa_status = (u64)0; - rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); return ret; } @@ -366,7 +366,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) int i, j; int ret; - rqd = pblk_alloc_rqd(pblk, READ); + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); m_ctx = nvm_rq_to_pdu(rqd); m_ctx->private = meta_line; @@ -424,7 +424,7 @@ fail_rollback: fail_free_bio: bio_put(bio); fail_free_rqd: - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); return ret; } @@ -548,7 +548,7 @@ static int pblk_submit_write(struct pblk *pblk) bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd = pblk_alloc_rqd(pblk, WRITE); + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); rqd->bio = bio; if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, @@ -570,7 +570,7 @@ fail_free_bio: pblk_free_write_rqd(pblk, rqd); fail_put_bio: bio_put(bio); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); return 1; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 9f162057d4971..d01e003d3d742 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -55,11 +55,16 @@ for ((i) = 0, rlun = &(pblk)->luns[0]; \ (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)]) -#define ERASE 2 /* READ = 0, WRITE = 1 */ - /* Static pool sizes */ #define PBLK_GEN_WS_POOL_SIZE (2) +enum { + PBLK_READ = READ, + PBLK_WRITE = WRITE,/* Write from write buffer */ + PBLK_WRITE_INT, /* Internal write - no write buffer */ + PBLK_ERASE, +}; + enum { /* IO Types */ PBLK_IOTYPE_USER = 1 << 0, @@ -1132,7 +1137,7 @@ static inline int pblk_set_progr_mode(struct pblk *pblk, int type) flags = geo->plane_mode >> 1; - if (type == WRITE) + if (type == PBLK_WRITE) flags |= NVM_IO_SCRAMBLE_ENABLE; return flags; -- cgit v1.2.3 From 67bf26a3220e3bd403a62a9289aa1d065d3db82c Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:20 +0200 Subject: lightnvm: pblk: refactor rqd alloc/free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the rqd allocation and free functions so that all I/O types can use these helper functions. 
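With the helpers in place, each I/O path can pair allocation and free on the same request type; a rough usage sketch (as the hunks below show, the free side now also releases the per-request DMA metadata list):

	rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT);
	/* ... build and submit the request ... */
	pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); /* also frees rqd->meta_list DMA region */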
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 40 ++++++++++++++++++++++++++++++---------- drivers/lightnvm/pblk-read.c | 2 -- drivers/lightnvm/pblk-recovery.c | 2 -- drivers/lightnvm/pblk-write.c | 7 ------- drivers/lightnvm/pblk.h | 4 ++-- 5 files changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index a492964abea84..0de3875211b17 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -146,18 +146,26 @@ static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, spin_unlock(&pblk->trans_lock); } -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) +/* Caller must guarantee that the request is a valid type */ +struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type) { mempool_t *pool; struct nvm_rq *rqd; int rq_size; - if (rw == PBLK_WRITE) { + switch (type) { + case PBLK_WRITE: + case PBLK_WRITE_INT: pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; - } else { + break; + case PBLK_READ: pool = pblk->r_rq_pool; rq_size = pblk_g_rq_size; + break; + default: + pool = pblk->e_rq_pool; + rq_size = pblk_g_rq_size; } rqd = mempool_alloc(pool, GFP_KERNEL); @@ -166,15 +174,30 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) return rqd; } -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) +/* Typically used on completion path. Cannot guarantee request consistency */ +void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) { + struct nvm_tgt_dev *dev = pblk->dev; mempool_t *pool; - if (rw == PBLK_WRITE) + switch (type) { + case PBLK_WRITE: + kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); + case PBLK_WRITE_INT: pool = pblk->w_rq_pool; - else + break; + case PBLK_READ: pool = pblk->r_rq_pool; + break; + case PBLK_ERASE: + pool = pblk->e_rq_pool; + break; + default: + pr_err("pblk: trying to free unknown rqd type\n"); + return; + } + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); mempool_free(rqd, pool); } @@ -1470,8 +1493,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) struct nvm_rq *rqd; int err; - rqd = mempool_alloc(pblk->e_rq_pool, GFP_KERNEL); - memset(rqd, 0, pblk_g_rq_size); + rqd = pblk_alloc_rqd(pblk, PBLK_ERASE); pblk_setup_e_rq(pblk, rqd, ppa); @@ -1739,8 +1761,6 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, rlun = &pblk->luns[bit]; up(&rlun->wr_sem); } - - kfree(lun_bitmap); } void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 4b1722fbe5a0c..d7c90c303540f 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -124,8 +124,6 @@ static void pblk_end_io_read(struct nvm_rq *rqd) WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); #endif - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - bio_put(bio); if (r_ctx->private) { struct bio *orig_bio = r_ctx->private; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 19f2fb1a9e4b2..686bc17f080ff 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -329,11 +329,9 @@ static void pblk_end_io_recov(struct nvm_rq *rqd) { struct pblk_pad_rq *pad_rq = rqd->private; struct pblk *pblk = pad_rq->pblk; - struct nvm_tgt_dev *dev = pblk->dev; pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); - nvm_dev_dma_free(dev->parent, rqd->meta_list, 
rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index c1b8b83e149d9..f2e846fe9242a 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -20,7 +20,6 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx) { - struct nvm_tgt_dev *dev = pblk->dev; struct bio *original_bio; unsigned long ret; int i; @@ -43,8 +42,6 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - bio_put(rqd->bio); pblk_free_rqd(pblk, rqd, PBLK_WRITE); @@ -176,7 +173,6 @@ static void pblk_end_io_write(struct nvm_rq *rqd) static void pblk_end_io_write_meta(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; - struct nvm_tgt_dev *dev = pblk->dev; struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); struct pblk_line *line = m_ctx->private; struct pblk_emeta *emeta = line->emeta; @@ -194,7 +190,6 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, GFP_ATOMIC, pblk->close_wq); - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); @@ -419,8 +414,6 @@ fail_rollback: pblk_dealloc_page(pblk, meta_line, rq_ppas); list_add(&meta_line->list, &meta_line->list); spin_unlock(&l_mg->close_lock); - - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); fail_free_bio: bio_put(bio); fail_free_rqd: diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index d01e003d3d742..12a20f800c261 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -699,11 +699,11 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); /* * pblk core */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw); +struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type); +void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type); void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx); -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw); void pblk_wait_for_meta(struct pblk *pblk); struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba); void pblk_discard(struct pblk *pblk, struct bio *bio); -- cgit v1.2.3 From 26532ee52b77185b095d29b54c83386f737a74ba Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:21 +0200 Subject: lightnvm: pblk: use rqd->end_io for completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For consistency with the rest of pblk, use rqd->end_io to point to the function taking care of ending the request on the completion path. 
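For the synchronous partial-read case, the shape of the change is roughly (a sketch; the wait itself is unchanged and assumed to follow as in the existing sync paths):

	DECLARE_COMPLETION_ONSTACK(wait);

	/* before: new_bio->bi_private = &wait; new_bio->bi_end_io = pblk_end_bio_sync; */
	rqd->end_io = pblk_end_io_sync;	/* completes rqd->private */
	rqd->private = &wait;
	/* ... submit, then wait on &wait as before ... */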
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 7 ------- drivers/lightnvm/pblk-read.c | 5 ++--- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0de3875211b17..08d166ac4f3c0 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -261,13 +261,6 @@ void pblk_write_should_kick(struct pblk *pblk) pblk_write_kick(pblk); } -void pblk_end_bio_sync(struct bio *bio) -{ - struct completion *waiting = bio->bi_private; - - complete(waiting); -} - void pblk_end_io_sync(struct nvm_rq *rqd) { struct completion *waiting = rqd->private; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d7c90c303540f..0299fc08291d7 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -170,13 +170,12 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, new_bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(new_bio, REQ_OP_READ, 0); - new_bio->bi_private = &wait; - new_bio->bi_end_io = pblk_end_bio_sync; rqd->bio = new_bio; rqd->nr_ppas = nr_holes; rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); - rqd->end_io = NULL; + rqd->end_io = pblk_end_io_sync; + rqd->private = &wait; if (unlikely(nr_secs > 1 && nr_holes == 1)) { ppa_ptr = rqd->ppa_list; -- cgit v1.2.3 From a4809fee4e774fdf3296cc69c22ce6e6acef36b2 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:22 +0200 Subject: lightnvm: pblk: check lba sanity on read path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of pblk's recovery scheme, we store the lba mapped to each physical sector on the device's out-of-bound (OOB) area. On the read path, we can use this information to validate that the data being delivered to the upper layers corresponds to the lba being requested. The cost of this check is an extra copy on the DMA region on the device and an extra comparison in the host, given that (i) the OOB area is being read together with the data in the media, and (ii) the DMA region allocated for the ppa list can be reused for the metadata stored on the OOB area. 
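The check itself is small: for a read starting at logical block blba, sector i must carry lba == blba + i in its OOB metadata unless the sector was empty or padded (condensed from pblk_read_check() in the hunks below):

	u64 lba = le64_to_cpu(meta_list[i].lba);

	if (lba != ADDR_EMPTY)	/* empty/padded sectors carry no mapping */
		WARN(lba != blba + i, "pblk: corrupted read LBA\n");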
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 51 +++++++++++++++++++++++++++++++++++++++++--- drivers/lightnvm/pblk.h | 4 +++- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 0299fc08291d7..a465d9980df4c 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -41,6 +41,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, sector_t blba, unsigned long *read_bitmap) { + struct pblk_sec_meta *meta_list = rqd->meta_list; struct bio *bio = rqd->bio; struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; int nr_secs = rqd->nr_ppas; @@ -56,6 +57,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, retry: if (pblk_ppa_empty(p)) { WARN_ON(test_and_set_bit(i, read_bitmap)); + meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); if (unlikely(!advanced_bio)) { bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); @@ -75,6 +77,7 @@ retry: goto retry; } WARN_ON(test_and_set_bit(i, read_bitmap)); + meta_list[i].lba = cpu_to_le64(lba); advanced_bio = true; #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->cache_reads); @@ -110,10 +113,26 @@ static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd) return NVM_IO_OK; } +static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd, + sector_t blba) +{ + struct pblk_sec_meta *meta_list = rqd->meta_list; + int nr_lbas = rqd->nr_ppas; + int i; + + for (i = 0; i < nr_lbas; i++) { + u64 lba = le64_to_cpu(meta_list[i].lba); + + if (lba == ADDR_EMPTY) + continue; + + WARN(lba != blba + i, "pblk: corrupted read LBA\n"); + } +} + static void pblk_end_io_read(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; - struct nvm_tgt_dev *dev = pblk->dev; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct bio *bio = rqd->bio; @@ -124,6 +143,8 @@ static void pblk_end_io_read(struct nvm_rq *rqd) WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); #endif + pblk_read_check(pblk, rqd, r_ctx->lba); + bio_put(bio); if (r_ctx->private) { struct bio *orig_bio = r_ctx->private; @@ -149,15 +170,21 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, unsigned long *read_bitmap) { struct bio *new_bio, *bio = rqd->bio; + struct pblk_sec_meta *meta_list = rqd->meta_list; struct bio_vec src_bv, dst_bv; void *ppa_ptr = NULL; void *src_p, *dst_p; dma_addr_t dma_ppa_list = 0; + __le64 *lba_list_mem, *lba_list_media; int nr_secs = rqd->nr_ppas; int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); int i, ret, hole; DECLARE_COMPLETION_ONSTACK(wait); + /* Re-use allocated memory for intermediate lbas */ + lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); + lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); + new_bio = bio_alloc(GFP_KERNEL, nr_holes); if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) @@ -168,6 +195,9 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, goto err; } + for (i = 0; i < nr_secs; i++) + lba_list_mem[i] = meta_list[i].lba; + new_bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(new_bio, REQ_OP_READ, 0); @@ -207,10 +237,17 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->dma_ppa_list = dma_ppa_list; } + for (i = 0; i < nr_secs; i++) { + lba_list_media[i] = meta_list[i].lba; + meta_list[i].lba = lba_list_mem[i]; + } + /* Fill the 
holes in the original bio */ i = 0; hole = find_first_zero_bit(read_bitmap, nr_secs); do { + meta_list[hole].lba = lba_list_media[i]; + src_bv = new_bio->bi_io_vec[i++]; dst_bv = bio->bi_io_vec[bio_init_idx + hole]; @@ -251,6 +288,7 @@ err: static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, sector_t lba, unsigned long *read_bitmap) { + struct pblk_sec_meta *meta_list = rqd->meta_list; struct bio *bio = rqd->bio; struct ppa_addr ppa; @@ -263,6 +301,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, retry: if (pblk_ppa_empty(ppa)) { WARN_ON(test_and_set_bit(0, read_bitmap)); + meta_list[0].lba = cpu_to_le64(ADDR_EMPTY); return; } @@ -274,6 +313,9 @@ retry: pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); goto retry; } + + meta_list[0].lba = cpu_to_le64(lba); + WARN_ON(test_and_set_bit(0, read_bitmap)); #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->cache_reads); @@ -290,9 +332,10 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) struct nvm_tgt_dev *dev = pblk->dev; sector_t blba = pblk_get_lba(bio); unsigned int nr_secs = pblk_get_secs(bio); + struct pblk_g_ctx *r_ctx; struct nvm_rq *rqd; - unsigned long read_bitmap; /* Max 64 ppas per request */ unsigned int bio_init_idx; + unsigned long read_bitmap; /* Max 64 ppas per request */ int ret = NVM_IO_ERR; /* logic error: lba out-of-bounds. Ignore read request */ @@ -312,6 +355,9 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd->private = pblk; rqd->end_io = pblk_end_io_read; + r_ctx = nvm_rq_to_pdu(rqd); + r_ctx->lba = blba; + /* Save the index for this bio's start. This is needed in case * we need to fill a partial read. */ @@ -344,7 +390,6 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) /* All sectors are to be read from the device */ if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { struct bio *int_bio = NULL; - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); /* Clone read bio to deal with read errors internally */ int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 12a20f800c261..4a51e6d4d0361 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -99,6 +99,7 @@ enum { }; #define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) +#define pblk_dma_ppa_size (sizeof(u64) * PBLK_MAX_REQ_ADDRS) /* write buffer completion context */ struct pblk_c_ctx { @@ -110,9 +111,10 @@ struct pblk_c_ctx { unsigned int nr_padded; }; -/* generic context */ +/* read context */ struct pblk_g_ctx { void *private; + u64 lba; }; /* Pad context */ -- cgit v1.2.3 From 7bd4d370db6090004a06deb526f0f01fa99a3f9f Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:23 +0200 Subject: lightnvm: pblk: guarantee line integrity on reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a line is recycled during garbage collection, reads can still be issued to the line. If the line is freed in the middle of this process, data corruption might occur. This patch guarantees that lines are not freed in the middle of reads that target them (lines). Specifically, we use the existing line reference to decide when a line is eligible for being freed after the recycle process. 
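In outline, the reference flow is (condensed from the hunks below; the final put is deferred to the new r_end_wq workqueue rather than performed inline in the completion):

	/* L2P lookup: a device-resident ppa implies a valid line reference, pin it */
	if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa))
		kref_get(&pblk->lines[pblk_dev_ppa_to_line(ppa)].ref);

	/* read completion: drop the reference; if last, the line is freed via r_end_wq */
	kref_put(&line->ref, pblk_line_put_wq);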
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 56 ++++++++++++++++++++++++++++++---- drivers/lightnvm/pblk-init.c | 14 +++++++-- drivers/lightnvm/pblk-read.c | 71 +++++++++++++++++++++++++++++++++----------- drivers/lightnvm/pblk.h | 2 ++ 4 files changed, 118 insertions(+), 25 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 08d166ac4f3c0..0a41fb998d556 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1460,10 +1460,8 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line) line->emeta = NULL; } -void pblk_line_put(struct kref *ref) +static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) { - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; struct pblk_line_mgmt *l_mg = &pblk->l_mg; spin_lock(&line->lock); @@ -1481,6 +1479,43 @@ void pblk_line_put(struct kref *ref) pblk_rl_free_lines_inc(&pblk->rl, line); } +static void pblk_line_put_ws(struct work_struct *work) +{ + struct pblk_line_ws *line_put_ws = container_of(work, + struct pblk_line_ws, ws); + struct pblk *pblk = line_put_ws->pblk; + struct pblk_line *line = line_put_ws->line; + + __pblk_line_put(pblk, line); + mempool_free(line_put_ws, pblk->gen_ws_pool); +} + +void pblk_line_put(struct kref *ref) +{ + struct pblk_line *line = container_of(ref, struct pblk_line, ref); + struct pblk *pblk = line->pblk; + + __pblk_line_put(pblk, line); +} + +void pblk_line_put_wq(struct kref *ref) +{ + struct pblk_line *line = container_of(ref, struct pblk_line, ref); + struct pblk *pblk = line->pblk; + struct pblk_line_ws *line_put_ws; + + line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC); + if (!line_put_ws) + return; + + line_put_ws->pblk = pblk; + line_put_ws->line = line; + line_put_ws->priv = NULL; + + INIT_WORK(&line_put_ws->ws, pblk_line_put_ws); + queue_work(pblk->r_end_wq, &line_put_ws->ws); +} + int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) { struct nvm_rq *rqd; @@ -1878,8 +1913,19 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, int i; spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) - ppas[i] = pblk_trans_map_get(pblk, blba + i); + for (i = 0; i < nr_secs; i++) { + struct ppa_addr ppa; + + ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i); + + /* If the L2P entry maps to a line, the reference is valid */ + if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { + int line_id = pblk_dev_ppa_to_line(ppa); + struct pblk_line *line = &pblk->lines[line_id]; + + kref_get(&line->ref); + } + } spin_unlock(&pblk->trans_lock); } diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 4d719782f65b4..34527646c01be 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -271,15 +271,22 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->bb_wq) goto free_close_wq; - if (pblk_set_ppaf(pblk)) + pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!pblk->r_end_wq) goto free_bb_wq; + if (pblk_set_ppaf(pblk)) + goto free_r_end_wq; + if (pblk_rwb_init(pblk)) - goto free_bb_wq; + goto free_r_end_wq; INIT_LIST_HEAD(&pblk->compl_list); return 0; +free_r_end_wq: + destroy_workqueue(pblk->r_end_wq); free_bb_wq: destroy_workqueue(pblk->bb_wq); free_close_wq: @@ -304,6 +311,9 @@ static void pblk_core_free(struct pblk *pblk) if (pblk->close_wq) destroy_workqueue(pblk->close_wq); + if 
(pblk->r_end_wq) + destroy_workqueue(pblk->r_end_wq); + if (pblk->bb_wq) destroy_workqueue(pblk->bb_wq); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index a465d9980df4c..402f8eff6a2e8 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -130,9 +130,34 @@ static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd, } } -static void pblk_end_io_read(struct nvm_rq *rqd) +static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct ppa_addr *ppa_list; + int i; + + ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; + + for (i = 0; i < rqd->nr_ppas; i++) { + struct ppa_addr ppa = ppa_list[i]; + struct pblk_line *line; + + line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + kref_put(&line->ref, pblk_line_put_wq); + } +} + +static void pblk_end_user_read(struct bio *bio) +{ +#ifdef CONFIG_NVM_DEBUG + WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n"); +#endif + bio_endio(bio); + bio_put(bio); +} + +static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, + bool put_line) { - struct pblk *pblk = rqd->private; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct bio *bio = rqd->bio; @@ -146,15 +171,11 @@ static void pblk_end_io_read(struct nvm_rq *rqd) pblk_read_check(pblk, rqd, r_ctx->lba); bio_put(bio); - if (r_ctx->private) { - struct bio *orig_bio = r_ctx->private; + if (r_ctx->private) + pblk_end_user_read((struct bio *)r_ctx->private); -#ifdef CONFIG_NVM_DEBUG - WARN_ONCE(orig_bio->bi_status, "pblk: corrupted read bio\n"); -#endif - bio_endio(orig_bio); - bio_put(orig_bio); - } + if (put_line) + pblk_read_put_rqd_kref(pblk, rqd); #ifdef CONFIG_NVM_DEBUG atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); @@ -165,6 +186,13 @@ static void pblk_end_io_read(struct nvm_rq *rqd) atomic_dec(&pblk->inflight_io); } +static void pblk_end_io_read(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + + __pblk_end_io_read(pblk, rqd, true); +} + static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, unsigned int bio_init_idx, unsigned long *read_bitmap) @@ -233,8 +261,12 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, } if (unlikely(nr_secs > 1 && nr_holes == 1)) { + struct ppa_addr ppa; + + ppa = rqd->ppa_addr; rqd->ppa_list = ppa_ptr; rqd->dma_ppa_list = dma_ppa_list; + rqd->ppa_list[0] = ppa; } for (i = 0; i < nr_secs; i++) { @@ -246,6 +278,11 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, i = 0; hole = find_first_zero_bit(read_bitmap, nr_secs); do { + int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]); + struct pblk_line *line = &pblk->lines[line_id]; + + kref_put(&line->ref, pblk_line_put); + meta_list[hole].lba = lba_list_media[i]; src_bv = new_bio->bi_io_vec[i++]; @@ -269,19 +306,17 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, bio_put(new_bio); /* Complete the original bio and associated request */ + bio_endio(bio); rqd->bio = bio; rqd->nr_ppas = nr_secs; - rqd->private = pblk; - bio_endio(bio); - pblk_end_io_read(rqd); + __pblk_end_io_read(pblk, rqd, false); return NVM_IO_OK; err: /* Free allocated pages in new bio */ pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); - rqd->private = pblk; - pblk_end_io_read(rqd); + __pblk_end_io_read(pblk, rqd, false); return NVM_IO_ERR; } @@ -314,11 +349,11 @@ retry: goto retry; } + WARN_ON(test_and_set_bit(0, read_bitmap)); meta_list[0].lba = cpu_to_le64(lba); - WARN_ON(test_and_set_bit(0, read_bitmap)); #ifdef 
CONFIG_NVM_DEBUG - atomic_long_inc(&pblk->cache_reads); + atomic_long_inc(&pblk->cache_reads); #endif } else { rqd->ppa_addr = ppa; @@ -383,7 +418,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) if (bitmap_full(&read_bitmap, nr_secs)) { bio_endio(bio); atomic_inc(&pblk->inflight_io); - pblk_end_io_read(rqd); + __pblk_end_io_read(pblk, rqd, false); return NVM_IO_OK; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 4a51e6d4d0361..e4704373398bc 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -636,6 +636,7 @@ struct pblk { struct workqueue_struct *close_wq; struct workqueue_struct *bb_wq; + struct workqueue_struct *r_end_wq; struct timer_list wtimer; @@ -741,6 +742,7 @@ int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, void *emeta_buf); int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); void pblk_line_put(struct kref *ref); +void pblk_line_put_wq(struct kref *ref); struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -- cgit v1.2.3 From 0f9248cf1e22333b2a0458540aafb1ad3b2b3337 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:24 +0200 Subject: lightnvm: pblk: remove redundant check on read path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A partial read I/O in pblk is an I/O where some sectors reside in the write buffer in main memory and some are persisted on the device. Such an I/O must at least contain 2 lbas, therefore checking for the case where a single lba is mapped is not necessary. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 402f8eff6a2e8..71c58503f1a45 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -235,7 +235,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->end_io = pblk_end_io_sync; rqd->private = &wait; - if (unlikely(nr_secs > 1 && nr_holes == 1)) { + if (unlikely(nr_holes == 1)) { ppa_ptr = rqd->ppa_list; dma_ppa_list = rqd->dma_ppa_list; rqd->ppa_addr = rqd->ppa_list[0]; @@ -260,7 +260,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, #endif } - if (unlikely(nr_secs > 1 && nr_holes == 1)) { + if (unlikely(nr_holes == 1)) { struct ppa_addr ppa; ppa = rqd->ppa_addr; -- cgit v1.2.3 From 1e82123da6a4c6019ef03bcd47e4b3dc18dd136e Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:25 +0200 Subject: lightnvm: pblk: remove I/O dependency on write path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk schedules user I/O, metadata I/O and erases on the write path in order to minimize collisions at the media level. Until now, there has been a dependency between user and metadata I/Os that could lead to a deadlock as both take the per-LUN semaphore to schedule submission. This path removes this dependency and guarantees forward progress at a per I/O granurality. 
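The resulting submission order in pblk_submit_io_set() is, roughly (condensed from the hunks below; error handling elided):

	err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);		/* map lbas to ppas */
	meta_line = pblk_should_submit_meta_io(pblk, rqd);	/* may be NULL */

	pblk_submit_io(pblk, rqd);				/* data I/O first */

	if (!ppa_empty(erase_ppa))
		pblk_blk_erase_async(pblk, erase_ppa);		/* erase for the next line */

	if (meta_line)
		pblk_submit_meta_io(pblk, meta_line);		/* metadata last */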
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-write.c | 145 +++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 80 deletions(-) diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index f2e846fe9242a..6c1cafafef532 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -220,15 +220,16 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, } static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx, struct ppa_addr *erase_ppa) + struct ppa_addr *erase_ppa) { struct pblk_line_meta *lm = &pblk->lm; struct pblk_line *e_line = pblk_line_get_erase(pblk); + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); unsigned int valid = c_ctx->nr_valid; unsigned int padded = c_ctx->nr_padded; unsigned int nr_secs = valid + padded; unsigned long *lun_bitmap; - int ret = 0; + int ret; lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); if (!lun_bitmap) @@ -294,55 +295,6 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, return secs_to_sync; } -static inline int pblk_valid_meta_ppa(struct pblk *pblk, - struct pblk_line *meta_line, - struct ppa_addr *ppa_list, int nr_ppas) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line *data_line; - struct ppa_addr ppa, ppa_opt; - u64 paddr; - int i; - - data_line = &pblk->lines[pblk_dev_ppa_to_line(ppa_list[0])]; - paddr = pblk_lookup_page(pblk, meta_line); - ppa = addr_to_gen_ppa(pblk, paddr, 0); - - if (test_bit(pblk_ppa_to_pos(geo, ppa), data_line->blk_bitmap)) - return 1; - - /* Schedule a metadata I/O that is half the distance from the data I/O - * with regards to the number of LUNs forming the pblk instance. This - * balances LUN conflicts across every I/O. - * - * When the LUN configuration changes (e.g., due to GC), this distance - * can align, which would result on a LUN deadlock. In this case, modify - * the distance to not be optimal, but allow metadata I/Os to succeed. - */ - ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); - if (unlikely(ppa_opt.ppa == ppa.ppa)) { - data_line->meta_distance--; - return 0; - } - - for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) - if (ppa_list[i].g.ch == ppa_opt.g.ch && - ppa_list[i].g.lun == ppa_opt.g.lun) - return 1; - - if (test_bit(pblk_ppa_to_pos(geo, ppa_opt), data_line->blk_bitmap)) { - for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) - if (ppa_list[i].g.ch == ppa.g.ch && - ppa_list[i].g.lun == ppa.g.lun) - return 0; - - return 1; - } - - return 0; -} - int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) { struct nvm_tgt_dev *dev = pblk->dev; @@ -421,8 +373,44 @@ fail_free_rqd: return ret; } -static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, - int prev_n) +static inline bool pblk_valid_meta_ppa(struct pblk *pblk, + struct pblk_line *meta_line, + struct nvm_rq *data_rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd); + struct pblk_line *data_line = pblk_line_get_data(pblk); + struct ppa_addr ppa, ppa_opt; + u64 paddr; + int pos_opt; + + /* Schedule a metadata I/O that is half the distance from the data I/O + * with regards to the number of LUNs forming the pblk instance. This + * balances LUN conflicts across every I/O. 
+ * + * When the LUN configuration changes (e.g., due to GC), this distance + * can align, which would result on metadata and data I/Os colliding. In + * this case, modify the distance to not be optimal, but move the + * optimal in the right direction. + */ + paddr = pblk_lookup_page(pblk, meta_line); + ppa = addr_to_gen_ppa(pblk, paddr, 0); + ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); + pos_opt = pblk_ppa_to_pos(geo, ppa_opt); + + if (test_bit(pos_opt, data_c_ctx->lun_bitmap) || + test_bit(pos_opt, data_line->blk_bitmap)) + return true; + + if (unlikely(pblk_ppa_comp(ppa_opt, ppa))) + data_line->meta_distance--; + + return false; +} + +static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk, + struct nvm_rq *data_rqd) { struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -432,57 +420,45 @@ static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, retry: if (list_empty(&l_mg->emeta_list)) { spin_unlock(&l_mg->close_lock); - return 0; + return NULL; } meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); if (meta_line->emeta->mem >= lm->emeta_len[0]) goto retry; spin_unlock(&l_mg->close_lock); - if (!pblk_valid_meta_ppa(pblk, meta_line, prev_list, prev_n)) - return 0; + if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd)) + return NULL; - return pblk_submit_meta_io(pblk, meta_line); + return meta_line; } static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) { - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); struct ppa_addr erase_ppa; + struct pblk_line *meta_line; int err; ppa_set_empty(&erase_ppa); /* Assign lbas to ppas and populate request structure */ - err = pblk_setup_w_rq(pblk, rqd, c_ctx, &erase_ppa); + err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); if (err) { pr_err("pblk: could not setup write request: %d\n", err); return NVM_IO_ERR; } - if (likely(ppa_empty(erase_ppa))) { - /* Submit metadata write for previous data line */ - err = pblk_sched_meta_io(pblk, rqd->ppa_list, rqd->nr_ppas); - if (err) { - pr_err("pblk: metadata I/O submission failed: %d", err); - return NVM_IO_ERR; - } + meta_line = pblk_should_submit_meta_io(pblk, rqd); - /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd); - if (err) { - pr_err("pblk: data I/O submission failed: %d\n", err); - return NVM_IO_ERR; - } - } else { - /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd); - if (err) { - pr_err("pblk: data I/O submission failed: %d\n", err); - return NVM_IO_ERR; - } + /* Submit data write for current data line */ + err = pblk_submit_io(pblk, rqd); + if (err) { + pr_err("pblk: data I/O submission failed: %d\n", err); + return NVM_IO_ERR; + } - /* Submit available erase for next data line */ + if (!ppa_empty(erase_ppa)) { + /* Submit erase for next data line */ if (pblk_blk_erase_async(pblk, erase_ppa)) { struct pblk_line *e_line = pblk_line_get_erase(pblk); struct nvm_tgt_dev *dev = pblk->dev; @@ -495,6 +471,15 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) } } + if (meta_line) { + /* Submit metadata write for previous data line */ + err = pblk_submit_meta_io(pblk, meta_line); + if (err) { + pr_err("pblk: metadata I/O submission failed: %d", err); + return NVM_IO_ERR; + } + } + return NVM_IO_OK; } -- cgit v1.2.3 From 21d2287119e843929c29fb1adbd271bde1fac7ae Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:26 +0200 Subject: lightnvm: pblk: enable 1 LUN configuration MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Metadata I/Os are scheduled to minimize their impact on user data I/Os. When there are enough LUNs instantiated (i.e., enough bandwidth), it is easy to interleave metadata and data one after the other so that metadata I/Os are the ones being blocked and not vice-versa. We do this by calculating the distance between the I/Os in terms of the LUNs that are not in used, and selecting a free LUN that satisfies a the simple heuristic that metadata is scheduled behind. The per-LUN semaphores guarantee consistency. This works fine on >1 LUN configuration. However, when a single LUN is instantiated, this design leads to a deadlock, where metadata waits to be scheduled on a free LUN. This patch implements the 1 LUN case by simply scheduling the metadada I/O after the data I/O. In the process, we refactor the way a line is replaced to ensure that metadata writes are submitted after data writes in order to guarantee block sequentiality. Note that, since there is only one LUN, both I/Os will block each other by design. However, such configuration only pursues tight read latencies, not write bandwidth. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 17 ++++++++++------- drivers/lightnvm/pblk-init.c | 8 ++++++-- drivers/lightnvm/pblk-map.c | 21 ++++++++++++--------- drivers/lightnvm/pblk.h | 2 +- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0a41fb998d556..e38e91897246d 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1372,17 +1372,17 @@ void pblk_pipeline_stop(struct pblk *pblk) spin_unlock(&l_mg->free_lock); } -void pblk_line_replace_data(struct pblk *pblk) +struct pblk_line *pblk_line_replace_data(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *cur, *new; + struct pblk_line *cur, *new = NULL; unsigned int left_seblks; int is_next = 0; cur = l_mg->data_line; new = l_mg->data_next; if (!new) - return; + goto out; l_mg->data_line = new; spin_lock(&l_mg->free_lock); @@ -1390,7 +1390,7 @@ void pblk_line_replace_data(struct pblk *pblk) l_mg->data_line = NULL; l_mg->data_next = NULL; spin_unlock(&l_mg->free_lock); - return; + goto out; } pblk_line_setup_metadata(new, l_mg, &pblk->lm); @@ -1402,7 +1402,7 @@ retry_erase: /* If line is not fully erased, erase it */ if (atomic_read(&new->left_eblks)) { if (pblk_line_erase(pblk, new)) - return; + goto out; } else { io_schedule(); } @@ -1413,7 +1413,7 @@ retry_setup: if (!pblk_line_init_metadata(pblk, new, cur)) { new = pblk_line_retry(pblk, new); if (!new) - return; + goto out; goto retry_setup; } @@ -1421,7 +1421,7 @@ retry_setup: if (!pblk_line_init_bb(pblk, new, 1)) { new = pblk_line_retry(pblk, new); if (!new) - return; + goto out; goto retry_setup; } @@ -1445,6 +1445,9 @@ retry_setup: if (is_next) pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); + +out: + return new; } void pblk_line_free(struct pblk *pblk, struct pblk_line *line) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 34527646c01be..3c37491860537 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -711,8 +711,12 @@ add_emeta_page: } lm->emeta_bb = geo->nr_luns - i; - lm->min_blk_line = 1 + DIV_ROUND_UP(lm->smeta_sec + lm->emeta_sec[0], - geo->sec_per_blk); + + lm->min_blk_line = 1; + if (geo->nr_luns > 1) + lm->min_blk_line += 
DIV_ROUND_UP(lm->smeta_sec + + lm->emeta_sec[0], geo->sec_per_blk); + if (lm->min_blk_line > lm->blk_per_line) { pr_err("pblk: config. not supported. Min. LUN in line:%d\n", lm->blk_per_line); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index fddb924f6dde7..3bc4c94f9cf2d 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -25,13 +25,23 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, unsigned int valid_secs) { struct pblk_line *line = pblk_line_get_data(pblk); - struct pblk_emeta *emeta = line->emeta; + struct pblk_emeta *emeta; struct pblk_w_ctx *w_ctx; - __le64 *lba_list = emeta_to_lbas(pblk, emeta->buf); + __le64 *lba_list; u64 paddr; int nr_secs = pblk->min_write_pgs; int i; + if (pblk_line_is_full(line)) { + struct pblk_line *prev_line = line; + + line = pblk_line_replace_data(pblk); + pblk_line_close_meta(pblk, prev_line); + } + + emeta = line->emeta; + lba_list = emeta_to_lbas(pblk, emeta->buf); + paddr = pblk_alloc_page(pblk, line, nr_secs); for (i = 0; i < nr_secs; i++, paddr++) { @@ -60,13 +70,6 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, } } - if (pblk_line_is_full(line)) { - struct pblk_line *prev_line = line; - - pblk_line_replace_data(pblk); - pblk_line_close_meta(pblk, prev_line); - } - pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index e4704373398bc..191b1ec0627b0 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -719,7 +719,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, int alloc_type, gfp_t gfp_mask); struct pblk_line *pblk_line_get(struct pblk *pblk); struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); -void pblk_line_replace_data(struct pblk *pblk); +struct pblk_line *pblk_line_replace_data(struct pblk *pblk); int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); struct pblk_line *pblk_line_get_data(struct pblk *pblk); -- cgit v1.2.3 From e6b754c252bacebdfbe3c57e790431ab8f445d1f Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:27 +0200 Subject: lightnvm: pblk: ensure right bad block calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that the variable controlling block threshold for allocating extra metadata sectors in case of a line with bad blocks does not get a negative value. Otherwise, the line will be marked as corrupted and wasted. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 3c37491860537..c7239c41ba40f 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -710,7 +710,7 @@ add_emeta_page: goto add_emeta_page; } - lm->emeta_bb = geo->nr_luns - i; + lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0; lm->min_blk_line = 1; if (geo->nr_luns > 1) -- cgit v1.2.3 From 27b978725d895e704aab44b99242a0514485d798 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:28 +0200 Subject: lightnvm: pblk: fix changing GC group list for a line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk_line_gc_list seems to had a bug since the introduction of pblk in getting GC list for a line. 
In b20ba1bc7 while redesigning the GC algorithm, the naming for the GC thresholds was altered, but the values for high_thrs and mid_thrs were not. The result is that when moving to the GC lists, the mid threshold is never evaluated. Fixes: a4bd217b4("lightnvm: physical block device (pblk) target") Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index c7239c41ba40f..56ece7dfac0e6 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -678,8 +678,8 @@ static int pblk_lines_init(struct pblk *pblk) lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); - lm->high_thrs = lm->sec_per_line / 2; - lm->mid_thrs = lm->sec_per_line / 4; + lm->mid_thrs = lm->sec_per_line / 2; + lm->high_thrs = lm->sec_per_line / 4; lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; /* Calculate necessary pages for smeta. See comment over struct -- cgit v1.2.3 From e480689bd1cc35f6ed3fa628bc8d913177b0726a Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:29 +0200 Subject: lightnvm: pblk: remove useless line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index e38e91897246d..43866ad87586a 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1071,7 +1071,6 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, /* Mark emeta metadata sectors as bad sectors. We need to consider bad * blocks to make sure that there are enough sectors to store emeta */ - bit = lm->sec_per_line; off = lm->sec_per_line - lm->emeta_sec[0]; bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]); while (nr_bb) { -- cgit v1.2.3 From ef56b9ce562753cacf518f081a4ff3227efdab25 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:30 +0200 Subject: lightnvm: remove unused argument from nvm_set_tgt_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vblk isn't being used anyway and if we ever have a usecase we can introduce this again. This makes the logic easier and removes unnecessary checks. 
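With the vblk special-casing gone, the plane expansion in nvm_set_rqd_ppalist() always lays the ppa list out grouped by plane; for example, with two ppas and a plane_mode of 2 (condensed from the hunks below):

	/* nr_ppas = 2, plane_cnt = geo->plane_mode = 2  =>  rqd->nr_ppas = 4 */
	rqd->ppa_list[0] = ppas[0];	/* plane 0 */
	rqd->ppa_list[1] = ppas[1];	/* plane 0 */
	rqd->ppa_list[2] = ppas[0];	/* plane 1 */
	rqd->ppa_list[3] = ppas[1];	/* plane 1 */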
Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 29 ++++++++++++----------------- include/linux/lightnvm.h | 2 +- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 798964f511cd0..231c928994315 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -633,7 +633,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, memset(&rqd, 0, sizeof(struct nvm_rq)); - nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); + nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); nvm_rq_tgt_to_dev(tgt_dev, &rqd); ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); @@ -697,7 +697,7 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, rqd.private = &wait; rqd.flags = geo->plane_mode >> 1; - ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); + ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); if (ret) return ret; @@ -793,14 +793,14 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) EXPORT_SYMBOL(nvm_put_area); int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas, int vblk) + const struct ppa_addr *ppas, int nr_ppas) { struct nvm_dev *dev = tgt_dev->parent; struct nvm_geo *geo = &tgt_dev->geo; int i, plane_cnt, pl_idx; struct ppa_addr ppa; - if ((!vblk || geo->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) { + if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { rqd->nr_ppas = nr_ppas; rqd->ppa_addr = ppas[0]; @@ -814,19 +814,14 @@ int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, return -ENOMEM; } - if (!vblk) { - for (i = 0; i < nr_ppas; i++) - rqd->ppa_list[i] = ppas[i]; - } else { - plane_cnt = geo->plane_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } + plane_cnt = geo->plane_mode; + rqd->nr_ppas *= plane_cnt; + + for (i = 0; i < nr_ppas; i++) { + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; } } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7b80ac911d26e..32ec35b5e18be 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -481,7 +481,7 @@ extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, - const struct ppa_addr *, int, int); + const struct ppa_addr *, int); extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From eb6f168f97438bf1cac8b9b1301c662eace9e39f Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:31 +0200 Subject: lightnvm: remove stale extern and unused exported symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all exported symbols are being used outside core and there were some stale entries in lightnvm.h Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 129 +++++++++++++++++++++++------------------------ 
include/linux/lightnvm.h | 7 --- 2 files changed, 64 insertions(+), 72 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 231c928994315..0e5f77234c79a 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -226,6 +226,24 @@ static const struct block_device_operations nvm_fops = { .owner = THIS_MODULE, }; +static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) +{ + struct nvm_tgt_type *tmp, *tt = NULL; + + if (lock) + down_write(&nvm_tgtt_lock); + + list_for_each_entry(tmp, &nvm_tgt_types, list) + if (!strcmp(name, tmp->name)) { + tt = tmp; + break; + } + + if (lock) + up_write(&nvm_tgtt_lock); + return tt; +} + static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) { struct nvm_ioctl_create_simple *s = &create->conf.s; @@ -549,25 +567,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, } EXPORT_SYMBOL(nvm_part_to_tgt); -struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) -{ - struct nvm_tgt_type *tmp, *tt = NULL; - - if (lock) - down_write(&nvm_tgtt_lock); - - list_for_each_entry(tmp, &nvm_tgt_types, list) - if (!strcmp(name, tmp->name)) { - tt = tmp; - break; - } - - if (lock) - up_write(&nvm_tgtt_lock); - return tt; -} -EXPORT_SYMBOL(nvm_find_target_type); - int nvm_register_tgt_type(struct nvm_tgt_type *tt) { int ret = 0; @@ -619,6 +618,52 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } +static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, + const struct ppa_addr *ppas, int nr_ppas) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_geo *geo = &tgt_dev->geo; + int i, plane_cnt, pl_idx; + struct ppa_addr ppa; + + if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { + rqd->nr_ppas = nr_ppas; + rqd->ppa_addr = ppas[0]; + + return 0; + } + + rqd->nr_ppas = nr_ppas; + rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); + if (!rqd->ppa_list) { + pr_err("nvm: failed to allocate dma memory\n"); + return -ENOMEM; + } + + plane_cnt = geo->plane_mode; + rqd->nr_ppas *= plane_cnt; + + for (i = 0; i < nr_ppas; i++) { + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; + } + } + + return 0; +} + +static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, + struct nvm_rq *rqd) +{ + if (!rqd->ppa_list) + return; + + nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); +} + + int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas, int type) { @@ -792,52 +837,6 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) } EXPORT_SYMBOL(nvm_put_area); -int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_geo *geo = &tgt_dev->geo; - int i, plane_cnt, pl_idx; - struct ppa_addr ppa; - - if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { - rqd->nr_ppas = nr_ppas; - rqd->ppa_addr = ppas[0]; - - return 0; - } - - rqd->nr_ppas = nr_ppas; - rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("nvm: failed to allocate dma memory\n"); - return -ENOMEM; - } - - plane_cnt = geo->plane_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } - 
} - - return 0; -} -EXPORT_SYMBOL(nvm_set_rqd_ppalist); - -void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - if (!rqd->ppa_list) - return; - - nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); -} -EXPORT_SYMBOL(nvm_free_rqd_ppalist); - void nvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_dev *tgt_dev = rqd->dev; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 32ec35b5e18be..4f0e4a0fd2047 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -463,8 +463,6 @@ struct nvm_tgt_type { struct module *owner; }; -extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); - extern int nvm_register_tgt_type(struct nvm_tgt_type *); extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); @@ -480,9 +478,6 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); -extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, - const struct ppa_addr *, int); -extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); @@ -491,8 +486,6 @@ extern void nvm_end_io(struct nvm_rq *); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -extern int nvm_dev_factory(struct nvm_dev *, int flags); - extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); #else /* CONFIG_NVM */ -- cgit v1.2.3 From 05ed3447698203219319ec9d1c46303aff5932a2 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:32 +0200 Subject: lightnvm: pblk: reduce arguments in __pblk_rb_update_l2p MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already pass the structure pointer so no need to pass the member. 
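For illustration, a minimal sketch of the same refactor pattern outside of pblk (the struct and field names below are invented): once a helper already receives the containing structure, a separate pointer to one of its members is redundant and can be dropped.

struct ring {
	unsigned int head;
	unsigned int nr_entries;	/* power of two */
};

/* before: the caller passes the ring and a pointer into it */
static void advance_old(struct ring *rb, unsigned int *head, unsigned int n)
{
	*head = (*head + n) & (rb->nr_entries - 1);
}

/* after: the member is reached through the structure itself */
static void advance_new(struct ring *rb, unsigned int n)
{
	rb->head = (rb->head + n) & (rb->nr_entries - 1);
}

The patch applies exactly this to rb->l2p_update.
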
Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rb.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 1173e2380137d..b8f78e4014823 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -201,8 +201,7 @@ unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) return subm; } -static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, - unsigned int to_update) +static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) { struct pblk *pblk = container_of(rb, struct pblk, rwb); struct pblk_line *line; @@ -213,7 +212,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, int flags; for (i = 0; i < to_update; i++) { - entry = &rb->entries[*l2p_upd]; + entry = &rb->entries[rb->l2p_update]; w_ctx = &entry->w_ctx; flags = READ_ONCE(entry->w_ctx.flags); @@ -230,7 +229,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; kref_put(&line->ref, pblk_line_put); clean_wctx(w_ctx); - *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1); + rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); } pblk_rl_out(&pblk->rl, user_io, gc_io); @@ -258,7 +257,7 @@ static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries, count = nr_entries - space; /* l2p_update used exclusively under rb->w_lock */ - ret = __pblk_rb_update_l2p(rb, &rb->l2p_update, count); + ret = __pblk_rb_update_l2p(rb, count); out: return ret; @@ -280,7 +279,7 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb) sync = smp_load_acquire(&rb->sync); to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); - __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update); + __pblk_rb_update_l2p(rb, to_update); spin_unlock(&rb->w_lock); } -- cgit v1.2.3 From 22a4e061ea11cc754785a12b3ba5f3e135bc0c63 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:33 +0200 Subject: lightnvm: pblk: fix releases of kmem cache in error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If pblk_core_init fails lets destroy all global caches. 
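The destroy sequence is also factored into one helper so the error path and the normal teardown cannot drift apart. A condensed sketch of the same goto-unwind convention, with invented names (example_* is not pblk code):

static struct kmem_cache *cache_a, *cache_b;

static void example_free_caches(void)
{
	kmem_cache_destroy(cache_a);
	kmem_cache_destroy(cache_b);
}

static int example_init(void)
{
	cache_a = kmem_cache_create("example_a", 128, 0, 0, NULL);
	if (!cache_a)
		return -ENOMEM;

	cache_b = kmem_cache_create("example_b", 256, 0, 0, NULL);
	if (!cache_b)
		goto free_cache_a;

	return 0;

free_cache_a:
	kmem_cache_destroy(cache_a);
	return -ENOMEM;
}
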
Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 56ece7dfac0e6..2e599738372dd 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -220,6 +220,14 @@ static int pblk_init_global_caches(struct pblk *pblk) return 0; } +static void pblk_free_global_caches(struct pblk *pblk) +{ + kmem_cache_destroy(pblk_ws_cache); + kmem_cache_destroy(pblk_rec_cache); + kmem_cache_destroy(pblk_g_rq_cache); + kmem_cache_destroy(pblk_w_rq_cache); +} + static int pblk_core_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; @@ -235,7 +243,7 @@ static int pblk_core_init(struct pblk *pblk) pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev), 0); if (!pblk->page_bio_pool) - return -ENOMEM; + goto free_global_caches; pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE, pblk_ws_cache); @@ -303,6 +311,8 @@ free_gen_ws_pool: mempool_destroy(pblk->gen_ws_pool); free_page_bio_pool: mempool_destroy(pblk->page_bio_pool); +free_global_caches: + pblk_free_global_caches(pblk); return -ENOMEM; } @@ -324,10 +334,7 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->e_rq_pool); mempool_destroy(pblk->w_rq_pool); - kmem_cache_destroy(pblk_ws_cache); - kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_g_rq_cache); - kmem_cache_destroy(pblk_w_rq_cache); + pblk_free_global_caches(pblk); } static void pblk_luns_free(struct pblk *pblk) -- cgit v1.2.3 From 3e3a5b8ebd5d3b1d68facc58b0674a2564653222 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:34 +0200 Subject: lightnvm: pblk: prevent gc kicks when gc is not operational MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GC can be kicked after it has been shut down when closing the last line during exit, resulting in accesses to freed structures. Make sure that GC is not triggered while it is not operational. Also make sure that GC won't be re-activated during exit when running on another processor by using timer_del_sync. 
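A rough sketch of that ordering with invented names (the call used in the hunk below is del_timer_sync()): the kick path re-arms the timer only while GC is enabled, and the exit path clears the flag before the synchronous deletion, so a handler still running on another CPU cannot re-arm it afterwards.

struct example_gc {
	struct timer_list timer;
	int enabled;
};

static void example_gc_kick(struct example_gc *gc)
{
	/* don't re-arm the timer once shutdown has begun */
	if (gc->enabled)
		mod_timer(&gc->timer, jiffies + msecs_to_jiffies(1000));
}

static void example_gc_exit(struct example_gc *gc)
{
	gc->enabled = 0;
	del_timer_sync(&gc->timer);	/* waits out a running handler */
}
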
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 9 +++++---- drivers/lightnvm/pblk-init.c | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 7b103bce58bf7..81efac18ff571 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -478,10 +478,10 @@ void pblk_gc_should_start(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - if (gc->gc_enabled && !gc->gc_active) + if (gc->gc_enabled && !gc->gc_active) { pblk_gc_start(pblk); - - pblk_gc_kick(pblk); + pblk_gc_kick(pblk); + } } /* @@ -620,7 +620,8 @@ void pblk_gc_exit(struct pblk *pblk) flush_workqueue(gc->gc_reader_wq); flush_workqueue(gc->gc_line_reader_wq); - del_timer(&gc->gc_timer); + gc->gc_enabled = 0; + del_timer_sync(&gc->gc_timer); pblk_gc_stop(pblk, 1); if (gc->gc_ts) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 2e599738372dd..27eb430958ff6 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -931,6 +931,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, pblk->dev = dev; pblk->disk = tdisk; pblk->state = PBLK_STATE_RUNNING; + pblk->gc.gc_enabled = 0; spin_lock_init(&pblk->trans_lock); spin_lock_init(&pblk->lock); -- cgit v1.2.3 From 92957091e93931c91fccb7cce456312edeeea36c Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:35 +0200 Subject: lightnvm: pblk: recover partially written lines correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When recovering partially written lines, the valid sector count must be decreased by the number of padded sectors in the line. Update line recovery to take all ADDR_EMPTY(padded) sectors into account. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-recovery.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 686bc17f080ff..a080cf8889821 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -133,8 +133,8 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) struct pblk_emeta *emeta = line->emeta; struct line_emeta *emeta_buf = emeta->buf; __le64 *lba_list; - int data_start; - int nr_data_lbas, nr_valid_lbas, nr_lbas = 0; + int data_start, data_end; + int nr_valid_lbas, nr_lbas = 0; int i; lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); @@ -142,10 +142,10 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) return 1; data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - nr_data_lbas = lm->sec_per_line - lm->emeta_sec[0]; + data_end = lm->sec_per_line - lm->emeta_sec[0]; nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); - for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) { + for (i = data_start; i < data_end; i++) { struct ppa_addr ppa; int pos; -- cgit v1.2.3 From 37ce33d5756f4ba8bdd45371a1918ceeeba5b158 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:36 +0200 Subject: lightnvm: pblk: free full lines during recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When rebuilding the L2P table, any full lines (lines without any valid sectors) will be identified. 
If these lines are not freed, we risk not being able to allocate the first data line. This patch refactors the part of GC that frees empty lines into a separate function and adds a call to this after the L2P table has been rebuilt. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 32 ++++++++++++++++++++------------ drivers/lightnvm/pblk-init.c | 3 +++ drivers/lightnvm/pblk.h | 1 + 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 81efac18ff571..374089fe4326b 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -330,26 +330,16 @@ static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free)); } -/* - * Lines with no valid sectors will be returned to the free list immediately. If - * GC is activated - either because the free block count is under the determined - * threshold, or because it is being forced from user space - only lines with a - * high count of invalid sectors will be recycled. - */ -static void pblk_gc_run(struct pblk *pblk) +void pblk_gc_free_full_lines(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; struct pblk_line *line; - struct list_head *group_list; - bool run_gc; - int inflight_gc, gc_group = 0, prev_group = 0; do { spin_lock(&l_mg->gc_lock); if (list_empty(&l_mg->gc_full_list)) { spin_unlock(&l_mg->gc_lock); - break; + return; } line = list_first_entry(&l_mg->gc_full_list, @@ -365,6 +355,24 @@ static void pblk_gc_run(struct pblk *pblk) kref_put(&line->ref, pblk_line_put); } while (1); +} + +/* + * Lines with no valid sectors will be returned to the free list immediately. If + * GC is activated - either because the free block count is under the determined + * threshold, or because it is being forced from user space - only lines with a + * high count of invalid sectors will be recycled. 
+ */ +static void pblk_gc_run(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line; + struct list_head *group_list; + bool run_gc; + int inflight_gc, gc_group = 0, prev_group = 0; + + pblk_gc_free_full_lines(pblk); run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD)) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 27eb430958ff6..f08fa2083fbcc 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -508,6 +508,9 @@ static int pblk_lines_configure(struct pblk *pblk, int flags) } } + /* Free full lines directly as GC has not been started yet */ + pblk_gc_free_full_lines(pblk); + if (!line) { /* Configure next line for user data */ line = pblk_line_get_first_data(pblk); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 191b1ec0627b0..21438d1550a24 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -831,6 +831,7 @@ void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); void pblk_gc_kick(struct pblk *pblk); +void pblk_gc_free_full_lines(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); int pblk_gc_sysfs_force(struct pblk *pblk, int force); -- cgit v1.2.3 From 03661b5f756c92b9924869334a2afa19753c4fe7 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:37 +0200 Subject: lightnvm: pblk: start gc if needed during init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start GC if needed, directly after init, as we might need to garbage collect in order to make room for user writes. Create a helper function that allows to kick GC without exposing the internals of the GC/rate-limiter interaction. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 5 +++++ drivers/lightnvm/pblk-init.c | 4 ++++ drivers/lightnvm/pblk-rl.c | 2 +- drivers/lightnvm/pblk.h | 2 ++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 374089fe4326b..4bac9e1531f5d 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -510,6 +510,11 @@ void pblk_gc_should_stop(struct pblk *pblk) pblk_gc_stop(pblk, 0); } +void pblk_gc_should_kick(struct pblk *pblk) +{ + pblk_rl_update_rates(&pblk->rl); +} + void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active) { diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index f08fa2083fbcc..ad9f014a086b3 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -1025,6 +1025,10 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, pblk->rwb.nr_entries); wake_up_process(pblk->writer_ts); + + /* Check if we need to start GC */ + pblk_gc_should_kick(pblk); + return pblk; fail_stop_writer: diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 0896439a91b0f..739f855d42165 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -96,7 +96,7 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) * * Only the total number of free blocks is used to configure the rate limiter. 
*/ -static void pblk_rl_update_rates(struct pblk_rl *rl) +void pblk_rl_update_rates(struct pblk_rl *rl) { struct pblk *pblk = container_of(rl, struct pblk, rl); unsigned long free_blocks = pblk_rl_nr_free_blks(rl); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 21438d1550a24..29ba7ec32b204 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -830,6 +830,7 @@ int pblk_gc_init(struct pblk *pblk); void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); +void pblk_gc_should_kick(struct pblk *pblk); void pblk_gc_kick(struct pblk *pblk); void pblk_gc_free_full_lines(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, @@ -841,6 +842,7 @@ int pblk_gc_sysfs_force(struct pblk *pblk, int force); */ void pblk_rl_init(struct pblk_rl *rl, int budget); void pblk_rl_free(struct pblk_rl *rl); +void pblk_rl_update_rates(struct pblk_rl *rl); int pblk_rl_high_thrs(struct pblk_rl *rl); int pblk_rl_low_thrs(struct pblk_rl *rl); unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); -- cgit v1.2.3 From 75610cd974aba4fadc9a8500d5470e8f28a3626f Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:38 +0200 Subject: lightnvm: pblk: consider bad sectors in emeta during recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When recovering lines we need to consider that bad blocks in a line affect the emeta area size. Previously it was assumed that the emeta area would grow by the number of sectors per page * number of bad blocks in the line. This assumption is not correct - the number of "extra" pages that are consumed could be both smaller (depending on emeta size) and bigger (depending on the placement of the bad blocks). Fix this by calculating the emeta start by iterating backwards through the line, skipping ppas that map to bad blocks. Also fix the data types used for ppa indices/counts in pblk_recov_l2p_from_emeta - we should use u64. 
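A simplified, standalone illustration of the backward scan (the real function resolves each sector to its block through the device's ppa format; sec_per_blk below is just a stand-in for that mapping): reserve the emeta sectors from the end of the line, skipping any sector that falls in a block marked bad.

static unsigned long find_emeta_start(const unsigned long *bad_blk_bitmap,
				      unsigned long sec_per_line,
				      unsigned long sec_per_blk,
				      unsigned long emeta_secs)
{
	unsigned long start = sec_per_line;

	while (emeta_secs) {
		start--;
		if (!test_bit(start / sec_per_blk, bad_blk_bitmap))
			emeta_secs--;
	}

	return start;
}
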
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-recovery.c | 44 +++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index a080cf8889821..9772a947ca4fb 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -133,16 +133,16 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) struct pblk_emeta *emeta = line->emeta; struct line_emeta *emeta_buf = emeta->buf; __le64 *lba_list; - int data_start, data_end; - int nr_valid_lbas, nr_lbas = 0; - int i; + u64 data_start, data_end; + u64 nr_valid_lbas, nr_lbas = 0; + u64 i; lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); if (!lba_list) return 1; data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - data_end = lm->sec_per_line - lm->emeta_sec[0]; + data_end = line->emeta_ssec; nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); for (i = data_start; i < data_end; i++) { @@ -172,8 +172,8 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) } if (nr_valid_lbas != nr_lbas) - pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n", - line->id, emeta_buf->nr_valid_lbas, nr_lbas); + pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n", + line->id, nr_valid_lbas, nr_lbas); line->left_msecs = 0; @@ -827,10 +827,32 @@ static void pblk_recov_line_add_ordered(struct list_head *head, __list_add(&line->list, t->list.prev, &t->list); } -struct pblk_line *pblk_recov_l2p(struct pblk *pblk) +static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + unsigned int emeta_secs; + u64 emeta_start; + struct ppa_addr ppa; + int pos; + + emeta_secs = lm->emeta_sec[0]; + emeta_start = lm->sec_per_line; + + while (emeta_secs) { + emeta_start--; + ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + if (!test_bit(pos, line->blk_bitmap)) + emeta_secs--; + } + + return emeta_start; +} + +struct pblk_line *pblk_recov_l2p(struct pblk *pblk) +{ struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line, *tline, *data_line = NULL; @@ -930,15 +952,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) /* Verify closed blocks and recover this portion of L2P table*/ list_for_each_entry_safe(line, tline, &recov_list, list) { - int off, nr_bb; - recovered_lines++; - /* Calculate where emeta starts based on the line bb */ - off = lm->sec_per_line - lm->emeta_sec[0]; - nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); - off -= nr_bb * geo->sec_per_pl; - line->emeta_ssec = off; + line->emeta_ssec = pblk_line_emeta_start(pblk, line); line->emeta = emeta; memset(line->emeta->buf, 0, lm->emeta_len[0]); -- cgit v1.2.3 From 1edebacf8b736774d2f160512aec721f47e1f5ac Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:39 +0200 Subject: lightnvm: pblk: shut down gc gracefully during exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shut down the GC workqueues and tasks in the right order. 
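The rule the reordering follows, sketched with placeholder names: stop the kthreads that produce work before flushing and destroying the workqueues that consume it, so nothing can be queued to a workqueue that is about to disappear.

struct example_gc {
	struct task_struct *reader_ts;		/* queues work items */
	struct workqueue_struct *reader_wq;	/* consumes them */
};

static void example_gc_exit(struct example_gc *gc)
{
	if (gc->reader_ts)
		kthread_stop(gc->reader_ts);	/* no new work after this */

	if (gc->reader_wq) {
		flush_workqueue(gc->reader_wq);	/* drain what is queued */
		destroy_workqueue(gc->reader_wq);
	}
}
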
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 4bac9e1531f5d..e00e5a0743e96 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -422,10 +422,15 @@ void pblk_gc_kick(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - wake_up_process(gc->gc_ts); pblk_gc_writer_kick(gc); pblk_gc_reader_kick(gc); - mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + + /* If we're shutting down GC, let's not start it up again */ + if (gc->gc_enabled) { + wake_up_process(gc->gc_ts); + mod_timer(&gc->gc_timer, + jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + } } static void pblk_gc_timer(unsigned long data) @@ -630,9 +635,6 @@ void pblk_gc_exit(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - flush_workqueue(gc->gc_reader_wq); - flush_workqueue(gc->gc_line_reader_wq); - gc->gc_enabled = 0; del_timer_sync(&gc->gc_timer); pblk_gc_stop(pblk, 1); @@ -640,15 +642,17 @@ void pblk_gc_exit(struct pblk *pblk) if (gc->gc_ts) kthread_stop(gc->gc_ts); + if (gc->gc_reader_ts) + kthread_stop(gc->gc_reader_ts); + + flush_workqueue(gc->gc_reader_wq); if (gc->gc_reader_wq) destroy_workqueue(gc->gc_reader_wq); + flush_workqueue(gc->gc_line_reader_wq); if (gc->gc_line_reader_wq) destroy_workqueue(gc->gc_line_reader_wq); if (gc->gc_writer_ts) kthread_stop(gc->gc_writer_ts); - - if (gc->gc_reader_ts) - kthread_stop(gc->gc_reader_ts); } -- cgit v1.2.3 From c55861926a78bf129e06bd3372b34225f4968757 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:40 +0200 Subject: lightnvm: pblk: add l2p crc debug printouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Print the CRC of the logical-to-physical mapping during exit and after recovering the L2P table to facilitate detection of meta data corruption/recovery issues. The CRC printed after recovery should match the CRC printed during the previous exit - if it doesn't this indicates that either the meta data written to the disk is corrupt or recovery failed. 
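The check itself is a plain CRC over the in-memory table; a minimal sketch (helper name invented) of how the two printouts become comparable:

#include <linux/crc32.h>

/* same seed on both sides, so exit and recovery CRCs can be compared */
static u32 example_table_crc(const void *table, size_t len)
{
	return crc32_le(~(u32)0, table, len);
}
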
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index ad9f014a086b3..52c85f4f672de 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -76,6 +76,28 @@ static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } +static size_t pblk_trans_map_size(struct pblk *pblk) +{ + int entry_size = 8; + + if (pblk->ppaf_bitsize < 32) + entry_size = 4; + + return entry_size * pblk->rl.nr_secs; +} + +#ifdef CONFIG_NVM_DEBUG +static u32 pblk_l2p_crc(struct pblk *pblk) +{ + size_t map_size; + u32 crc = ~(u32)0; + + map_size = pblk_trans_map_size(pblk); + crc = crc32_le(crc, pblk->trans_map, map_size); + return crc; +} +#endif + static void pblk_l2p_free(struct pblk *pblk) { vfree(pblk->trans_map); @@ -85,12 +107,10 @@ static int pblk_l2p_init(struct pblk *pblk) { sector_t i; struct ppa_addr ppa; - int entry_size = 8; - - if (pblk->ppaf_bitsize < 32) - entry_size = 4; + size_t map_size; - pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs); + map_size = pblk_trans_map_size(pblk); + pblk->trans_map = vmalloc(map_size); if (!pblk->trans_map) return -ENOMEM; @@ -508,6 +528,10 @@ static int pblk_lines_configure(struct pblk *pblk, int flags) } } +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + /* Free full lines directly as GC has not been started yet */ pblk_gc_free_full_lines(pblk); @@ -901,6 +925,11 @@ static void pblk_exit(void *private) down_write(&pblk_lock); pblk_gc_exit(pblk); pblk_tear_down(pblk); + +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + pblk_free(pblk); up_write(&pblk_lock); } -- cgit v1.2.3 From d6b992f7ab6279884238d4e2babf100c0879b3d6 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:41 +0200 Subject: lightnvm: pblk: gc all lines in the pipeline before exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finish garbage collect of the lines that are in the gc pipeline before exiting. Ensure that all lines already in in the pipeline goes through, from read to write. Do this by keeping track of how many lines are in the pipeline and waiting for that number to reach zero before exiting the gc reader task. Since we're adding a new gc line counter, change the name of inflight_gc to read_inflight_gc to make the distinction clear. 
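The exit-time drain is then a plain wait on that counter; a condensed sketch with placeholder names:

static atomic_t pipeline_gc = ATOMIC_INIT(0);

/* producers: atomic_inc(&pipeline_gc) when a line enters the pipeline;
 * completion: atomic_dec(&pipeline_gc) once it is fully written back. */
static void example_pipeline_drain(void)
{
	while (atomic_read(&pipeline_gc))
		schedule();
}
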
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 3 +++ drivers/lightnvm/pblk-gc.c | 31 ++++++++++++++++++++++++------- drivers/lightnvm/pblk-sysfs.c | 2 +- drivers/lightnvm/pblk.h | 5 ++++- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 43866ad87586a..1cd27e38fc46a 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1465,6 +1465,7 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line) static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; spin_lock(&line->lock); WARN_ON(line->state != PBLK_LINESTATE_GC); @@ -1473,6 +1474,8 @@ static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) pblk_line_free(pblk, line); spin_unlock(&line->lock); + atomic_dec(&gc->pipeline_gc); + spin_lock(&l_mg->free_lock); list_add_tail(&line->list, &l_mg->free_list); l_mg->nr_free_lines++; diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index e00e5a0743e96..e6fae1959e255 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -234,7 +234,7 @@ out: kfree(invalid_bitmap); kref_put(&line->ref, pblk_line_put); - atomic_dec(&gc->inflight_gc); + atomic_dec(&gc->read_inflight_gc); return; @@ -249,7 +249,7 @@ fail_free_ws: pblk_put_line_back(pblk, line); kref_put(&line->ref, pblk_line_put); - atomic_dec(&gc->inflight_gc); + atomic_dec(&gc->read_inflight_gc); pr_err("pblk: Failed to GC line %d\n", line->id); } @@ -268,6 +268,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) line_ws->pblk = pblk; line_ws->line = line; + atomic_inc(&gc->pipeline_gc); INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); queue_work(gc->gc_reader_wq, &line_ws->ws); @@ -333,6 +334,7 @@ static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) void pblk_gc_free_full_lines(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; struct pblk_line *line; do { @@ -353,6 +355,7 @@ void pblk_gc_free_full_lines(struct pblk *pblk) list_del(&line->list); spin_unlock(&l_mg->gc_lock); + atomic_inc(&gc->pipeline_gc); kref_put(&line->ref, pblk_line_put); } while (1); } @@ -370,12 +373,12 @@ static void pblk_gc_run(struct pblk *pblk) struct pblk_line *line; struct list_head *group_list; bool run_gc; - int inflight_gc, gc_group = 0, prev_group = 0; + int read_inflight_gc, gc_group = 0, prev_group = 0; pblk_gc_free_full_lines(pblk); run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD)) + if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD)) return; next_gc_group: @@ -402,14 +405,14 @@ next_gc_group: list_add_tail(&line->list, &gc->r_list); spin_unlock(&gc->r_lock); - inflight_gc = atomic_inc_return(&gc->inflight_gc); + read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc); pblk_gc_reader_kick(gc); prev_group = 1; /* No need to queue up more GC lines than we can handle */ run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || inflight_gc >= PBLK_GC_L_QD) + if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD) break; } while (1); @@ -470,6 +473,7 @@ static int pblk_gc_writer_ts(void *data) static int pblk_gc_reader_ts(void *data) { struct pblk *pblk = data; + struct pblk_gc *gc = &pblk->gc; while (!kthread_should_stop()) { if (!pblk_gc_read(pblk)) @@ -478,6 +482,18 
@@ static int pblk_gc_reader_ts(void *data) io_schedule(); } +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk: flushing gc pipeline, %d lines left\n", + atomic_read(&gc->pipeline_gc)); +#endif + + do { + if (!atomic_read(&gc->pipeline_gc)) + break; + + schedule(); + } while (1); + return 0; } @@ -586,7 +602,8 @@ int pblk_gc_init(struct pblk *pblk) gc->gc_forced = 0; gc->gc_enabled = 1; gc->w_entries = 0; - atomic_set(&gc->inflight_gc, 0); + atomic_set(&gc->read_inflight_gc, 0); + atomic_set(&gc->pipeline_gc, 0); /* Workqueue that reads valid sectors from a line and submit them to the * GC writer to be recycled. diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index 95fb434e2f01a..cd49e8875d4ec 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -253,7 +253,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) sz += snprintf(page + sz, PAGE_SIZE - sz, "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n", gc_full, gc_high, gc_mid, gc_low, gc_empty, - atomic_read(&pblk->gc.inflight_gc)); + atomic_read(&pblk->gc.read_inflight_gc)); sz += snprintf(page + sz, PAGE_SIZE - sz, "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 29ba7ec32b204..c6f8841973a0c 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -238,7 +238,10 @@ struct pblk_gc { struct timer_list gc_timer; struct semaphore gc_sem; - atomic_t inflight_gc; + atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */ + atomic_t pipeline_gc; /* Number of lines in the GC pipeline - + * started reads to finished writes + */ int w_entries; struct list_head w_list; -- cgit v1.2.3 From 03e868eb8adb28e34f6e695667d230786bfdb653 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:42 +0200 Subject: lightnvm: pblk: correct valid lba count calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During garbage collect, lbas being written can end up being invalidated. Make sure that this is reflected in the valid lba count. 
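Concretely, entries that end up mapped to the empty address marker must not be counted as valid; a standalone illustration of the counting rule (the sentinel below is an example value, not pblk's definition):

#define EXAMPLE_ADDR_EMPTY (~0ULL)	/* stand-in for the padding marker */

static unsigned int count_valid_lbas(const u64 *lba_list, unsigned int nr)
{
	unsigned int i, valid = 0;

	for (i = 0; i < nr; i++)
		if (lba_list[i] != EXAMPLE_ADDR_EMPTY)
			valid++;

	return valid;
}
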
Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-map.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 3bc4c94f9cf2d..6f3ecde2140f3 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -45,6 +45,8 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, paddr = pblk_alloc_page(pblk, line, nr_secs); for (i = 0; i < nr_secs; i++, paddr++) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + /* ppa to be sent to the device */ ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); @@ -61,10 +63,9 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, w_ctx->ppa = ppa_list[i]; meta_list[i].lba = cpu_to_le64(w_ctx->lba); lba_list[paddr] = cpu_to_le64(w_ctx->lba); - line->nr_valid_lbas++; + if (lba_list[paddr] != addr_empty) + line->nr_valid_lbas++; } else { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - lba_list[paddr] = meta_list[i].lba = addr_empty; __pblk_map_invalidate(pblk, line, paddr); } -- cgit v1.2.3 From 28bd109411eaa4c541f2e240d1285c154de4dfb7 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:43 +0200 Subject: lightnvm: pblk: remove spinlock when freeing line metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lockdep complains about being in atomic context while freeing line metadata - and rightly so as we take a spinlock and end up calling vfree that might sleep(in pblk_mfree). There is no need for holding the line manager free_lock while freeing line metadata as the pipeline as stopped, so remove the lock. Fixes: 588726d3ec68 ("lightnvm: pblk: fail gracefully on irrec. error") Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 52c85f4f672de..f62112ba5482a 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -393,13 +393,11 @@ static void pblk_line_meta_free(struct pblk *pblk) kfree(l_mg->bb_aux); kfree(l_mg->vsc_list); - spin_lock(&l_mg->free_lock); for (i = 0; i < PBLK_DATA_LINES; i++) { kfree(l_mg->sline_meta[i]); pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type); kfree(l_mg->eline_meta[i]); } - spin_unlock(&l_mg->free_lock); kfree(pblk->lines); } -- cgit v1.2.3 From 8bd400204bd500bb2aea7b551f7c33bad2455340 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:44 +0200 Subject: lightnvm: pblk: cleanup unused and static functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup up unused and static functions across the whole codebase. 
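For each helper that stays, the change is mechanical: the prototype is dropped from pblk.h and the definition becomes static, restricting it to its own .c file, e.g.:

/* before: declared driver-wide in pblk.h */
void pblk_line_mark_bb(struct work_struct *work);

/* after: no header prototype, local to pblk-core.c */
static void pblk_line_mark_bb(struct work_struct *work);
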
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 133 ++++++++++++++++++++----------------------- drivers/lightnvm/pblk-gc.c | 40 ++++++------- drivers/lightnvm/pblk-rl.c | 10 ---- drivers/lightnvm/pblk.h | 14 ++--- 4 files changed, 86 insertions(+), 111 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 1cd27e38fc46a..4199119a0754a 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -18,6 +18,31 @@ #include "pblk.h" +static void pblk_line_mark_bb(struct work_struct *work) +{ + struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, + ws); + struct pblk *pblk = line_ws->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + struct ppa_addr *ppa = line_ws->priv; + int ret; + + ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); + if (ret) { + struct pblk_line *line; + int pos; + + line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; + pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); + + pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", + line->id, pos); + } + + kfree(ppa); + mempool_free(line_ws, pblk->gen_ws_pool); +} + static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, struct ppa_addr *ppa) { @@ -268,7 +293,7 @@ void pblk_end_io_sync(struct nvm_rq *rqd) complete(waiting); } -void pblk_wait_for_meta(struct pblk *pblk) +static void pblk_wait_for_meta(struct pblk *pblk) { do { if (!atomic_read(&pblk->inflight_io)) @@ -345,17 +370,6 @@ void pblk_discard(struct pblk *pblk, struct bio *bio) pblk_invalidate_range(pblk, slba, nr_secs); } -struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba) -{ - struct ppa_addr ppa; - - spin_lock(&pblk->trans_lock); - ppa = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - return ppa; -} - void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) { atomic_long_inc(&pblk->write_failed); @@ -1338,6 +1352,41 @@ static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) pblk->state = PBLK_STATE_STOPPING; } +static void pblk_line_close_meta_sync(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *line, *tline; + LIST_HEAD(list); + + spin_lock(&l_mg->close_lock); + if (list_empty(&l_mg->emeta_list)) { + spin_unlock(&l_mg->close_lock); + return; + } + + list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); + spin_unlock(&l_mg->close_lock); + + list_for_each_entry_safe(line, tline, &list, list) { + struct pblk_emeta *emeta = line->emeta; + + while (emeta->mem < lm->emeta_len[0]) { + int ret; + + ret = pblk_submit_meta_io(pblk, line); + if (ret) { + pr_err("pblk: sync meta line %d failed (%d)\n", + line->id, ret); + return; + } + } + } + + pblk_wait_for_meta(pblk); + flush_workqueue(pblk->close_wq); +} + void pblk_pipeline_stop(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -1565,41 +1614,6 @@ int pblk_line_is_full(struct pblk_line *line) return (line->left_msecs == 0); } -void pblk_line_close_meta_sync(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line, *tline; - LIST_HEAD(list); - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return; - } - - list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); - spin_unlock(&l_mg->close_lock); - - list_for_each_entry_safe(line, tline, &list, list) { - struct 
pblk_emeta *emeta = line->emeta; - - while (emeta->mem < lm->emeta_len[0]) { - int ret; - - ret = pblk_submit_meta_io(pblk, line); - if (ret) { - pr_err("pblk: sync meta line %d failed (%d)\n", - line->id, ret); - return; - } - } - } - - pblk_wait_for_meta(pblk); - flush_workqueue(pblk->close_wq); -} - static void pblk_line_should_sync_meta(struct pblk *pblk) { if (pblk_rl_is_limit(&pblk->rl)) @@ -1673,31 +1687,6 @@ void pblk_line_close_ws(struct work_struct *work) mempool_free(line_ws, pblk->gen_ws_pool); } -void pblk_line_mark_bb(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa = line_ws->priv; - int ret; - - ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); - if (ret) { - struct pblk_line *line; - int pos; - - line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; - pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); - - pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", - line->id, pos); - } - - kfree(ppa); - mempool_free(line_ws, pblk->gen_ws_pool); -} - void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, void (*work)(struct work_struct *), gfp_t gfp_mask, struct workqueue_struct *wq) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index e6fae1959e255..b8323e34d1bcc 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -275,6 +275,26 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) return 0; } +static void pblk_gc_reader_kick(struct pblk_gc *gc) +{ + wake_up_process(gc->gc_reader_ts); +} + +static void pblk_gc_kick(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + pblk_gc_writer_kick(gc); + pblk_gc_reader_kick(gc); + + /* If we're shutting down GC, let's not start it up again */ + if (gc->gc_enabled) { + wake_up_process(gc->gc_ts); + mod_timer(&gc->gc_timer, + jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + } +} + static int pblk_gc_read(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; @@ -298,11 +318,6 @@ static int pblk_gc_read(struct pblk *pblk) return 0; } -static void pblk_gc_reader_kick(struct pblk_gc *gc) -{ - wake_up_process(gc->gc_reader_ts); -} - static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, struct list_head *group_list) { @@ -421,21 +436,6 @@ next_gc_group: goto next_gc_group; } -void pblk_gc_kick(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - pblk_gc_writer_kick(gc); - pblk_gc_reader_kick(gc); - - /* If we're shutting down GC, let's not start it up again */ - if (gc->gc_enabled) { - wake_up_process(gc->gc_ts); - mod_timer(&gc->gc_timer, - jiffies + msecs_to_jiffies(GC_TIME_MSECS)); - } -} - static void pblk_gc_timer(unsigned long data) { struct pblk *pblk = (struct pblk *)data; diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 739f855d42165..abae31fd434e5 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -153,16 +153,6 @@ int pblk_rl_high_thrs(struct pblk_rl *rl) return rl->high; } -int pblk_rl_low_thrs(struct pblk_rl *rl) -{ - return rl->low; -} - -int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) -{ - return rl->rb_user_max; -} - int pblk_rl_max_io(struct pblk_rl *rl) { return rl->rb_max_io; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index c6f8841973a0c..6c9ea9a93704f 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -710,8 +710,6 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int 
type); void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx); -void pblk_wait_for_meta(struct pblk *pblk); -struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba); void pblk_discard(struct pblk *pblk, struct bio *bio); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); @@ -732,10 +730,8 @@ int pblk_line_is_full(struct pblk_line *line); void pblk_line_free(struct pblk *pblk, struct pblk_line *line); void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); void pblk_line_close(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close_meta_sync(struct pblk *pblk); void pblk_line_close_ws(struct work_struct *work); void pblk_pipeline_stop(struct pblk *pblk); -void pblk_line_mark_bb(struct work_struct *work); void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, void (*work)(struct work_struct *), gfp_t gfp_mask, struct workqueue_struct *wq); @@ -759,7 +755,6 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, unsigned long *lun_bitmap); -void pblk_end_bio_sync(struct bio *bio); void pblk_end_io_sync(struct nvm_rq *rqd); int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, int nr_pages); @@ -834,7 +829,6 @@ void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); void pblk_gc_should_kick(struct pblk *pblk); -void pblk_gc_kick(struct pblk *pblk); void pblk_gc_free_full_lines(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); @@ -847,7 +841,6 @@ void pblk_rl_init(struct pblk_rl *rl, int budget); void pblk_rl_free(struct pblk_rl *rl); void pblk_rl_update_rates(struct pblk_rl *rl); int pblk_rl_high_thrs(struct pblk_rl *rl); -int pblk_rl_low_thrs(struct pblk_rl *rl); unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); @@ -855,11 +848,9 @@ void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); -int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); int pblk_rl_max_io(struct pblk_rl *rl); void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); -void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left); int pblk_rl_is_limit(struct pblk_rl *rl); /* @@ -868,6 +859,11 @@ int pblk_rl_is_limit(struct pblk_rl *rl); int pblk_sysfs_init(struct gendisk *tdisk); void pblk_sysfs_exit(struct gendisk *tdisk); +static inline void test(size_t a) +{ + a += 1; +} + static inline void *pblk_malloc(size_t size, int type, gfp_t flags) { if (type == PBLK_KMALLOC_META) -- cgit v1.2.3 From 8da10cce7c7f7f9f5edc77271cf6e0c45b762004 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:45 +0200 Subject: lightnvm: pblk: avoid being reported as hung on rated GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The amount of 
GC I/O on the write buffer is managed by the rate-limiter, which is calculated as a function of the number of available free blocks. When reaching the stable point, we risk having scheduled more I/Os for GC than are allowed on the write buffer. This would result on the GC semaphore balancing the outstanding read GC I/Os to be reported as "hung", though the behavior is normal. Solve this by allowing to schedule when we detect that the read GC path is not moving forward. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index b8323e34d1bcc..00d5698d64a9a 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -218,7 +218,13 @@ next_rq: gc_rq_ws->line = line; gc_rq_ws->priv = gc_rq; - down(&gc->gc_sem); + /* The write GC path can be much slower than the read GC one due to + * the budget imposed by the rate-limiter. Balance in case that we get + * back pressure from the write GC path. + */ + while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000))) + io_schedule(); + kref_get(&line->ref); INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); -- cgit v1.2.3 From 1b839187db62aae22a9254fd9e3ce376149b0918 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:46 +0200 Subject: lightnvm: fail fast on passthrough commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make LightNVM passhtrough commands fail fast. User space will then take care of re-submitting. Fixes: 84d4add793c6 ('lightnvm: add ioctls for vector I/Os') Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 1f79e3f141e64..6017153c24394 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -600,8 +600,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; - rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - if (ppa_buf && ppa_len) { ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); if (!ppa_list) { -- cgit v1.2.3 From 1a94b2d484677dc559c96251dd0e7c7b8811c378 Mon Sep 17 00:00:00 2001 From: Javier González Date: Fri, 13 Oct 2017 14:46:47 +0200 Subject: lightnvm: implement generic path for sync I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a generic path for sending sync I/O on LightNVM. This allows to reuse the standard synchronous path trough blk_execute_rq(), instead of implementing a wait_for_completion on the target side (e.g., pblk). 
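As a generic illustration of why this simplifies the targets (all example_* names below are placeholders; the real nvme path goes through blk_execute_rq() rather than a completion wrapper), the change amounts to hoisting the per-caller completion handling into a single synchronous submit helper:

struct example_req {
	void (*end_io)(struct example_req *req);
	void *private;
	int error;
};

static void example_end_io_sync(struct example_req *req)
{
	complete(req->private);
}

/* callers no longer declare their own on-stack completion and timeout */
static int example_submit_sync(struct example_req *req,
			       int (*submit)(struct example_req *))
{
	DECLARE_COMPLETION_ONSTACK(wait);
	int ret;

	req->end_io = example_end_io_sync;
	req->private = &wait;

	ret = submit(req);
	if (ret)
		return ret;

	wait_for_completion_io(&wait);
	return req->error;
}
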
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 25 +++++++++----- drivers/lightnvm/pblk-core.c | 74 ++++++++++++---------------------------- drivers/lightnvm/pblk-read.c | 21 ++---------- drivers/lightnvm/pblk-recovery.c | 31 ++--------------- drivers/lightnvm/pblk.h | 42 +++++++++++++++++++++-- drivers/nvme/host/lightnvm.c | 70 +++++++++++++++++++++++++++++-------- include/linux/lightnvm.h | 3 ++ 7 files changed, 143 insertions(+), 123 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 0e5f77234c79a..fe21f4dd33e91 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -720,12 +720,25 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_submit_io); -static void nvm_end_io_sync(struct nvm_rq *rqd) +int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { - struct completion *waiting = rqd->private; + struct nvm_dev *dev = tgt_dev->parent; + int ret; + + if (!dev->ops->submit_io_sync) + return -ENODEV; + + nvm_rq_tgt_to_dev(tgt_dev, rqd); - complete(waiting); + rqd->dev = tgt_dev; + + /* In case of error, fail with right address format */ + ret = dev->ops->submit_io_sync(dev, rqd); + nvm_rq_dev_to_tgt(tgt_dev, rqd); + + return ret; } +EXPORT_SYMBOL(nvm_submit_io_sync); int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas) @@ -733,25 +746,21 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, struct nvm_geo *geo = &tgt_dev->geo; struct nvm_rq rqd; int ret; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); rqd.opcode = NVM_OP_ERASE; - rqd.end_io = nvm_end_io_sync; - rqd.private = &wait; rqd.flags = geo->plane_mode >> 1; ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); if (ret) return ret; - ret = nvm_submit_io(tgt_dev, &rqd); + ret = nvm_submit_io_sync(tgt_dev, &rqd); if (ret) { pr_err("rrpr: erase I/O submission failed: %d\n", ret); goto free_ppa_list; } - wait_for_completion_io(&wait); free_ppa_list: nvm_free_rqd_ppalist(tgt_dev, &rqd); diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 4199119a0754a..ce90213a42fae 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -412,39 +412,33 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) struct nvm_tgt_dev *dev = pblk->dev; #ifdef CONFIG_NVM_DEBUG - struct ppa_addr *ppa_list; + int ret; - ppa_list = (rqd->nr_ppas > 1) ? 
rqd->ppa_list : &rqd->ppa_addr; - if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { - WARN_ON(1); - return -EINVAL; - } + ret = pblk_check_io(pblk, rqd); + if (ret) + return ret; +#endif - if (rqd->opcode == NVM_OP_PWRITE) { - struct pblk_line *line; - struct ppa_addr ppa; - int i; + atomic_inc(&pblk->inflight_io); - for (i = 0; i < rqd->nr_ppas; i++) { - ppa = ppa_list[i]; - line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + return nvm_submit_io(dev, rqd); +} - spin_lock(&line->lock); - if (line->state != PBLK_LINESTATE_OPEN) { - pr_err("pblk: bad ppa: line:%d,state:%d\n", - line->id, line->state); - WARN_ON(1); - spin_unlock(&line->lock); - return -EINVAL; - } - spin_unlock(&line->lock); - } - } +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + +#ifdef CONFIG_NVM_DEBUG + int ret; + + ret = pblk_check_io(pblk, rqd); + if (ret) + return ret; #endif atomic_inc(&pblk->inflight_io); - return nvm_submit_io(dev, rqd); + return nvm_submit_io_sync(dev, rqd); } static void pblk_bio_map_addr_endio(struct bio *bio) @@ -597,7 +591,6 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, int cmd_op, bio_op; int i, j; int ret; - DECLARE_COMPLETION_ONSTACK(wait); if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; @@ -639,8 +632,6 @@ next_rq: rqd.dma_ppa_list = dma_ppa_list; rqd.opcode = cmd_op; rqd.nr_ppas = rq_ppas; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; if (dir == PBLK_WRITE) { struct pblk_sec_meta *meta_list = rqd.meta_list; @@ -694,19 +685,14 @@ next_rq: } } - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { pr_err("pblk: emeta I/O submission failed: %d\n", ret); bio_put(bio); goto free_rqd_dma; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: emeta I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); if (rqd.error) { if (dir == PBLK_WRITE) @@ -750,7 +736,6 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, int i, ret; int cmd_op, bio_op; int flags; - DECLARE_COMPLETION_ONSTACK(wait); if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; @@ -787,8 +772,6 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, rqd.opcode = cmd_op; rqd.flags = flags; rqd.nr_ppas = lm->smeta_sec; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; for (i = 0; i < lm->smeta_sec; i++, paddr++) { struct pblk_sec_meta *meta_list = rqd.meta_list; @@ -807,17 +790,13 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, * the write thread is the only one sending write and erase commands, * there is no need to take the LUN semaphore. */ - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { pr_err("pblk: smeta I/O submission failed: %d\n", ret); bio_put(bio); goto free_ppa_list; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: smeta I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); if (rqd.error) { @@ -861,19 +840,15 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) { struct nvm_rq rqd; int ret = 0; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); pblk_setup_e_rq(pblk, &rqd, ppa); - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; - /* The write thread schedules erases so that it minimizes disturbances * with writes. 
Thus, there is no need to take the LUN semaphore. */ - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; @@ -886,11 +861,6 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) goto out; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: sync erase timed out\n"); - } - out: rqd.private = pblk; __pblk_end_io_erase(pblk, &rqd); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 71c58503f1a45..ca79d8fb3e607 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -207,7 +207,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, int nr_secs = rqd->nr_ppas; int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); int i, ret, hole; - DECLARE_COMPLETION_ONSTACK(wait); /* Re-use allocated memory for intermediate lbas */ lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); @@ -232,8 +231,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->bio = new_bio; rqd->nr_ppas = nr_holes; rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (unlikely(nr_holes == 1)) { ppa_ptr = rqd->ppa_list; @@ -241,18 +238,13 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->ppa_addr = rqd->ppa_list[0]; } - ret = pblk_submit_read_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { bio_put(rqd->bio); - pr_err("pblk: read IO submission failed\n"); + pr_err("pblk: sync read IO submission failed\n"); goto err; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: partial read I/O timed out\n"); - } - if (rqd->error) { atomic_long_inc(&pblk->read_failed); #ifdef CONFIG_NVM_DEBUG @@ -537,7 +529,6 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) struct nvm_rq rqd; int data_len; int ret = NVM_IO_OK; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); @@ -577,22 +568,16 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) bio_set_op_attrs(bio, REQ_OP_READ, 0); rqd.opcode = NVM_OP_PREAD; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; rqd.nr_ppas = gc_rq->secs_to_gc; rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd.bio = bio; - if (pblk_submit_read_io(pblk, &rqd)) { + if (pblk_submit_io_sync(pblk, &rqd)) { ret = -EIO; pr_err("pblk: GC read request failed\n"); goto err_free_bio; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: GC read I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); if (rqd.error) { diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 9772a947ca4fb..eadb3eb5d4dc8 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -216,7 +216,6 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, int rq_ppas, rq_len; int i, j; int ret = 0; - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -253,8 +252,6 @@ next_read_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -280,19 
+277,13 @@ next_read_rq: } /* If read fails, more padding is needed */ - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - return -EINTR; - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* At this point, the read should not fail. If it does, it is a problem * we cannot recover from here. Need FTL log. @@ -504,7 +495,6 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, int ret = 0; int rec_round; int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec; - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -539,8 +529,6 @@ next_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -566,18 +554,13 @@ next_rq: addr_to_gen_ppa(pblk, w_ptr, line->id); } - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* This should not happen since the read failed during normal recovery, * but the media works funny sometimes... @@ -645,7 +628,6 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, int i, j; int ret = 0; int left_ppas = pblk_calc_sec_in_line(pblk, line); - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -678,8 +660,6 @@ next_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -705,19 +685,14 @@ next_rq: addr_to_gen_ppa(pblk, paddr, line->id); } - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); bio_put(bio); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* Reached the end of the written line */ if (rqd->error) { diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 6c9ea9a93704f..6b64288de6f76 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -714,6 +714,7 @@ void pblk_discard(struct pblk *pblk, struct bio *bio); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd); int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, @@ -1203,7 +1204,6 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); } -#endif static inline int 
pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas) @@ -1224,14 +1224,50 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, ppa->g.sec < geo->sec_per_pg) continue; -#ifdef CONFIG_NVM_DEBUG print_ppa(ppa, "boundary", i); -#endif + return 1; } return 0; } +static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct ppa_addr *ppa_list; + + ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; + + if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { + WARN_ON(1); + return -EINVAL; + } + + if (rqd->opcode == NVM_OP_PWRITE) { + struct pblk_line *line; + struct ppa_addr ppa; + int i; + + for (i = 0; i < rqd->nr_ppas; i++) { + ppa = ppa_list[i]; + line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + + spin_lock(&line->lock); + if (line->state != PBLK_LINESTATE_OPEN) { + pr_err("pblk: bad ppa: line:%d,state:%d\n", + line->id, line->state); + WARN_ON(1); + spin_unlock(&line->lock); + return -EINVAL; + } + spin_unlock(&line->lock); + } + } + + return 0; +} +#endif + static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) { struct pblk_line_meta *lm = &pblk->lm; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 6017153c24394..8fc949c5b49b7 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -492,34 +492,47 @@ static void nvme_nvm_end_io(struct request *rq, blk_status_t status) blk_mq_free_request(rq); } -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static struct request *nvme_nvm_alloc_request(struct request_queue *q, + struct nvm_rq *rqd, + struct nvme_nvm_command *cmd) { - struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct request *rq; - struct bio *bio = rqd->bio; - struct nvme_nvm_command *cmd; - - cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); - if (!cmd) - return -ENOMEM; nvme_nvm_rqtocmd(rqd, ns, cmd); rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); - if (IS_ERR(rq)) { - kfree(cmd); - return PTR_ERR(rq); - } + if (IS_ERR(rq)) + return rq; + rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - if (bio) { - blk_init_request_from_bio(rq, bio); + if (rqd->bio) { + blk_init_request_from_bio(rq, rqd->bio); } else { rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); rq->__data_len = 0; } + return rq; +} + +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct nvme_nvm_command *cmd; + struct request *rq; + + cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + rq = nvme_nvm_alloc_request(q, rqd, cmd); + if (IS_ERR(rq)) { + kfree(cmd); + return PTR_ERR(rq); + } + rq->end_io_data = rqd; blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); @@ -527,6 +540,34 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) return 0; } +static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct request *rq; + struct nvme_nvm_command cmd; + int ret = 0; + + memset(&cmd, 0, sizeof(struct nvme_nvm_command)); + + rq = nvme_nvm_alloc_request(q, rqd, &cmd); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + /* I/Os can fail and the error is signaled through rqd. Callers must + * handle the error accordingly. 
+ */ + blk_execute_rq(q, NULL, rq, 0); + if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + + rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); + rqd->error = nvme_req(rq)->status; + + blk_mq_free_request(rq); + + return ret; +} + static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { struct nvme_ns *ns = nvmdev->q->queuedata; @@ -562,6 +603,7 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .set_bb_tbl = nvme_nvm_set_bb_tbl, .submit_io = nvme_nvm_submit_io, + .submit_io_sync = nvme_nvm_submit_io_sync, .create_dma_pool = nvme_nvm_create_dma_pool, .destroy_dma_pool = nvme_nvm_destroy_dma_pool, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 4f0e4a0fd2047..b7f111ff4d3b3 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -56,6 +56,7 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -69,6 +70,7 @@ struct nvm_dev_ops { nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; + nvm_submit_io_sync_fn *submit_io_sync; nvm_create_dma_pool_fn *create_dma_pool; nvm_destroy_dma_pool_fn *destroy_dma_pool; @@ -477,6 +479,7 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); +extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From cdd094fd0ad750c94ccaa5b2ee84fd8264f6f433 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Oct 2017 09:36:06 -0600 Subject: Revert "lightnvm: prevent bd removal if busy" Christoph correctly points out that this issue is no different for other block devices, and poking at cross layer internals is not the right way to solve it. This reverts commit bb6aa6f08268bbce4e0185b18cab9e04505d6695. 
Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index fe21f4dd33e91..83249b43dd069 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -390,7 +390,6 @@ static void __nvm_remove_target(struct nvm_target *t) static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) { struct nvm_target *t; - struct block_device *bdev; mutex_lock(&dev->mlock); t = nvm_find_target(dev, remove->tgtname); @@ -398,19 +397,6 @@ static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) mutex_unlock(&dev->mlock); return 1; } - bdev = bdget_disk(t->disk, 0); - if (!bdev) { - pr_err("nvm: removal failed, allocating bd failed\n"); - mutex_unlock(&dev->mlock); - return -ENOMEM; - } - if (bdev->bd_super || bdev->bd_part_count) { - pr_err("nvm: removal failed, block device busy\n"); - bdput(bdev); - mutex_unlock(&dev->mlock); - return -EBUSY; - } - bdput(bdev); __nvm_remove_target(t); mutex_unlock(&dev->mlock); -- cgit v1.2.3 From 23c4490d2b7f78a855f9c5675e26735fb61afe84 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 14 Oct 2017 00:26:28 +0800 Subject: null_blk: update usage hints for submit_queues update the range of submits_queues, and correct usage hints. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- Documentation/block/null_blk.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index 3140dbd860d8c..1f8d92c704585 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -55,10 +55,10 @@ irqmode=[0-2]: Default: 1-Soft-irq completion_nsec=[ns]: Default: 10.000ns Combined with irqmode=2 (timer). The time each completion event must wait. -submit_queues=[0..nr_cpus]: +submit_queues=[1..nr_cpus]: The number of submission queues attached to the device driver. If unset, it - defaults to 1 on single-queue and bio-based instances. For multi-queue, - it is ignored when use_per_node_hctx module parameter is 1. + defaults to 1. For multi-queue, it is ignored when use_per_node_hctx module + parameter is 1. hw_queue_depth=[0..qdepth]: Default: 64 The hardware queue depth of the device. -- cgit v1.2.3 From fc186311f2233bb3f69f22b96babdf16a57c88a9 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 14 Oct 2017 00:26:54 +0800 Subject: null_blk: add usage hints for no_sched This parameter provide an option to disable io scheduler when nullb* in multi-queue mode. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- Documentation/block/null_blk.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index 1f8d92c704585..b7161c8e70f2e 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -73,3 +73,7 @@ use_per_node_hctx=[0/1]: Default: 0 use_lightnvm=[0/1]: Default: 0 Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled. + +no_sched=[0/1]: Default: 0 + 0: nullb* use default blk-mq io scheduler. + 1: nullb* doesn't use io scheduler. -- cgit v1.2.3 From 515c24c13c7ff1262cdb40fe631c6391e99c0996 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 14 Oct 2017 16:38:27 +0800 Subject: mm/page-writeback.c: make changes of dirty_writeback_centisecs take effect immediately This patch is the followup of the prvious patch: [writeback: schedule periodic writeback with sysctl]. 
There's another issue to fix. For example, - When the tunable was set to one hour and is reset to one second, the new setting will not take effect for up to one hour. Kicking the flusher threads immediately fixes it. Cc: Jens Axboe Cc: Jan Kara Cc: Andrew Morton Signed-off-by: Yafang Shao Signed-off-by: Jens Axboe --- mm/page-writeback.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 622a18c114caa..c518c845f2025 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1976,7 +1976,16 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, int ret; ret = proc_dointvec(table, write, buffer, length, ppos); - if (!ret && !old_interval && dirty_writeback_interval) + + /* + * Writing 0 to dirty_writeback_interval will disable periodic writeback + * and a different non-zero value will wakeup the writeback threads. + * wb_wakeup_delayed() would be more appropriate, but it's a pain to + * iterate over all bdis and wbs. + * The reason we do this is to make the change take effect immediately. + */ + if (!ret && write && dirty_writeback_interval && + dirty_writeback_interval != old_interval) wakeup_flusher_threads(WB_REASON_PERIODIC); return ret; -- cgit v1.2.3 From 761f2e1ed8822cf12378c857de337290d0c82a81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 18:46:46 +0200 Subject: nvme: simplify compat_ioctl handling We can just use our normal ioctl handler for the compat case and remove the boilerplate code for it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy --- drivers/nvme/host/core.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 26c8913435b25..573cc3b59bfad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1052,16 +1052,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, } } -#ifdef CONFIG_COMPAT -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif - static int nvme_open(struct block_device *bdev, fmode_t mode) { return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO; @@ -1380,7 +1370,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, + .compat_ioctl = nvme_ioctl, .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, -- cgit v1.2.3 From 58f913dce2814a9ea7260e93ed3a949e0d5565e3 Mon Sep 17 00:00:00 2001 From: Peter Foley Date: Fri, 13 Oct 2017 16:35:28 -0700 Subject: bcache: Avoid nested function definition Fixes below error with clang: ../drivers/md/bcache/sysfs.c:759:3: error: function definition is not allowed here { return *((uint16_t *) r) - *((uint16_t *) l); } ^ ../drivers/md/bcache/sysfs.c:789:32: error: use of undeclared identifier 'cmp' sort(p, n, sizeof(uint16_t), cmp, NULL); ^ 2 errors generated. 
v2: rename function to __bch_cache_cmp Signed-off-by: Peter Foley Reviewed-by: Coly Li Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 104c57cd666ce..69f355b9650ca 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -745,6 +745,11 @@ static struct attribute *bch_cache_set_internal_files[] = { }; KTYPE(bch_cache_set_internal); +static int __bch_cache_cmp(const void *l, const void *r) +{ + return *((uint16_t *)r) - *((uint16_t *)l); +} + SHOW(__bch_cache) { struct cache *ca = container_of(kobj, struct cache, kobj); @@ -769,9 +774,6 @@ SHOW(__bch_cache) CACHE_REPLACEMENT(&ca->sb)); if (attr == &sysfs_priority_stats) { - int cmp(const void *l, const void *r) - { return *((uint16_t *) r) - *((uint16_t *) l); } - struct bucket *b; size_t n = ca->sb.nbuckets, i; size_t unused = 0, available = 0, dirty = 0, meta = 0; @@ -800,7 +802,7 @@ SHOW(__bch_cache) p[i] = ca->buckets[i].prio; mutex_unlock(&ca->set->bucket_lock); - sort(p, n, sizeof(uint16_t), cmp, NULL); + sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL); while (n && !cached[n - 1]) -- cgit v1.2.3 From 91af8300d9c1d7c6b6a2fd754109e08d4798b8d8 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 13 Oct 2017 16:35:29 -0700 Subject: bcache: check ca->alloc_thread initialized before wake up it In bcache code, sysfs entries are created before all resources get allocated, e.g. allocation thread of a cache set. There is posibility for NULL pointer deference if a resource is accessed but which is not initialized yet. Indeed Jorg Bornschein catches one on cache set allocation thread and gets a kernel oops. The reason for this bug is, when bch_bucket_alloc() is called during cache set registration and attaching, ca->alloc_thread is not properly allocated and initialized yet, call wake_up_process() on ca->alloc_thread triggers NULL pointer deference failure. A simple and fast fix is, before waking up ca->alloc_thread, checking whether it is allocated, and only wake up ca->alloc_thread when it is not NULL. Signed-off-by: Coly Li Reported-by: Jorg Bornschein Cc: Kent Overstreet Cc: stable@vger.kernel.org Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index cacbe2dbd5c31..e1a4205caa2e1 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -406,7 +406,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) finish_wait(&ca->set->bucket_wait, &w); out: - wake_up_process(ca->alloc_thread); + if (ca->alloc_thread) + wake_up_process(ca->alloc_thread); trace_bcache_alloc(ca, reserve); -- cgit v1.2.3 From b1e8139e48b58e3bc1234e619c750ffd1394be2f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 13 Oct 2017 16:35:30 -0700 Subject: bcache: fix a comments typo in bch_alloc_sectors() Code comments in alloc.c:bch_alloc_sectors() mentions a function name find_data_bucket(), the correct function name should be pick_data_bucket() indeed. bch_alloc_sectors() is a quite important function in bcache allocation code, fixing the typo may help other people to have less confusion. 
Signed-off-by: Coly Li Reviewed-by: Tang Junhui Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e1a4205caa2e1..4c40870e99f5a 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -601,7 +601,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, /* * If we had to allocate, we might race and not need to allocate the - * second time we call find_data_bucket(). If we allocated a bucket but + * second time we call pick_data_bucket(). If we allocated a bucket but * didn't use it, drop the refcount bch_bucket_alloc_set() took: */ if (KEY_PTRS(&alloc.key)) -- cgit v1.2.3 From 1dbe32ad0a82f39c6dfb7667c5da5c23b9333664 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 13 Oct 2017 16:35:31 -0700 Subject: bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. 
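To make the index/minor separation concrete, here is a minimal worked example (illustrative values only; it assumes the BCACHE_MINORS of 128 and the usual 20-bit MINORBITS, matching the helpers added to super.c below):

/* Sketch of the index <-> first_minor mapping; numbers are examples. */
#define BCACHE_MINORS          128     /* partitions per bcache device */
#define BCACHE_DEVICE_IDX_MAX  ((1U << MINORBITS) / BCACHE_MINORS)  /* 8192 */

/*
 * Device index 2 is named /dev/bcache2 and gets
 *      first_minor = idx_to_first_minor(2) = 2 * 128 = 256,
 * reserving minors 256..383 for its partitions.  On removal,
 *      first_minor_to_idx(256) = 256 / 128 = 2
 * recovers the single bit to release from the bcache_device_idx ida.
 */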
[minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li Cc: Eric Wheeler Cc: Junhui Tang Reviewed-by: Michael Lyle Signed-off-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fc0a31b13ac42..a478d1ac0480c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -53,12 +53,15 @@ LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); static int bcache_major; -static DEFINE_IDA(bcache_minor); +static DEFINE_IDA(bcache_device_idx); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) -#define BCACHE_MINORS 16 /* partition support */ +/* limitation of partitions number on single bcache device */ +#define BCACHE_MINORS 128 +/* limitation of bcache devices number on single system */ +#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS) /* Superblock */ @@ -721,6 +724,16 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, closure_get(&c->caching); } +static inline int first_minor_to_idx(int first_minor) +{ + return (first_minor/BCACHE_MINORS); +} + +static inline int idx_to_first_minor(int idx) +{ + return (idx * BCACHE_MINORS); +} + static void bcache_device_free(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); @@ -734,7 +747,8 @@ static void bcache_device_free(struct bcache_device *d) if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); if (d->disk) { - ida_simple_remove(&bcache_minor, d->disk->first_minor); + ida_simple_remove(&bcache_device_idx, + first_minor_to_idx(d->disk->first_minor)); put_disk(d->disk); } @@ -751,7 +765,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, { struct request_queue *q; size_t n; - int minor; + int idx; if (!d->stripe_size) d->stripe_size = 1 << 31; @@ -776,25 +790,24 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->full_dirty_stripes) return -ENOMEM; - minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); - if (minor < 0) - return minor; - - minor *= BCACHE_MINORS; + idx = ida_simple_get(&bcache_device_idx, 0, + BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); + if (idx < 0) + return idx; if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS | BIOSET_NEED_RESCUER)) || !(d->disk = alloc_disk(BCACHE_MINORS))) { - ida_simple_remove(&bcache_minor, minor); + ida_simple_remove(&bcache_device_idx, idx); return -ENOMEM; } set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); d->disk->major = bcache_major; - d->disk->first_minor = minor; + d->disk->first_minor = idx_to_first_minor(idx); d->disk->fops = &bcache_ops; d->disk->private_data = d; -- cgit v1.2.3 From e89d67596e202119ea846cc997f4cf75cb284490 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Fri, 13 Oct 2017 16:35:32 -0700 Subject: bcache: Remove redundant set_capacity set_capacity() has been called in bcache_device_init(), remove the redundant one. 
Signed-off-by: Yijing Wang Reviewed-by: Eric Wheeler Acked-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a478d1ac0480c..72c3b2929ef02 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1142,9 +1142,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) if (ret) return ret; - set_capacity(dc->disk.disk, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - dc->disk.disk->queue->backing_dev_info->ra_pages = max(dc->disk.disk->queue->backing_dev_info->ra_pages, q->backing_dev_info->ra_pages); -- cgit v1.2.3 From b41c9b0266e8370033a7799f6806bfc70b7fd75f Mon Sep 17 00:00:00 2001 From: Eric Wheeler Date: Fri, 13 Oct 2017 16:35:33 -0700 Subject: bcache: update bio->bi_opf bypass/writeback REQ_ flag hints Flag for bypass if the IO is for read-ahead or background, unless the read-ahead request is for metadata (eg, from gfs2). Bypass if: bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && !(bio->bi_opf & REQ_META)) Writeback if: op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) Signed-off-by: Eric Wheeler Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 8 ++++++++ drivers/md/bcache/writeback.h | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 681b4f12b05a0..9ee137e8d387b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -384,6 +384,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) op_is_write(bio_op(bio)))) goto skip; + /* + * Flag for bypass if the IO is for read-ahead or background, + * unless the read-ahead request is for metadata (eg, for gfs2). + */ + if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && + !(bio->bi_opf & REQ_META)) + goto skip; + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || bio_sectors(bio) & (c->sb.block_size - 1)) { pr_debug("skipping unaligned io"); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index e35421d20d2eb..34bcf49d737bd 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -76,7 +76,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; - return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK; + return (op_is_sync(bio->bi_opf) || + bio->bi_opf & (REQ_META|REQ_PRIO) || + in_use <= CUTOFF_WRITEBACK); } static inline void bch_writeback_queue(struct cached_dev *dc) -- cgit v1.2.3 From 238501027abf0386fa5f5dcaf589f406eb187bc3 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Fri, 13 Oct 2017 16:35:34 -0700 Subject: bcache: remove unused parameter Parameter bio is no longer used, clean it. 
Signed-off-by: Yijing Wang Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 9ee137e8d387b..163a17a808741 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -26,12 +26,12 @@ struct kmem_cache *bch_search_cache; static void bch_data_insert_start(struct closure *); -static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) +static unsigned cache_mode(struct cached_dev *dc) { return BDEV_CACHE_MODE(&dc->sb); } -static bool verify(struct cached_dev *dc, struct bio *bio) +static bool verify(struct cached_dev *dc) { return dc->verify; } @@ -369,7 +369,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) { struct cache_set *c = dc->disk.c; - unsigned mode = cache_mode(dc, bio); + unsigned mode = cache_mode(dc); unsigned sectors, congested = bch_get_congested(c); struct task_struct *task = current; struct io *i; @@ -747,7 +747,7 @@ static void cached_dev_read_done(struct closure *cl) s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) + if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); bio_complete(s); @@ -772,7 +772,7 @@ static void cached_dev_read_done_bh(struct closure *cl) if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); - else if (s->iop.bio || verify(dc, &s->bio.bio)) + else if (s->iop.bio || verify(dc)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); @@ -899,7 +899,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) s->iop.bypass = true; if (should_writeback(dc, s->orig_bio, - cache_mode(dc, bio), + cache_mode(dc), s->iop.bypass)) { s->iop.bypass = false; s->iop.writeback = true; -- cgit v1.2.3 From 5fa89fb9a86bcc0f0b3f21ab6087a8a4170dcd2c Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:35 -0700 Subject: bcache: don't write back data if reading it failed If an IO operation fails, and we didn't successfully read data from the cache, don't writeback invalid/partial data to the backing disk. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index e663ca082183e..5e65a392287d0 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -179,13 +179,21 @@ static void write_dirty(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; - dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); - io->bio.bi_iter.bi_sector = KEY_START(&w->key); - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; + /* + * IO errors are signalled using the dirty bit on the key. + * If we failed to read, we should not attempt to write to the + * backing device. Instead, immediately go to write_dirty_finish + * to clean up. 
+ */ + if (KEY_DIRTY(&w->key)) { + dirty_init(w); + bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); + io->bio.bi_iter.bi_sector = KEY_START(&w->key); + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; - closure_bio_submit(&io->bio, cl); + closure_bio_submit(&io->bio, cl); + } continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } -- cgit v1.2.3 From 1d316e658374f700fdfff9299c70ce65d8d145e6 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:36 -0700 Subject: bcache: implement PI controller for writeback rate bcache uses a control system to attempt to keep the amount of dirty data in cache at a user-configured level, while not responding excessively to transients and variations in write rate. Previously, the system was a PD controller; but the output from it was integrated, turning the Proportional term into an Integral term, and turning the Derivative term into a crude Proportional term. Performance of the controller has been uneven in production, and it has tended to respond slowly, oscillate, and overshoot. This patch set replaces the current control system with an explicit PI controller and tuning that should be correct for most hardware. By default, it attempts to write at a rate that would retire 1/40th of the current excess blocks per second. An integral term in turn works to remove steady state errors. IMO, this yields benefits in simplicity (removing weighted average filtering, etc) and system performance. Another small change is a tunable parameter is introduced to allow the user to specify a minimum rate at which dirty blocks are retired. There is a slight difference from earlier versions of the patch in integral handling to prevent excessive negative integral windup. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 9 ++--- drivers/md/bcache/sysfs.c | 18 +++++---- drivers/md/bcache/writeback.c | 91 ++++++++++++++++++++++++------------------- 3 files changed, 66 insertions(+), 52 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 2ed9bd231d84b..eb83be693d60b 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -265,9 +265,6 @@ struct bcache_device { atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; - unsigned long sectors_dirty_last; - long sectors_dirty_derivative; - struct bio_set *bio_split; unsigned data_csum:1; @@ -362,12 +359,14 @@ struct cached_dev { uint64_t writeback_rate_target; int64_t writeback_rate_proportional; - int64_t writeback_rate_derivative; + int64_t writeback_rate_integral; + int64_t writeback_rate_integral_scaled; int64_t writeback_rate_change; unsigned writeback_rate_update_seconds; - unsigned writeback_rate_d_term; + unsigned writeback_rate_i_term_inverse; unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; }; enum alloc_reserve { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 69f355b9650ca..2290bffd49228 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -81,8 +81,9 @@ rw_attribute(writeback_delay); rw_attribute(writeback_rate); rw_attribute(writeback_rate_update_seconds); -rw_attribute(writeback_rate_d_term); +rw_attribute(writeback_rate_i_term_inverse); rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_minimum); read_attribute(writeback_rate_debug); read_attribute(stripe_size); @@ -130,15 +131,16 @@ SHOW(__bch_cached_dev) sysfs_hprint(writeback_rate, dc->writeback_rate.rate 
<< 9); var_print(writeback_rate_update_seconds); - var_print(writeback_rate_d_term); + var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_minimum); if (attr == &sysfs_writeback_rate_debug) { char rate[20]; char dirty[20]; char target[20]; char proportional[20]; - char derivative[20]; + char integral[20]; char change[20]; s64 next_io; @@ -146,7 +148,7 @@ SHOW(__bch_cached_dev) bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); bch_hprint(proportional,dc->writeback_rate_proportional << 9); - bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); bch_hprint(change, dc->writeback_rate_change << 9); next_io = div64_s64(dc->writeback_rate.next - local_clock(), @@ -157,11 +159,11 @@ SHOW(__bch_cached_dev) "dirty:\t\t%s\n" "target:\t\t%s\n" "proportional:\t%s\n" - "derivative:\t%s\n" + "integral:\t%s\n" "change:\t\t%s/sec\n" "next io:\t%llims\n", rate, dirty, target, proportional, - derivative, change, next_io); + integral, change, next_io); } sysfs_hprint(dirty_data, @@ -213,7 +215,7 @@ STORE(__cached_dev) dc->writeback_rate.rate, 1, INT_MAX); d_strtoul_nonzero(writeback_rate_update_seconds); - d_strtoul(writeback_rate_d_term); + d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); d_strtoi_h(sequential_cutoff); @@ -319,7 +321,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_percent, &sysfs_writeback_rate, &sysfs_writeback_rate_update_seconds, - &sysfs_writeback_rate_d_term, + &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_debug, &sysfs_dirty_data, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 5e65a392287d0..cac8678da5d06 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -25,48 +25,62 @@ static void __update_writeback_rate(struct cached_dev *dc) bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); - int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), c->cached_dev_sectors); - /* PD controller */ - + /* + * PI controller: + * Figures out the amount that should be written per second. + * + * First, the error (number of sectors that are dirty beyond our + * target) is calculated. The error is accumulated (numerically + * integrated). + * + * Then, the proportional value and integral value are scaled + * based on configured values. These are stored as inverses to + * avoid fixed point math and to make configuration easy-- e.g. + * the default value of 40 for writeback_rate_p_term_inverse + * attempts to write at a rate that would retire all the dirty + * blocks in 40 seconds. + * + * The writeback_rate_i_inverse value of 10000 means that 1/10000th + * of the error is accumulated in the integral term per second. + * This acts as a slow, long-term average that is not subject to + * variations in usage like the p term. 
+ */ int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); - int64_t derivative = dirty - dc->disk.sectors_dirty_last; - int64_t proportional = dirty - target; - int64_t change; - - dc->disk.sectors_dirty_last = dirty; - - /* Scale to sectors per second */ - - proportional *= dc->writeback_rate_update_seconds; - proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); - - derivative = div_s64(derivative, dc->writeback_rate_update_seconds); - - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - (dc->writeback_rate_d_term / - dc->writeback_rate_update_seconds) ?: 1, 0); - - derivative *= dc->writeback_rate_d_term; - derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); - - change = proportional + derivative; + int64_t error = dirty - target; + int64_t proportional_scaled = + div_s64(error, dc->writeback_rate_p_term_inverse); + int64_t integral_scaled, new_rate; + + if ((error < 0 && dc->writeback_rate_integral > 0) || + (error > 0 && time_before64(local_clock(), + dc->writeback_rate.next + NSEC_PER_MSEC))) { + /* + * Only decrease the integral term if it's more than + * zero. Only increase the integral term if the device + * is keeping up. (Don't wind up the integral + * ineffectively in either case). + * + * It's necessary to scale this by + * writeback_rate_update_seconds to keep the integral + * term dimensioned properly. + */ + dc->writeback_rate_integral += error * + dc->writeback_rate_update_seconds; + } - /* Don't increase writeback rate if the device isn't keeping up */ - if (change > 0 && - time_after64(local_clock(), - dc->writeback_rate.next + NSEC_PER_MSEC)) - change = 0; + integral_scaled = div_s64(dc->writeback_rate_integral, + dc->writeback_rate_i_term_inverse); - dc->writeback_rate.rate = - clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, - 1, NSEC_PER_MSEC); + new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_MSEC); - dc->writeback_rate_proportional = proportional; - dc->writeback_rate_derivative = derivative; - dc->writeback_rate_change = change; + dc->writeback_rate_proportional = proportional_scaled; + dc->writeback_rate_integral_scaled = integral_scaled; + dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; + dc->writeback_rate.rate = new_rate; dc->writeback_rate_target = target; } @@ -499,8 +513,6 @@ void bch_sectors_dirty_init(struct bcache_device *d) bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - - d->sectors_dirty_last = bcache_dev_sectors_dirty(d); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -514,10 +526,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_percent = 10; dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; + dc->writeback_rate_minimum = 1; dc->writeback_rate_update_seconds = 5; - dc->writeback_rate_d_term = 30; - dc->writeback_rate_p_term_inverse = 6000; + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } -- cgit v1.2.3 From ae82ddbfeb359fcffa97be5fb5bcd59165f2864f Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:37 -0700 Subject: bcache: smooth writeback rate control This works in conjunction with the new PI controller. Currently, in real-world workloads, the rate controller attempts to write back 1 sector per second. 
In practice, these minimum-rate writebacks are between 4k and 60k in test scenarios, since bcache aggregates and attempts to do contiguous writes and because filesystems on top of bcachefs typically write 4k or more. Previously, bcache used to guarantee to write at least once per second. This means that the actual writeback rate would exceed the configured amount by a factor of 8-120 or more. This patch adjusts to be willing to sleep up to 2.5 seconds, and to target writing 4k/second. On the smallest writes, it will sleep 1 second like before, but many times it will sleep longer and load the backing device less. This keeps the loading on the cache and backing device related to writeback more consistent when writing back at low rates. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/util.c | 10 ++++++++-- drivers/md/bcache/writeback.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 176d3c2ef5f5c..4dbe37e828778 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) d->next += div_u64(done * NSEC_PER_SEC, d->rate); - if (time_before64(now + NSEC_PER_SEC, d->next)) - d->next = now + NSEC_PER_SEC; + /* Bound the time. Don't let us fall further than 2 seconds behind + * (this prevents unnecessary backlog that would make it impossible + * to catch up). If we're ahead of the desired writeback rate, + * don't let us sleep more than 2.5 seconds (so we can notice/respond + * if the control system tells us to speed up!). + */ + if (time_before64(now + NSEC_PER_SEC * 5 / 2, d->next)) + d->next = now + NSEC_PER_SEC * 5 / 2; if (time_after64(now - NSEC_PER_SEC * 2, d->next)) d->next = now - NSEC_PER_SEC * 2; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index cac8678da5d06..8deb721c355e9 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -526,7 +526,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_percent = 10; dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; - dc->writeback_rate_minimum = 1; + dc->writeback_rate_minimum = 8; dc->writeback_rate_update_seconds = 5; dc->writeback_rate_p_term_inverse = 40; -- cgit v1.2.3 From e41166c5c44e30dbd620f7c77a27efe5d5cc551a Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:38 -0700 Subject: bcache: writeback rate shouldn't artifically clamp The previous code artificially limited writeback rate to 1000000 blocks/second (NSEC_PER_MSEC), which is a rate that can be met on fast hardware. The rate limiting code works fine (though with decreased precision) up to 3 orders of magnitude faster, so use NSEC_PER_SEC. Additionally, ensure that uint32_t is used as a type for rate throughout the rate management so that type checking/clamp_t can work properly. bch_next_delay should be rewritten for increased precision and better handling of high rates and long sleep periods, but this is adequate for now. 
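A rough back-of-the-envelope check of the new bound (illustrative numbers only; it assumes the rate is expressed in sectors per second, as elsewhere in this series):

/*
 * With the clamp raised from NSEC_PER_MSEC to NSEC_PER_SEC the rate still
 * fits a uint32_t (10^9 < 2^32), and the delay bch_next_delay() adds per
 * call stays plain integer math:
 *
 *      delta_ns = div_u64(done * NSEC_PER_SEC, rate);
 *
 * e.g. done = 1024 sectors at rate = 10^9 gives delta_ns = 1024 ns, while
 * at the old 10^6 ceiling the same work accounted for ~1 us per sector.
 * Precision drops near the top of the range, but for realistic per-call
 * 'done' counts the 64-bit multiply does not overflow.
 */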
Signed-off-by: Michael Lyle Reported-by: Coly Li Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/util.h | 4 ++-- drivers/md/bcache/writeback.c | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index eb83be693d60b..d77c4829c4972 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -361,7 +361,7 @@ struct cached_dev { int64_t writeback_rate_proportional; int64_t writeback_rate_integral; int64_t writeback_rate_integral_scaled; - int64_t writeback_rate_change; + int32_t writeback_rate_change; unsigned writeback_rate_update_seconds; unsigned writeback_rate_i_term_inverse; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cb8d2ccbb6c6e..8f509290bb02e 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -441,10 +441,10 @@ struct bch_ratelimit { uint64_t next; /* - * Rate at which we want to do work, in units per nanosecond + * Rate at which we want to do work, in units per second * The units here correspond to the units passed to bch_next_delay() */ - unsigned rate; + uint32_t rate; }; static inline void bch_ratelimit_reset(struct bch_ratelimit *d) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8deb721c355e9..897d28050656b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -52,7 +52,8 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t error = dirty - target; int64_t proportional_scaled = div_s64(error, dc->writeback_rate_p_term_inverse); - int64_t integral_scaled, new_rate; + int64_t integral_scaled; + uint32_t new_rate; if ((error < 0 && dc->writeback_rate_integral > 0) || (error > 0 && time_before64(local_clock(), @@ -74,8 +75,8 @@ static void __update_writeback_rate(struct cached_dev *dc) integral_scaled = div_s64(dc->writeback_rate_integral, dc->writeback_rate_i_term_inverse); - new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled), - dc->writeback_rate_minimum, NSEC_PER_MSEC); + new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_SEC); dc->writeback_rate_proportional = proportional_scaled; dc->writeback_rate_integral_scaled = integral_scaled; -- cgit v1.2.3 From a8500fc816b19795756d27c762daa5e19f5e1b6f Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:39 -0700 Subject: bcache: rearrange writeback main thread ratelimit The time spent searching for things to write back "counts" for the actual rate achieved, so don't flush the accumulated rate with each chunk. This will maintain better fidelity to user-commanded rates, but it may slightly increase the burstiness of writeback. The writeback lock needs improvement to help mitigate this. 
Signed-off-by: Michael Lyle Reviewed-by: Kent Overstreet Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 897d28050656b..9b770b13bdf62 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -440,6 +440,8 @@ static int bch_writeback_thread(void *arg) struct cached_dev *dc = arg; bool searched_full_index; + bch_ratelimit_reset(&dc->writeback_rate); + while (!kthread_should_stop()) { down_write(&dc->writeback_lock); if (!atomic_read(&dc->has_dirty) || @@ -467,7 +469,6 @@ static int bch_writeback_thread(void *arg) up_write(&dc->writeback_lock); - bch_ratelimit_reset(&dc->writeback_rate); read_dirty(dc); if (searched_full_index) { @@ -477,6 +478,8 @@ static int bch_writeback_thread(void *arg) !kthread_should_stop() && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); + + bch_ratelimit_reset(&dc->writeback_rate); } } -- cgit v1.2.3 From 6446c684f9418d0175c9c3e5134e7744fe79181a Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Fri, 13 Oct 2017 16:35:40 -0700 Subject: bcache: safeguard a dangerous addressing in closure_queue The use of the union reduces the size of closure struct by taking advantage of the current size of its members. The offset of func in work_struct equals the size of the first three members, so that work.work_func will just reference the forth member - fn. This is smart but dangerous. It can be broken if work_struct or the other structs get changed, and can be a bit difficult to debug. Signed-off-by: Liang Chen Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/closure.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 295b7e43f92c2..00fb314cce57c 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -251,6 +251,12 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, static inline void closure_queue(struct closure *cl) { struct workqueue_struct *wq = cl->wq; + /** + * Changes made to closure, work_struct, or a couple of other structs + * may cause work.func not pointing to the right location. + */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); if (wq) { INIT_WORK(&cl->work, cl->work.func); BUG_ON(!queue_work(wq, &cl->work)); -- cgit v1.2.3 From 77c77a98f8a4dd4d33810e25a8780f65d6fa00e9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Oct 2017 16:35:41 -0700 Subject: bcache: Add Michael Lyle to MAINTAINERS Signed-off-by: Kent Overstreet Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6671f375f7fcd..4654cf5004990 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2562,6 +2562,7 @@ S: Maintained F: drivers/net/hamradio/baycom* BCACHE (BLOCK LAYER CACHE) +M: Michael Lyle M: Kent Overstreet L: linux-bcache@vger.kernel.org W: http://bcache.evilpiepirate.org -- cgit v1.2.3 From 52b69ff54aa0d163253dbd7ff80aaecf179dfbe3 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:42 -0700 Subject: bcache: MAINTAINERS: set bcache to MAINTAINED Also add URL for IRC channel. 
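The aliasing being guarded is easier to see in a simplified sketch (the placeholder members p1/p2 stand in for the real intermediate fields of struct closure; only wq, fn and work matter here):

struct closure {                                /* simplified sketch */
        union {
                struct {
                        struct workqueue_struct *wq;    /* 1st */
                        void                    *p1;    /* 2nd (placeholder) */
                        void                    *p2;    /* 3rd (placeholder) */
                        closure_fn              *fn;    /* 4th */
                };
                struct work_struct              work;
        };
};

/*
 * work_struct is laid out as { atomic_long_t data; struct list_head entry;
 * work_func_t func; }, so offsetof(struct work_struct, func) happens to
 * equal the combined size of the three members that precede fn.  That is
 * why INIT_WORK(&cl->work, cl->work.func) reads fn through the union --
 * and why it silently breaks if either struct changes, which is exactly
 * what the BUILD_BUG_ON() added below catches at compile time.
 */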
Signed-off-by: Michael Lyle Signed-off-by: Jens Axboe --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4654cf5004990..c0bbcf1c81884 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2566,7 +2566,8 @@ M: Michael Lyle M: Kent Overstreet L: linux-bcache@vger.kernel.org W: http://bcache.evilpiepirate.org -S: Orphan +C: irc://irc.oftc.net/bcache +S: Maintained F: drivers/md/bcache/ BDISP ST MEDIA DRIVER -- cgit v1.2.3 From 9ce762e85bc95fd7aa1fb5f8cfc38ce5a228fc95 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Mon, 16 Oct 2017 10:34:47 -0700 Subject: bcache: writeback rate clamping: make 32 bit safe Sorry this got through to linux-block, was detected by the kbuilds test robot. NSEC_PER_SEC is a long constant; 2.5 * 10^9 doesn't fit in a signed long constant. Fixes: e41166c5c44e ("bcache: writeback rate shouldn't artifically clamp") Reviewed-by: Coly Li Signed-off-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 4dbe37e828778..e548b8b513227 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -238,8 +238,8 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) * don't let us sleep more than 2.5 seconds (so we can notice/respond * if the control system tells us to speed up!). */ - if (time_before64(now + NSEC_PER_SEC * 5 / 2, d->next)) - d->next = now + NSEC_PER_SEC * 5 / 2; + if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next)) + d->next = now + NSEC_PER_SEC * 5LLU / 2LLU; if (time_after64(now - NSEC_PER_SEC * 2, d->next)) d->next = now - NSEC_PER_SEC * 2; -- cgit v1.2.3 From 519c8e9ffd86143fedd84cf833a09f36b47d0f5c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Oct 2017 11:01:00 -0700 Subject: block: fix Sphinx kernel-doc warning Sphinx treats symbols that end with '_' as a kind of special documentation indicator, so fix that by adding an ending '*' to it. ../block/bio.c:404: ERROR: Unknown target name: "gfp". Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index bf0dbe8f78f8e..ae9ad34e6a71f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -400,7 +400,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs) /** * bio_alloc_bioset - allocate a bio for I/O - * @gfp_mask: the GFP_ mask given to the slab allocator + * @gfp_mask: the GFP_* mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * -- cgit v1.2.3 From 30c516d750396c5f3ec9cb04c9e025c25e91495e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 17 Oct 2017 12:11:46 +0000 Subject: nullb: fix error return code in null_init() Fix to return error code -ENOMEM from the null_alloc_dev() error handling case instead of 0, as done elsewhere in this function. 
Fixes: 2984c8684f96 ("nullb: factor disk parameters") Signed-off-by: Wei Yongjun Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index bf2c8ca3242aa..50c83c4b2ea02 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -1991,8 +1991,10 @@ static int __init null_init(void) for (i = 0; i < nr_devices; i++) { dev = null_alloc_dev(); - if (!dev) + if (!dev) { + ret = -ENOMEM; goto err_dev; + } ret = null_add_dev(dev); if (ret) { null_free_dev(dev); -- cgit v1.2.3 From 8cf466602028196b939255f1eb4e9817efd1db6d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 11 Oct 2017 10:39:15 -0700 Subject: kyber: fix hang on domain token wait queue When we're getting a domain token, if we fail to get a token on our first attempt, we put the current hardware queue on a wait queue and then try again just in case a token was freed after our initial attempt but before we got on the wait queue. If this second attempt succeeds, we currently leave the hardware queue on the wait queue. Usually this is okay; we'll just run the hardware queue one extra time when another token is freed. However, if the hardware queue doesn't have any other requests waiting, then when it it gets the extra wakeup, it won't have anything to free and therefore won't wake up any other hardware queues. If tokens are limited, then we won't make forward progress and the device will hang. Reported-by: Bin Zha Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index f58cab82105ba..db5bfc6342d34 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -541,9 +541,17 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, /* * Try again in case a token was freed before we got on the wait - * queue. + * queue. The waker may have already removed the entry from the + * wait queue, but list_del_init() is okay with that. */ nr = __sbitmap_queue_get(domain_tokens); + if (nr >= 0) { + unsigned long flags; + + spin_lock_irqsave(&ws->wait.lock, flags); + list_del_init(&wait->entry); + spin_unlock_irqrestore(&ws->wait.lock, flags); + } } return nr; } -- cgit v1.2.3 From 149e10f8ff71439014dff97987e90e87e2684a16 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:06 +0300 Subject: block: introduce blk_mq_tagset_iter Iterator helper to apply a function on all the tags in a given tagset. export it as it will be used outside the block layer later on. 
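A usage sketch for a caller outside the block layer (my_dev, my_reinit_one() and my_dev_reset_rq_state() are hypothetical names; only blk_mq_tagset_iter() itself comes from this patch):

static int my_reinit_one(void *data, struct request *rq)
{
        struct my_dev *dev = data;

        /* Re-initialize whatever per-request state the driver keeps. */
        return my_dev_reset_rq_state(dev, rq);  /* 0 on success */
}

static int my_dev_reinit_requests(struct my_dev *dev)
{
        /*
         * Walks every pre-allocated (static) request in every hw queue of
         * the tag set; iteration stops at the first non-zero return.
         */
        return blk_mq_tagset_iter(&dev->tag_set, dev, my_reinit_one);
}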
Reviewed-by: Bart Van Assche Reviewed-by: Jens Axboe Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- block/blk-mq-tag.c | 16 +++++++++++----- include/linux/blk-mq.h | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 6714507aa6c75..bce1c76fc768d 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -298,12 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (fn)(void *, struct request *)) { int i, j, ret = 0; - if (WARN_ON_ONCE(!reinit_request)) + if (WARN_ON_ONCE(!fn)) goto out; for (i = 0; i < set->nr_hw_queues; i++) { @@ -316,8 +316,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, if (!tags->static_rqs[j]) continue; - ret = reinit_request(set->driver_data, - tags->static_rqs[j]); + ret = fn(data, tags->static_rqs[j]); if (ret) goto out; } @@ -326,6 +325,13 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, out: return ret; } +EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); + +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)) +{ + return blk_mq_tagset_iter(set, set->driver_data, reinit_request); +} EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f5..6bc29f73a9aa4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -259,6 +259,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (reinit_request)(void *, struct request *)); int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, int (reinit_request)(void *, struct request *)); -- cgit v1.2.3 From 31b8446079757575e576b0516f0e4c0fcdfbd3dd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:07 +0300 Subject: nvme: introduce nvme_reinit_tagset Move blk_mq_reinit_tagset from blk-mq to nvme core as the only user of it. Current transports that use it (rdma, fc) simply implement .reinit_request op. This patch does not change any functionality. 
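Schematically (illustrative fragment, not part of the patch; the my_transport_* names are made up), a transport now publishes the callback in its ctrl_ops and lets the core helper decide whether there is anything to do -- nvme_reinit_tagset() is a no-op for transports that do not set .reinit_request:

static int my_transport_reinit_request(void *data, struct request *rq)
{
        /* re-arm per-request transport resources (MRs, mappings, ...) */
        return 0;
}

static const struct nvme_ctrl_ops my_transport_ctrl_ops = {
        /* ... */
        .reinit_request = my_transport_reinit_request,
};

        /* on reset/reconnect, instead of calling blk_mq_reinit_tagset() directly: */
        ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);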
Reviewed-by: Jens Axboe Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 10 ++++++++++ drivers/nvme/host/fc.c | 3 ++- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/rdma.c | 7 +++---- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 573cc3b59bfad..c95155696741b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2934,6 +2934,16 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); +int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set) +{ + if (!ctrl->ops->reinit_request) + return 0; + + return blk_mq_tagset_iter(set, set->driver_data, + ctrl->ops->reinit_request); +} +EXPORT_SYMBOL_GPL(nvme_reinit_tagset); + int __init nvme_core_init(void) { int result; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b8e0822127c1a..372a0685b12a6 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2393,7 +2393,7 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); - ret = blk_mq_reinit_tagset(&ctrl->tag_set, nvme_fc_reinit_request); + ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); if (ret) goto out_free_io_queues; @@ -2753,6 +2753,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_del_nvme_ctrl, .get_address = nvmf_get_address, + .reinit_request = nvme_fc_reinit_request, }; static void diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index df787f39f4c1e..cb9d93048f3db 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -237,6 +237,7 @@ struct nvme_ctrl_ops { void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); int (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); + int (*reinit_request)(void *data, struct request *rq); }; static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) @@ -311,6 +312,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); void nvme_start_freeze(struct nvme_ctrl *ctrl); +int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 92a03ff5fb4d4..039f38cb6987c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -774,8 +774,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, goto out_free_tagset; } } else { - error = blk_mq_reinit_tagset(&ctrl->admin_tag_set, - nvme_rdma_reinit_request); + error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); if (error) goto out_free_queue; } @@ -855,8 +854,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_free_tag_set; } } else { - ret = blk_mq_reinit_tagset(&ctrl->tag_set, - nvme_rdma_reinit_request); + ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); if (ret) goto out_free_io_queues; @@ -1845,6 +1843,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = nvme_rdma_del_ctrl, .get_address = nvmf_get_address, + .reinit_request = nvme_rdma_reinit_request, }; static struct nvme_ctrl 
*nvme_rdma_create_ctrl(struct device *dev, -- cgit v1.2.3 From dab7487bdf63c6f2d70aac604494d023b189a9fd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:08 +0300 Subject: block: remove blk_mq_reinit_tagset No callers left. Reviewed-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- block/blk-mq-tag.c | 7 ------- include/linux/blk-mq.h | 2 -- 2 files changed, 9 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index bce1c76fc768d..c81b40ecd3f11 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -327,13 +327,6 @@ out: } EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)) -{ - return blk_mq_tagset_iter(set, set->driver_data, reinit_request); -} -EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); - void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6bc29f73a9aa4..cfd64e500d82c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -261,8 +261,6 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, int (reinit_request)(void *, struct request *)); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.3 From 60070c78ef7b624680ffdde30bca55e515504585 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:06 +0300 Subject: nvme-rdma: pass tagset to directly nvme_rdma_free_tagset Instead of flagging admin/io. Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 039f38cb6987c..93c8578a2ddc2 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -670,11 +670,10 @@ out_free_queues: return ret; } -static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, bool admin) +static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, + struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - struct blk_mq_tag_set *set = admin ? 
- &ctrl->admin_tag_set : &ctrl->tag_set; blk_mq_free_tag_set(set); nvme_rdma_dev_put(ctrl->device); @@ -744,7 +743,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, nvme_rdma_stop_queue(&ctrl->queues[0]); if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); - nvme_rdma_free_tagset(&ctrl->ctrl, true); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); } nvme_rdma_free_queue(&ctrl->queues[0]); } @@ -818,7 +817,7 @@ out_cleanup_queue: blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_tagset: if (new) - nvme_rdma_free_tagset(&ctrl->ctrl, true); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; @@ -830,7 +829,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, nvme_rdma_stop_io_queues(ctrl); if (remove) { blk_cleanup_queue(ctrl->ctrl.connect_q); - nvme_rdma_free_tagset(&ctrl->ctrl, false); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); } nvme_rdma_free_io_queues(ctrl); } @@ -873,7 +872,7 @@ out_cleanup_connect_q: blk_cleanup_queue(ctrl->ctrl.connect_q); out_free_tag_set: if (new) - nvme_rdma_free_tagset(&ctrl->ctrl, false); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); out_free_io_queues: nvme_rdma_free_io_queues(ctrl); return ret; -- cgit v1.2.3 From d8bfceebc41762f7dde960996a24180f9a67cd51 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:07 +0300 Subject: nvme-rdma: fix wrong logging message Not necessarily address resolution failed. Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 93c8578a2ddc2..02b3388af308c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -544,7 +544,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, ret = nvme_rdma_wait_for_cm(queue); if (ret) { dev_info(ctrl->ctrl.device, - "rdma_resolve_addr wait failed (%d).\n", ret); + "rdma connection establishment failed (%d)\n", ret); goto out_destroy_cm_id; } -- cgit v1.2.3 From 0c5b43b9c1fafc5af37915dde6670aaf1224e8cc Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:08 +0300 Subject: nvme-rdma: move assignment to declaration No need for the extra line for trivial assignments. Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 02b3388af308c..221ace23aaa5e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -434,11 +434,9 @@ out_err: static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) { - struct nvme_rdma_device *dev; - struct ib_device *ibdev; + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; - dev = queue->device; - ibdev = dev->dev; rdma_destroy_qp(queue->cm_id); ib_free_cq(queue->ib_cq); -- cgit v1.2.3 From 0fc176dfdafc78e66bd74f54e487ca1fc59d01bf Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:09 +0300 Subject: nvme-rdma: Check that reinit_request got a proper mr Warn if req->mr is NULL as it should never happen. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 221ace23aaa5e..7e61e6447d2cc 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -274,6 +274,9 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq) struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int ret = 0; + if (WARN_ON_ONCE(!req->mr)) + return 0; + ib_dereg_mr(req->mr); req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, -- cgit v1.2.3 From 5e1fe61d4170b1f498e20de92c7ce4cd5e40c3c5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:11 +0300 Subject: nvme-rdma: teardown admin/io queues once on error recovery Relying on the queue state while tearing down on every reconnect attempt is not a good design. We should do it once in err_work and simply try to establish the queues for each reconnect attempt. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 7e61e6447d2cc..95837c5317b41 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -925,10 +925,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ++ctrl->ctrl.nr_reconnects; - if (ctrl->ctrl.queue_count > 1) - nvme_rdma_destroy_io_queues(ctrl, false); - - nvme_rdma_destroy_admin_queue(ctrl, false); ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) goto requeue; @@ -936,7 +932,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) - goto requeue; + goto destroy_admin; } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -946,14 +942,17 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) return; } - ctrl->ctrl.nr_reconnects = 0; - nvme_start_ctrl(&ctrl->ctrl); - dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); + dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", + ctrl->ctrl.nr_reconnects); + + ctrl->ctrl.nr_reconnects = 0; return; +destroy_admin: + nvme_rdma_destroy_admin_queue(ctrl, false); requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", ctrl->ctrl.nr_reconnects); @@ -969,17 +968,15 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); - nvme_rdma_stop_io_queues(ctrl); - } - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_stop_queue(&ctrl->queues[0]); - - /* We must take care of fastfail/requeue all our inflight requests */ - if (ctrl->ctrl.queue_count > 1) blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_io_queues(ctrl, false); + } + + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl, false); /* * queues are not a live anymore, so restart the queues to fail fast -- cgit v1.2.3 From 60a518863368c7e124cd8411c2481d0938f53e08 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:10 +0300 Subject: nvme-rdma: Don't local invalidate if the queue is not live No chance for the local invalidate to succeed if the queue-pair is in error state. 
Most likely the target will do a remote invalidation of our mr so not a big loss on the test_bit. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 95837c5317b41..a58dd0c77a295 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1052,7 +1052,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, if (!blk_rq_bytes(rq)) return; - if (req->mr->need_inval) { + if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) { res = nvme_rdma_inv_rkey(queue, req); if (unlikely(res < 0)) { dev_err(ctrl->ctrl.device, -- cgit v1.2.3 From 5013e98b5e8db50144e8f1ca5a96aed95d4d48a0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:12 +0300 Subject: nvme-rdma: change queue flag semantics DELETING -> ALLOCATED Instead of marking we are deleting, mark we are allocated and check that instead. This makes the logic symmetrical to connected mark check. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a58dd0c77a295..ff67375982fe8 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -79,8 +79,8 @@ struct nvme_rdma_request { }; enum nvme_rdma_queue_flags { - NVME_RDMA_Q_LIVE = 0, - NVME_RDMA_Q_DELETING = 1, + NVME_RDMA_Q_ALLOCATED = 0, + NVME_RDMA_Q_LIVE = 1, }; struct nvme_rdma_queue { @@ -549,7 +549,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, goto out_destroy_cm_id; } - clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); + set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags); return 0; @@ -569,7 +569,7 @@ static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) { - if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) + if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) return; nvme_rdma_destroy_queue_ib(queue); -- cgit v1.2.3 From 0ad0bfa29822904b9186a1c04dbfb3014bc5dc8a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:49:51 +0300 Subject: nvme-rdma: stop controller reset if the controller is deleting If the controller is deleting (in case the user decided to delete it), we have no point to continue reset sequence. Signed-off-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ff67375982fe8..34d22ff42afba 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1818,7 +1818,11 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!changed) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + return; + } nvme_start_ctrl(&ctrl->ctrl); -- cgit v1.2.3 From 16772ae6d9221b065607335a9b279375d8d3e2fd Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 18 Oct 2017 22:56:09 +0900 Subject: nvme-pci: fix typos in comments fixed comment typos in adapter_alloc_cq() and adapter_alloc_sq(). 'the the' duplications are replaced with 'that the'. 
Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cb73bc8cad3bd..bafdc2ab5be3c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -930,7 +930,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; /* - * Note: we (ab)use the fact the the prp fields survive if no data + * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request. */ memset(&c, 0, sizeof(c)); @@ -951,7 +951,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, int flags = NVME_QUEUE_PHYS_CONTIG; /* - * Note: we (ab)use the fact the the prp fields survive if no data + * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request. */ memset(&c, 0, sizeof(c)); -- cgit v1.2.3 From 94f29d4f7808e3a495fda8c5d337fc4ac4037ce7 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 18 Oct 2017 12:38:24 +0000 Subject: nvme-rdma: Add BLK_MQ_F_NO_SCHED flag to admin tag set Since commit b86dd81 "block: get rid of blk-mq default scheduler choice Kconfig entries", when setting nr_hw_queues to 1 the admin tag set uses mq-deadline scheduler. This flag is useful for admin queues that aren't used for normal IO. Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 34d22ff42afba..a5577e06a6204 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -699,6 +699,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->driver_data = ctrl; set->nr_hw_queues = 1; set->timeout = ADMIN_TIMEOUT; + set->flags = BLK_MQ_F_NO_SCHED; } else { set = &ctrl->tag_set; memset(set, 0, sizeof(*set)); -- cgit v1.2.3 From 5a22e2bf44fb311e5a6bfec6ed5680199d6e162a Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 18 Oct 2017 12:38:25 +0000 Subject: nvme-fc: Add BLK_MQ_F_NO_SCHED flag to admin tag set Since commit b86dd81 "block: get rid of blk-mq default scheduler choice Kconfig entries", when setting nr_hw_queues to 1 the admin tag set uses mq-deadline scheduler. This flag is useful for admin queues that aren't used for normal IO. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 372a0685b12a6..0d0e898912add 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2853,6 +2853,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (ret) -- cgit v1.2.3 From 86f36b9c6c6cda28c86ead09323f55018e5e64df Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 18 Oct 2017 12:38:26 +0000 Subject: nvme-loop: Add BLK_MQ_F_NO_SCHED flag to admin tag set This flag is useful for admin queues that aren't used for normal IO. 
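The pattern is identical in all three transports; as a fragment for illustration only, the flag is set on the admin tag set before blk_mq_alloc_tag_set(), so the single admin hardware queue never gets a default I/O scheduler attached:

        ctrl->admin_tag_set.nr_hw_queues = 1;
        ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
        ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED;  /* keep mq-deadline off the admin queue */
        error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);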
Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 92628c4329262..c56354e1e4c61 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -365,6 +365,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ctrl->queues[0].ctrl = ctrl; error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); -- cgit v1.2.3 From ba2dec35e431e62e11116b8036dc372493169b39 Mon Sep 17 00:00:00 2001 From: Roy Shterman Date: Wed, 18 Oct 2017 13:46:07 +0300 Subject: nvmet: Change max_nsid in subsystem due to ns_disable if needed In case we disable namespaces which has the nsid like subsystem max_nsid we need to search for the next largest nsid in this subsystem. If the subsystem don't has more namespaces we set it to 0, else we take nsid from the last namespace in namespaces list because the list is sorted while inserting. Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Roy Shterman Reviewed-by: Johannes Thumshirn [hch: slight refactor] Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 1b208beeef509..da088293eafc1 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -57,6 +57,17 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) return 0; } +static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys) +{ + struct nvmet_ns *ns; + + if (list_empty(&subsys->namespaces)) + return 0; + + ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link); + return ns->nsid; +} + static u32 nvmet_async_event_result(struct nvmet_async_event *aen) { return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); @@ -334,6 +345,8 @@ void nvmet_ns_disable(struct nvmet_ns *ns) ns->enabled = false; list_del_rcu(&ns->dev_link); + if (ns->nsid == subsys->max_nsid) + subsys->max_nsid = nvmet_max_nsid(subsys); mutex_unlock(&subsys->lock); /* -- cgit v1.2.3 From 9843f685ae365d0c628e6a0d929772bca7274311 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:10:01 +0200 Subject: nvme: use ida_simple_{get,remove} for the controller instance Switch to the ida_simple_* helpers instead of opencoding them. 
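As a minimal sketch of the resulting pattern (the helpers take their own lock and retry -EAGAIN internally; an end argument of 0 means no upper bound):

        ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
        if (ret < 0)
                goto out;               /* real errno (-ENOMEM/-ENOSPC) instead of a blanket -ENODEV */
        ctrl->instance = ret;           /* the allocated id is the return value */

        /* ... and on teardown: */
        ida_simple_remove(&nvme_instance_ida, ctrl->instance);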
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 40 +++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c95155696741b..7fae42d595d5d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -74,6 +74,8 @@ EXPORT_SYMBOL_GPL(nvme_wq); static LIST_HEAD(nvme_ctrl_list); static DEFINE_SPINLOCK(dev_list_lock); +static DEFINE_IDA(nvme_instance_ida); + static struct class *nvme_class; static __le32 nvme_get_log_dw10(u8 lid, size_t size) @@ -2696,35 +2698,6 @@ void nvme_queue_async_events(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_queue_async_events); -static DEFINE_IDA(nvme_instance_ida); - -static int nvme_set_instance(struct nvme_ctrl *ctrl) -{ - int instance, error; - - do { - if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) - return -ENODEV; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_instance_ida, &instance); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - return -ENODEV; - - ctrl->instance = instance; - return 0; -} - -static void nvme_release_instance(struct nvme_ctrl *ctrl) -{ - spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, ctrl->instance); - spin_unlock(&dev_list_lock); -} - void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_stop_keep_alive(ctrl); @@ -2762,7 +2735,7 @@ static void nvme_free_ctrl(struct kref *kref) struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); put_device(ctrl->device); - nvme_release_instance(ctrl); + ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); ctrl->ops->free_ctrl(ctrl); @@ -2796,9 +2769,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); - ret = nvme_set_instance(ctrl); - if (ret) + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); + if (ret < 0) goto out; + ctrl->instance = ret; ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, MKDEV(nvme_char_major, ctrl->instance), @@ -2825,7 +2799,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, return 0; out_release_instance: - nvme_release_instance(ctrl); + ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: return ret; } -- cgit v1.2.3 From a7a7cbe353a52665b8463e1822ce6ba46b0609d6 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 16 Oct 2017 18:24:20 -0700 Subject: nvme-pci: add SGL support This adds SGL support for NVMe PCIe driver, based on an earlier patch from Rajiv Shanmugam Madeswaran . This patch refactors the original code and adds new module parameter sgl_threshold to determine whether to use SGL or PRP for IOs. The usage of SGLs is controlled by the sgl_threshold module parameter, which allows to conditionally use SGLs if average request segment size (avg_seg_size) is greater than sgl_threshold. In the original patch, the decision of using SGLs was dependent only on the IO size, with the new approach we consider not only IO size but also the number of physical segments present in the IO. We calculate avg_seg_size based on request payload bytes and number of physical segments present in the request. For e.g.:- 1. blk_rq_nr_phys_segments = 2 blk_rq_payload_bytes = 8k avg_seg_size = 4K use sgl if avg_seg_size >= sgl_threshold. 2. 
blk_rq_nr_phys_segments = 2 blk_rq_payload_bytes = 64k avg_seg_size = 32K use sgl if avg_seg_size >= sgl_threshold. 3. blk_rq_nr_phys_segments = 16 blk_rq_payload_bytes = 64k avg_seg_size = 4K use sgl if avg_seg_size >= sgl_threshold. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 214 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 187 insertions(+), 27 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bafdc2ab5be3c..11e6fd9d0ba49 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -45,6 +45,8 @@ */ #define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) +#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -57,6 +59,12 @@ module_param(max_host_mem_size_mb, uint, 0444); MODULE_PARM_DESC(max_host_mem_size_mb, "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); +static unsigned int sgl_threshold = SZ_32K; +module_param(sgl_threshold, uint, 0644); +MODULE_PARM_DESC(sgl_threshold, + "Use SGLs when average request segment size is larger or equal to " + "this size. Use 0 to disable SGLs."); + static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, @@ -178,6 +186,7 @@ struct nvme_queue { struct nvme_iod { struct nvme_request req; struct nvme_queue *nvmeq; + bool use_sgl; int aborted; int npages; /* In the PRP list. 0 means small pool in use */ int nents; /* Used in scatterlist */ @@ -331,17 +340,35 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev) return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } -static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev, - unsigned int size, unsigned int nseg) +/* + * Calculates the number of pages needed for the SGL segments. For example a 4k + * page can accommodate 256 SGL descriptors. 
+ */ +static int nvme_pci_npages_sgl(unsigned int num_seg) +{ + return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); +} + +static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, + unsigned int size, unsigned int nseg, bool use_sgl) { - return sizeof(__le64 *) * nvme_npages(size, dev) + - sizeof(struct scatterlist) * nseg; + size_t alloc_size; + + if (use_sgl) + alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); + else + alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); + + return alloc_size + sizeof(struct scatterlist) * nseg; } -static unsigned int nvme_cmd_size(struct nvme_dev *dev) +static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl) { - return sizeof(struct nvme_iod) + - nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES); + unsigned int alloc_size = nvme_pci_iod_alloc_size(dev, + NVME_INT_BYTES(dev), NVME_INT_PAGES, + use_sgl); + + return sizeof(struct nvme_iod) + alloc_size; } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -425,10 +452,10 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, nvmeq->sq_tail = tail; } -static __le64 **iod_list(struct request *req) +static void **nvme_pci_iod_list(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); + return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); } static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) @@ -438,7 +465,10 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) unsigned int size = blk_rq_payload_bytes(rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { - iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); + size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg, + iod->use_sgl); + + iod->sg = kmalloc(alloc_size, GFP_ATOMIC); if (!iod->sg) return BLK_STS_RESOURCE; } else { @@ -456,18 +486,31 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) static void nvme_free_iod(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - const int last_prp = dev->ctrl.page_size / 8 - 1; + const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; + dma_addr_t dma_addr = iod->first_dma, next_dma_addr; + int i; - __le64 **list = iod_list(req); - dma_addr_t prp_dma = iod->first_dma; if (iod->npages == 0) - dma_pool_free(dev->prp_small_pool, list[0], prp_dma); + dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], + dma_addr); + for (i = 0; i < iod->npages; i++) { - __le64 *prp_list = list[i]; - dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); - prp_dma = next_prp_dma; + void *addr = nvme_pci_iod_list(req)[i]; + + if (iod->use_sgl) { + struct nvme_sgl_desc *sg_list = addr; + + next_dma_addr = + le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr); + } else { + __le64 *prp_list = addr; + + next_dma_addr = le64_to_cpu(prp_list[last_prp]); + } + + dma_pool_free(dev->prp_page_pool, addr, dma_addr); + dma_addr = next_dma_addr; } if (iod->sg != iod->inline_sg) @@ -555,7 +598,8 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents) } } -static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) +static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct dma_pool *pool; @@ -566,14 +610,16 @@ static 
blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) u32 page_size = dev->ctrl.page_size; int offset = dma_addr & (page_size - 1); __le64 *prp_list; - __le64 **list = iod_list(req); + void **list = nvme_pci_iod_list(req); dma_addr_t prp_dma; int nprps, i; + iod->use_sgl = false; + length -= (page_size - offset); if (length <= 0) { iod->first_dma = 0; - return BLK_STS_OK; + goto done; } dma_len -= (page_size - offset); @@ -587,7 +633,7 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) if (length <= page_size) { iod->first_dma = dma_addr; - return BLK_STS_OK; + goto done; } nprps = DIV_ROUND_UP(length, page_size); @@ -634,6 +680,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) dma_len = sg_dma_len(sg); } +done: + cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); + return BLK_STS_OK; bad_sgl: @@ -643,6 +693,110 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) return BLK_STS_IOERR; } +static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, + struct scatterlist *sg) +{ + sge->addr = cpu_to_le64(sg_dma_address(sg)); + sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->type = NVME_SGL_FMT_DATA_DESC << 4; +} + +static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, + dma_addr_t dma_addr, int entries) +{ + sge->addr = cpu_to_le64(dma_addr); + if (entries < SGES_PER_PAGE) { + sge->length = cpu_to_le32(entries * sizeof(*sge)); + sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; + } else { + sge->length = cpu_to_le32(PAGE_SIZE); + sge->type = NVME_SGL_FMT_SEG_DESC << 4; + } +} + +static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + int length = blk_rq_payload_bytes(req); + struct dma_pool *pool; + struct nvme_sgl_desc *sg_list; + struct scatterlist *sg = iod->sg; + int entries = iod->nents, i = 0; + dma_addr_t sgl_dma; + + iod->use_sgl = true; + + /* setting the transfer type as SGL */ + cmd->flags = NVME_CMD_SGL_METABUF; + + if (length == sg_dma_len(sg)) { + nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); + return BLK_STS_OK; + } + + if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) { + iod->npages = -1; + return BLK_STS_RESOURCE; + } + + nvme_pci_iod_list(req)[0] = sg_list; + iod->first_dma = sgl_dma; + + nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); + + do { + if (i == SGES_PER_PAGE) { + struct nvme_sgl_desc *old_sg_desc = sg_list; + struct nvme_sgl_desc *link = &old_sg_desc[i - 1]; + + sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) + return BLK_STS_RESOURCE; + + i = 0; + nvme_pci_iod_list(req)[iod->npages++] = sg_list; + sg_list[i++] = *link; + nvme_pci_sgl_set_seg(link, sgl_dma, entries); + } + + nvme_pci_sgl_set_data(&sg_list[i++], sg); + + length -= sg_dma_len(sg); + sg = sg_next(sg); + entries--; + } while (length > 0); + + WARN_ON(entries > 0); + return BLK_STS_OK; +} + +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + unsigned int avg_seg_size; + + avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), + blk_rq_nr_phys_segments(req)); + + if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) + return false; + if 
(!iod->nvmeq->qid) + return false; + if (!sgl_threshold || avg_seg_size < sgl_threshold) + return false; + return true; +} + static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { @@ -662,7 +816,11 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, DMA_ATTR_NO_WARN)) goto out; - ret = nvme_setup_prps(dev, req); + if (nvme_pci_use_sgls(dev, req)) + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); + else + ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + if (ret != BLK_STS_OK) goto out_unmap; @@ -682,8 +840,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out_unmap; } - cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma); if (blk_integrity_rq(req)) cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); return BLK_STS_OK; @@ -1379,7 +1535,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); - dev->admin_tagset.cmd_size = nvme_cmd_size(dev); + dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; dev->admin_tagset.driver_data = dev; @@ -1906,7 +2062,11 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.numa_node = dev_to_node(dev->dev); dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = nvme_cmd_size(dev); + dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false); + if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) { + dev->tagset.cmd_size = max(dev->tagset.cmd_size, + nvme_pci_cmd_size(dev, true)); + } dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; -- cgit v1.2.3 From 0a02e39fd1eb2ceb3dd0dc765cb2de4d09697a14 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 19 Oct 2017 16:11:38 -0700 Subject: nvme-fc: correct io termination handling The io completion handling for i/o's that are failing due to to a transport error or association termination had issues, causing io failures (DNR set so retries didn't kick in) or long stalls. Change the io completion handler for the following items: When an io has been completed due to a transport abort (based on an exchange error) or when marked as aborted as part of an association termination (FCOP_FLAGS_TERMIO), set the NVME completion status to NVME_SC_ABORTED. By default, do not set DNR on the status so that a retry can be attempted after association recreate. In cases where an io is failed (non-successful nvme status including aborted), if the controller is being deleted (blk_queue_dying) or the io was part of the ios used for association creation (ctrl state is NEW or RECONNECTING), then additionally set the DNR bit so the io will not be retried. If the failed io was part of association creation, the failure will tear down the partially completioned association and typically restart a new reconnect attempt (another create association later). Rearranged code flow to remove a largely unneeded local variable. 
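Condensed, the new completion policy is roughly the following (illustrative only; the real code carries the status as a left-shifted little-endian word):

        if (atomic_read(&op->state) == FCPOP_STATE_ABORTED ||
            (op->flags & FCOP_FLAGS_TERMIO))
                status = NVME_SC_ABORT_REQ;     /* no DNR by default: retriable after reconnect */

        /* force DNR only where a retry can never help */
        if (status &&
            (blk_queue_dying(rq->q) ||
             ctrl->ctrl.state == NVME_CTRL_NEW ||
             ctrl->ctrl.state == NVME_CTRL_RECONNECTING))
                status |= NVME_SC_DNR;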
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 0d0e898912add..e1a7e5cdfcf4e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1387,7 +1387,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) struct nvme_command *sqe = &op->cmd_iu.sqe; __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); union nvme_result result; - bool complete_rq, terminate_assoc = true; + bool terminate_assoc = true; /* * WARNING: @@ -1429,8 +1429,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, sizeof(op->rsp_iu), DMA_FROM_DEVICE); - if (atomic_read(&op->state) == FCPOP_STATE_ABORTED) - status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); + if (atomic_read(&op->state) == FCPOP_STATE_ABORTED || + op->flags & FCOP_FLAGS_TERMIO) + status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); else if (freq->status) status = cpu_to_le16(NVME_SC_INTERNAL << 1); @@ -1494,23 +1495,27 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) done: if (op->flags & FCOP_FLAGS_AEN) { nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); - complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); + __nvme_fc_fcpop_chk_teardowns(ctrl, op); atomic_set(&op->state, FCPOP_STATE_IDLE); op->flags = FCOP_FLAGS_AEN; /* clear other flags */ nvme_fc_ctrl_put(ctrl); goto check_error; } - complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); - if (!complete_rq) { - if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { - status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - if (blk_queue_dying(rq->q)) - status |= cpu_to_le16(NVME_SC_DNR << 1); - } - nvme_end_request(rq, status, result); - } else + /* + * Force failures of commands if we're killing the controller + * or have an error on a command used to create an new association + */ + if (status && + (blk_queue_dying(rq->q) || + ctrl->ctrl.state == NVME_CTRL_NEW || + ctrl->ctrl.state == NVME_CTRL_RECONNECTING)) + status |= cpu_to_le16(NVME_SC_DNR << 1); + + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op)) __nvme_fc_final_op_cleanup(rq); + else + nvme_end_request(rq, status, result); check_error: if (terminate_assoc) -- cgit v1.2.3 From 134aedc9c157d49069e9a98636b0a917678586ee Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 19 Oct 2017 16:11:39 -0700 Subject: nvme-fc: correct io timeout behavior The transport io timeout behavior wasn't quite correct. It ignored that the io error handler is supposed to be synchronous so it possibly allowed the blk request to be restarted while the io associated was still aborting. Timeouts on reserved commands, those used for association create, were never timing out thus they hung out forever. To correct: If an io is times out while a remoteport is not connected, just restart the io timer. The lack of connectivity will simultaneously be resetting the controller, so the reset path will abort and terminate the io. If an io is times out while it was marked for transport abort, just reset the io timer. The abort process is underway and will complete the io. Otherwise, if an io times out, abort the io. If the abort was unsuccessful (unlikely) give up and return not handled. If the abort was successful, as the abort process is underway it will terminate the io, so rather than synchronously waiting, just restart the io timer. 
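The resulting decision flow in nvme_fc_timeout() can be summarised as (sketch of the logic, not the literal hunk):

        if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
            atomic_read(&op->state) == FCPOP_STATE_ABORTED)
                return BLK_EH_RESET_TIMER;      /* reset/teardown in progress will reap the io */

        if (__nvme_fc_abort_op(ctrl, op))
                return BLK_EH_NOT_HANDLED;      /* io was no longer active to abort */

        nvme_fc_error_recovery(ctrl, "io timeout error");
        return BLK_EH_RESET_TIMER;              /* the abort completion will finish the io shortly */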
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e1a7e5cdfcf4e..c6c903f1b172a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1903,13 +1903,14 @@ nvme_fc_timeout(struct request *rq, bool reserved) struct nvme_fc_ctrl *ctrl = op->ctrl; int ret; - if (reserved) + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || + atomic_read(&op->state) == FCPOP_STATE_ABORTED) return BLK_EH_RESET_TIMER; ret = __nvme_fc_abort_op(ctrl, op); if (ret) - /* io wasn't active to abort consider it done */ - return BLK_EH_HANDLED; + /* io wasn't active to abort */ + return BLK_EH_NOT_HANDLED; /* * we can't individually ABTS an io without affecting the queue, @@ -1920,7 +1921,12 @@ nvme_fc_timeout(struct request *rq, bool reserved) */ nvme_fc_error_recovery(ctrl, "io timeout error"); - return BLK_EH_HANDLED; + /* + * the io abort has been initiated. Have the reset timer + * restarted and the abort completion will complete the io + * shortly. Avoids a synchronous wait while the abort finishes. + */ + return BLK_EH_RESET_TIMER; } static int -- cgit v1.2.3 From f87c89ad93c95857c4d4b19ca580e27994254b7f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 23 Oct 2017 12:59:27 +0300 Subject: nvme-rdma: align nvme_rdma_device structure Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a5577e06a6204..ff4635d317b5b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -50,8 +50,8 @@ (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) struct nvme_rdma_device { - struct ib_device *dev; - struct ib_pd *pd; + struct ib_device *dev; + struct ib_pd *pd; struct kref ref; struct list_head entry; }; -- cgit v1.2.3 From e62a538da2e7401bad5293d210ce00583ac12f39 Mon Sep 17 00:00:00 2001 From: Nitzan Carmi Date: Sun, 22 Oct 2017 09:37:04 +0000 Subject: nvme-rdma: Add debug message when reaches timeout Signed-off-by: Nitzan Carmi Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ff4635d317b5b..4552744eff454 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1578,6 +1578,10 @@ nvme_rdma_timeout(struct request *rq, bool reserved) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + dev_warn(req->queue->ctrl->ctrl.device, + "I/O %d QID %d timeout, reset controller\n", + rq->tag, nvme_rdma_queue_idx(req->queue)); + /* queue error recovery */ nvme_rdma_error_recovery(req->queue->ctrl); -- cgit v1.2.3 From 75bc5f06617fe1bc9a79ba9e3baccdcae3743404 Mon Sep 17 00:00:00 2001 From: Javier González Date: Tue, 24 Oct 2017 15:56:13 +0200 Subject: lightnvm: pblk: remove leftover testing function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A previous patch inadvertently left an unused test function in the header, kill it. 
Fixes: 8bd400204bd5 ("lightnvm: pblk: cleanup unused and static functions") Signed-off-by: Javier González Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 6b64288de6f76..90961033a79fc 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -860,11 +860,6 @@ int pblk_rl_is_limit(struct pblk_rl *rl); int pblk_sysfs_init(struct gendisk *tdisk); void pblk_sysfs_exit(struct gendisk *tdisk); -static inline void test(size_t a) -{ - a += 1; -} - static inline void *pblk_malloc(size_t size, int type, gfp_t flags) { if (type == PBLK_KMALLOC_META) -- cgit v1.2.3 From 351499a172c0c5fc52d65ee2c62b344f369ea02a Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 24 Oct 2017 18:44:57 -0600 Subject: block: Invalidate cache on discard v2 It is reasonable drop page cache on discard, otherwise that pages may be written by writeback second later, so thin provision devices will not be happy. This seems to be a security leak in case of secure discard case. Also add check for queue_discard flag on early stage. Signed-off-by: Dmitry Monakhov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 0de02ee67eed8..c0fc32bd8ed17 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -202,10 +202,16 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, { uint64_t range[2]; uint64_t start, len; + struct request_queue *q = bdev_get_queue(bdev); + struct address_space *mapping = bdev->bd_inode->i_mapping; + if (!(mode & FMODE_WRITE)) return -EBADF; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; @@ -216,12 +222,12 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, return -EINVAL; if (len & 511) return -EINVAL; - start >>= 9; - len >>= 9; - if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + if (start + len > i_size_read(bdev->bd_inode)) return -EINVAL; - return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); + truncate_inode_pages_range(mapping, start, start + len); + return blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, flags); } static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, -- cgit v1.2.3 From bb749b31c25e9b11f8f974baac8d507298ffbb70 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 18 Oct 2017 14:38:38 +0200 Subject: block: move CAP_SYS_ADMIN check in blkdev_roset() Check for CAP_SYS_ADMIN before calling into the driver, similar to blkdev_flushbuf(). This is safer and can spare a check in the driver. (Currently BLKROSET is overridden by md and rbd, rbd is missing the check. md has the check, but it covers a lot more than BLKROSET.) 
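In short (fragment for illustration), the privilege check now precedes the pass-through to the driver:

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;                 /* checked before any driver sees BLKROSET */

        ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
        if (!is_unrecognized_ioctl(ret))
                return ret;                     /* a driver (e.g. md, rbd) handled it itself */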
Acked-by: Al Viro Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/ioctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index c0fc32bd8ed17..1668506d8ed80 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -443,11 +443,12 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, { int ret, n; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); if (!is_unrecognized_ioctl(ret)) return ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (get_user(n, (int __user *)arg)) return -EFAULT; set_device_ro(bdev, n); -- cgit v1.2.3 From 425a4dba7953e35ffd096771973add6d2f40d2ed Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Oct 2017 15:59:09 +0200 Subject: block: factor out __blkdev_issue_zero_pages() blkdev_issue_zeroout() will use this in !BLKDEV_ZERO_NOFALLBACK case. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-lib.c | 63 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 62240f8832ca6..9d2ab8bba52ac 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -274,6 +274,40 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) return min(pages, (sector_t)BIO_MAX_PAGES); } +static int __blkdev_issue_zero_pages(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + int bi_size = 0; + unsigned int sz; + + if (!q) + return -ENXIO; + + while (nr_sects != 0) { + bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), + gfp_mask); + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + while (nr_sects != 0) { + sz = min((sector_t) PAGE_SIZE, nr_sects << 9); + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); + nr_sects -= bi_size >> 9; + sector += bi_size >> 9; + if (bi_size < sz) + break; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -304,9 +338,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, unsigned flags) { int ret; - int bi_size = 0; - struct bio *bio = *biop; - unsigned int sz; sector_t bs_mask; bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; @@ -316,30 +347,10 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, biop, flags); if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) - goto out; - - ret = 0; - while (nr_sects != 0) { - bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), - gfp_mask); - bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - while (nr_sects != 0) { - sz = min((sector_t) PAGE_SIZE, nr_sects << 9); - bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); - nr_sects -= bi_size >> 9; - sector += bi_size >> 9; - if (bi_size < sz) - break; - } - cond_resched(); - } + return ret; - *biop = bio; -out: - return ret; + return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, + biop); } EXPORT_SYMBOL(__blkdev_issue_zeroout); -- cgit v1.2.3 From d5ce4c31d6df518dd8f63bbae20d7423c5018a6c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Oct 2017 
15:59:10 +0200 Subject: block: cope with WRITE ZEROES failing in blkdev_issue_zeroout() sd_config_write_same() ignores ->max_ws_blocks == 0 and resets it to permit trying WRITE SAME on older SCSI devices, unless ->no_write_same is set. Because REQ_OP_WRITE_ZEROES is implemented in terms of WRITE SAME, blkdev_issue_zeroout() may fail with -EREMOTEIO: $ fallocate -zn -l 1k /dev/sdg fallocate: fallocate failed: Remote I/O error $ fallocate -zn -l 1k /dev/sdg # OK $ fallocate -zn -l 1k /dev/sdg # OK The following calls succeed because sd_done() sets ->no_write_same in response to a sense that would become BLK_STS_TARGET/-EREMOTEIO, causing __blkdev_issue_zeroout() to fall back to generating ZERO_PAGE bios. This means blkdev_issue_zeroout() must cope with WRITE ZEROES failing and fall back to manually zeroing, unless BLKDEV_ZERO_NOFALLBACK is specified. For BLKDEV_ZERO_NOFALLBACK case, return -EOPNOTSUPP if sd_done() has just set ->no_write_same thus indicating lack of offload support. Fixes: c20cfc27a473 ("block: stop using blkdev_issue_write_same for zeroing") Cc: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-lib.c | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 9d2ab8bba52ac..f625fda5f0955 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -321,12 +321,6 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, * Zero-fill a block range, either using hardware offload or by explicitly * writing zeroes to the device. * - * Note that this function may fail with -EOPNOTSUPP if the driver signals - * zeroing offload support, but the device fails to process the command (for - * some devices there is no non-destructive way to verify whether this - * operation is actually supported). In this case the caller should call - * retry the call to blkdev_issue_zeroout() and the fallback path will be used. - * * If a device is using logical block provisioning, the underlying space will * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. * @@ -370,18 +364,49 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { - int ret; - struct bio *bio = NULL; + int ret = 0; + sector_t bs_mask; + struct bio *bio; struct blk_plug plug; + bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + +retry: + bio = NULL; blk_start_plug(&plug); - ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, - &bio, flags); + if (try_write_zeroes) { + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, &bio, flags); + } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, + gfp_mask, &bio); + } else { + /* No zeroing offload support */ + ret = -EOPNOTSUPP; + } if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); + if (ret && try_write_zeroes) { + if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + try_write_zeroes = false; + goto retry; + } + if (!bdev_write_zeroes_sectors(bdev)) { + /* + * Zeroing offload support was indicated, but the + * device reported ILLEGAL REQUEST (for some devices + * there is no non-destructive way to verify whether + * WRITE ZEROES is actually supported). 
+ */ + ret = -EOPNOTSUPP; + } + } return ret; } -- cgit v1.2.3 From 2527d99789e248576ac8081530cd4fd88730f8c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:33:42 -0600 Subject: elevator: lookup mq vs non-mq elevators If an IO scheduler is selected via elevator= and it doesn't match the driver in question wrt blk-mq support, then we fail to boot. The elevator= parameter is deprecated and only supported for non-mq devices. Augment the elevator lookup API so that we pass in if we're looking for an mq capable scheduler or not, so that we only ever return a valid type for the queue in question. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196695 Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/elevator.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index 7ae50eb2732bb..77856bf295688 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,12 +83,15 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static struct elevator_type *elevator_find(const char *name) +/* + * Return scheduler with name 'name' and with matching 'mq capability + */ +static struct elevator_type *elevator_find(const char *name, bool mq) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (!strcmp(e->elevator_name, name)) + if (!strcmp(e->elevator_name, name) && (mq == e->uses_mq)) return e; } @@ -100,25 +103,25 @@ static void elevator_put(struct elevator_type *e) module_put(e->elevator_owner); } -static struct elevator_type *elevator_get(const char *name, bool try_loading) +static struct elevator_type *elevator_get(struct request_queue *q, + const char *name, bool try_loading) { struct elevator_type *e; spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->mq_ops != NULL); if (!e && try_loading) { spin_unlock(&elv_list_lock); request_module("%s-iosched", name); spin_lock(&elv_list_lock); - e = elevator_find(name); + e = elevator_find(name, q->mq_ops != NULL); } if (e && !try_module_get(e->elevator_owner)) e = NULL; spin_unlock(&elv_list_lock); - return e; } @@ -144,8 +147,12 @@ void __init load_default_elevator_module(void) if (!chosen_elevator[0]) return; + /* + * Boot parameter is deprecated, we haven't supported that for MQ. + * Only look for non-mq schedulers from here. + */ spin_lock(&elv_list_lock); - e = elevator_find(chosen_elevator); + e = elevator_find(chosen_elevator, false); spin_unlock(&elv_list_lock); if (!e) @@ -202,7 +209,7 @@ int elevator_init(struct request_queue *q, char *name) q->boundary_rq = NULL; if (name) { - e = elevator_get(name, true); + e = elevator_get(q, name, true); if (!e) return -EINVAL; } @@ -214,7 +221,7 @@ int elevator_init(struct request_queue *q, char *name) * allowed from async. */ if (!e && !q->mq_ops && *chosen_elevator) { - e = elevator_get(chosen_elevator, false); + e = elevator_get(q, chosen_elevator, false); if (!e) printk(KERN_ERR "I/O scheduler %s not found\n", chosen_elevator); @@ -229,17 +236,17 @@ int elevator_init(struct request_queue *q, char *name) */ if (q->mq_ops) { if (q->nr_hw_queues == 1) - e = elevator_get("mq-deadline", false); + e = elevator_get(q, "mq-deadline", false); if (!e) return 0; } else - e = elevator_get(CONFIG_DEFAULT_IOSCHED, false); + e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); if (!e) { printk(KERN_ERR "Default I/O scheduler not found. 
" \ "Using noop.\n"); - e = elevator_get("noop", false); + e = elevator_get(q, "noop", false); } } @@ -905,7 +912,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name)) { + if (elevator_find(e->elevator_name, e->uses_mq)) { spin_unlock(&elv_list_lock); if (e->icq_cache) kmem_cache_destroy(e->icq_cache); @@ -1066,7 +1073,7 @@ static int __elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); - e = elevator_get(strstrip(elevator_name), true); + e = elevator_get(q, strstrip(elevator_name), true); if (!e) return -EINVAL; @@ -1076,15 +1083,6 @@ static int __elevator_change(struct request_queue *q, const char *name) return 0; } - if (!e->uses_mq && q->mq_ops) { - elevator_put(e); - return -EINVAL; - } - if (e->uses_mq && !q->mq_ops) { - elevator_put(e); - return -EINVAL; - } - return elevator_switch(q, e); } -- cgit v1.2.3 From 8ac0d9a81edf2ef4a2268b65b802a6b856dc77e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:35:02 -0600 Subject: elevator: allow name aliases Since we now lookup elevator types with the appropriate multiqueue capability, allow schedulers to register with an alias alongside the real name. This is in preparation for allowing 'mq-deadline' to register an alias of 'deadline' as well. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/elevator.c | 23 +++++++++++++++++------ include/linux/elevator.h | 1 + 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index 77856bf295688..7bda083d59684 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,6 +83,16 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); +static bool elevator_match(const struct elevator_type *e, const char *name) +{ + if (!strcmp(e->elevator_name, name)) + return true; + if (e->elevator_alias && !strcmp(e->elevator_alias, name)) + return true; + + return false; +} + /* * Return scheduler with name 'name' and with matching 'mq capability */ @@ -91,7 +101,7 @@ static struct elevator_type *elevator_find(const char *name, bool mq) struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (!strcmp(e->elevator_name, name) && (mq == e->uses_mq)) + if (elevator_match(e, name) && (mq == e->uses_mq)) return e; } @@ -922,9 +932,9 @@ int elv_register(struct elevator_type *e) spin_unlock(&elv_list_lock); /* print pretty message */ - if (!strcmp(e->elevator_name, chosen_elevator) || + if (elevator_match(e, chosen_elevator) || (!*chosen_elevator && - !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) + elevator_match(e, CONFIG_DEFAULT_IOSCHED))) def = " (default)"; printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, @@ -1077,8 +1087,7 @@ static int __elevator_change(struct request_queue *q, const char *name) if (!e) return -EINVAL; - if (q->elevator && - !strcmp(elevator_name, q->elevator->type->elevator_name)) { + if (q->elevator && elevator_match(q->elevator->type, elevator_name)) { elevator_put(e); return 0; } @@ -1114,6 +1123,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_queue *e = q->elevator; struct elevator_type *elv = NULL; struct elevator_type *__e; + bool uses_mq = q->mq_ops != NULL; int len = 0; if (!queue_is_rq_based(q)) @@ -1126,7 +1136,8 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); 
list_for_each_entry(__e, &elv_list, list) { - if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) { + if (elv && elevator_match(elv, __e->elevator_name) && + (__e->uses_mq == uses_mq)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 5bc8f8682a3e1..6df8b14f1f6a0 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -144,6 +144,7 @@ struct elevator_type size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; + const char *elevator_alias; struct module *elevator_owner; bool uses_mq; #ifdef CONFIG_BLK_DEBUG_FS -- cgit v1.2.3 From 4d740bc9f0319229410d11e445017f47e425dbe0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 09:47:20 -0600 Subject: mq-deadline: add 'deadline' as a name alias The scheduler framework now supports looking up the appropriate scheduler with the {name,mq} tupple. We can register mq-deadline with the alias of 'deadline', so that switching to 'deadline' will do the right thing based on the type of driver attached to it. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/mq-deadline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index a1cad4331eddc..0179e484ec98c 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -657,6 +657,7 @@ static struct elevator_type mq_deadline = { #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", + .elevator_alias = "deadline", .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); -- cgit v1.2.3 From 2dd4122854f697afc777582d18548dded03ce5dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:20:01 +0200 Subject: nvme: use kref_get_unless_zero in nvme_find_get_ns For kref_get_unless_zero to protect against lookup vs free races we need to use it in all places where we aren't guaranteed to already hold a reference. There is no such guarantee in nvme_find_get_ns, so switch to kref_get_unless_zero in this function. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7fae42d595d5d..1d931deac83bc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2290,7 +2290,8 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { if (ns->ns_id == nsid) { - kref_get(&ns->kref); + if (!kref_get_unless_zero(&ns->kref)) + continue; ret = ns; break; } -- cgit v1.2.3 From c6424a90da446cff6c67be06767fc0d0be787110 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:22:00 +0200 Subject: nvme: simplify nvme_open Now that we are protected against lookup vs free races for the namespace by using kref_get_unless_zero we don't need the hack of NULLing out the disk private data during removal. 
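The safe-lookup idiom this relies on can be sketched generically (illustrative only; the "obj" type and obj_lookup() are made-up names, the real nvme code is in the diff below): a lookup path may only hand out an object if it can still take a non-zero reference, so it can never resurrect an object whose final kref_put() has already run.

struct obj {
	struct kref ref;
};

/* Return the object with an extra reference held, or NULL if it is dying. */
static struct obj *obj_lookup(struct obj *candidate)
{
	if (candidate && kref_get_unless_zero(&candidate->ref))
		return candidate;
	return NULL;
}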
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1d931deac83bc..9f8ae15c9fe8e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -253,12 +253,6 @@ static void nvme_free_ns(struct kref *kref) if (ns->ndev) nvme_nvm_unregister(ns); - if (ns->disk) { - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); - } - put_disk(ns->disk); ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); nvme_put_ctrl(ns->ctrl); @@ -270,29 +264,6 @@ static void nvme_put_ns(struct nvme_ns *ns) kref_put(&ns->kref, nvme_free_ns); } -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) -{ - struct nvme_ns *ns; - - spin_lock(&dev_list_lock); - ns = disk->private_data; - if (ns) { - if (!kref_get_unless_zero(&ns->kref)) - goto fail; - if (!try_module_get(ns->ctrl->ops->module)) - goto fail_put_ns; - } - spin_unlock(&dev_list_lock); - - return ns; - -fail_put_ns: - kref_put(&ns->kref, nvme_free_ns); -fail: - spin_unlock(&dev_list_lock); - return NULL; -} - struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags, int qid) { @@ -1056,7 +1027,16 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, static int nvme_open(struct block_device *bdev, fmode_t mode) { - return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO; + struct nvme_ns *ns = bdev->bd_disk->private_data; + + if (!kref_get_unless_zero(&ns->kref)) + return -ENXIO; + if (!try_module_get(ns->ctrl->ops->module)) { + kref_put(&ns->kref, nvme_free_ns); + return -ENXIO; + } + + return 0; } static void nvme_release(struct gendisk *disk, fmode_t mode) -- cgit v1.2.3 From d22524a4782a943bb02a9cf6885ac470210aabfc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:25:42 +0200 Subject: nvme: switch controller refcounting to use struct device Instead of allocating a separate struct device for the character device handle embedd it into struct nvme_ctrl and use it for the main controller refcounting. This removes double refcounting and gets us an automatic reference for the character device operations. We keep ctrl->device as a pointer for now to avoid chaning printks all over, but in the future we could look into message printing helpers that take a controller structure similar to what other subsystems do. Note the delete_ctrl operation always already has a reference (either through sysfs due this change, or because every open file on the /dev/nvme-fabrics node has a refernece) when it is entered now, so we don't need to do the unless_zero variant there. 
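The overall shape of the conversion, sketched with made-up foo_* names (the nvme-specific details follow in the diff): embed a struct device in the controller, free the container from the device's release callback, and use get_device()/put_device() as the reference operations.

struct foo_ctrl {
	struct device dev;	/* the refcount now lives in the embedded device */
};

static void foo_ctrl_release(struct device *dev)
{
	kfree(container_of(dev, struct foo_ctrl, dev));
}

/* after device_initialize(&ctrl->dev), set ctrl->dev.release = foo_ctrl_release */

static void foo_get_ctrl(struct foo_ctrl *ctrl)
{
	get_device(&ctrl->dev);
}

static void foo_put_ctrl(struct foo_ctrl *ctrl)
{
	put_device(&ctrl->dev);	/* the final put invokes foo_ctrl_release() */
}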
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke --- drivers/nvme/host/core.c | 43 ++++++++++++++++++++++--------------------- drivers/nvme/host/fc.c | 8 ++------ drivers/nvme/host/nvme.h | 12 +++++++++++- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 5 ++--- drivers/nvme/target/loop.c | 2 +- 6 files changed, 39 insertions(+), 33 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9f8ae15c9fe8e..3a97daa163f66 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1915,7 +1915,7 @@ static int nvme_dev_open(struct inode *inode, struct file *file) ret = -EWOULDBLOCK; break; } - if (!kref_get_unless_zero(&ctrl->kref)) + if (!kobject_get_unless_zero(&ctrl->device->kobj)) break; file->private_data = ctrl; ret = 0; @@ -2374,7 +2374,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) list_add_tail(&ns->list, &ctrl->namespaces); mutex_unlock(&ctrl->namespaces_mutex); - kref_get(&ctrl->kref); + nvme_get_ctrl(ctrl); kfree(id); @@ -2703,7 +2703,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { - device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); + device_del(ctrl->device); spin_lock(&dev_list_lock); list_del(&ctrl->node); @@ -2711,23 +2711,17 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); -static void nvme_free_ctrl(struct kref *kref) +static void nvme_free_ctrl(struct device *dev) { - struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); + struct nvme_ctrl *ctrl = + container_of(dev, struct nvme_ctrl, ctrl_device); - put_device(ctrl->device); ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); ctrl->ops->free_ctrl(ctrl); } -void nvme_put_ctrl(struct nvme_ctrl *ctrl) -{ - kref_put(&ctrl->kref, nvme_free_ctrl); -} -EXPORT_SYMBOL_GPL(nvme_put_ctrl); - /* * Initialize a NVMe controller structures. 
This needs to be called during * earliest initialization so that we have the initialized structured around @@ -2742,7 +2736,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); INIT_LIST_HEAD(&ctrl->namespaces); mutex_init(&ctrl->namespaces_mutex); - kref_init(&ctrl->kref); ctrl->dev = dev; ctrl->ops = ops; ctrl->quirks = quirks; @@ -2755,15 +2748,21 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, goto out; ctrl->instance = ret; - ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, - MKDEV(nvme_char_major, ctrl->instance), - ctrl, nvme_dev_attr_groups, - "nvme%d", ctrl->instance); - if (IS_ERR(ctrl->device)) { - ret = PTR_ERR(ctrl->device); + device_initialize(&ctrl->ctrl_device); + ctrl->device = &ctrl->ctrl_device; + ctrl->device->devt = MKDEV(nvme_char_major, ctrl->instance); + ctrl->device->class = nvme_class; + ctrl->device->parent = ctrl->dev; + ctrl->device->groups = nvme_dev_attr_groups; + ctrl->device->release = nvme_free_ctrl; + dev_set_drvdata(ctrl->device, ctrl); + ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); + if (ret) goto out_release_instance; - } - get_device(ctrl->device); + ret = device_add(ctrl->device); + if (ret) + goto out_free_name; + ida_init(&ctrl->ns_ida); spin_lock(&dev_list_lock); @@ -2779,6 +2778,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, min(default_ps_max_latency_us, (unsigned long)S32_MAX)); return 0; +out_free_name: + kfree_const(dev->kobj.name); out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index c6c903f1b172a..aa9aec6923bb7 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2692,14 +2692,10 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); int ret; - if (!kref_get_unless_zero(&ctrl->ctrl.kref)) - return -EBUSY; - + nvme_get_ctrl(&ctrl->ctrl); ret = __nvme_fc_del_ctrl(ctrl); - if (!ret) flush_workqueue(nvme_wq); - nvme_put_ctrl(&ctrl->ctrl); return ret; @@ -2918,7 +2914,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, return ERR_PTR(ret); } - kref_get(&ctrl->ctrl.kref); + nvme_get_ctrl(&ctrl->ctrl); dev_info(ctrl->ctrl.device, "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cb9d93048f3db..ae60d8342e601 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -127,12 +127,12 @@ struct nvme_ctrl { struct request_queue *admin_q; struct request_queue *connect_q; struct device *dev; - struct kref kref; int instance; struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; struct mutex namespaces_mutex; + struct device ctrl_device; struct device *device; /* char device */ struct list_head node; struct ida ns_ida; @@ -279,6 +279,16 @@ static inline void nvme_end_request(struct request *req, __le16 status, blk_mq_complete_request(req); } +static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl) +{ + get_device(ctrl->device); +} + +static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) +{ + put_device(ctrl->device); +} + void nvme_complete_rq(struct request *req); void nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 11e6fd9d0ba49..7735571ffc9ab 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ 
-2292,7 +2292,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) { dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); - kref_get(&dev->ctrl.kref); + nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); if (!schedule_work(&dev->remove_work)) nvme_put_ctrl(&dev->ctrl); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 4552744eff454..62b58f9b7d00e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1793,8 +1793,7 @@ static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) * Keep a reference until all work is flushed since * __nvme_rdma_del_ctrl can free the ctrl mem */ - if (!kref_get_unless_zero(&ctrl->ctrl.kref)) - return -EBUSY; + nvme_get_ctrl(&ctrl->ctrl); ret = __nvme_rdma_del_ctrl(ctrl); if (!ret) flush_work(&ctrl->delete_work); @@ -1955,7 +1954,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - kref_get(&ctrl->ctrl.kref); + nvme_get_ctrl(&ctrl->ctrl); mutex_lock(&nvme_rdma_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index c56354e1e4c61..f83e925fe64a6 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -642,7 +642,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); - kref_get(&ctrl->ctrl.kref); + nvme_get_ctrl(&ctrl->ctrl); changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); -- cgit v1.2.3 From a6a5149b10ec8ab8b4a9479a8230265c1b573be0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 16:59:25 +0200 Subject: nvme: get rid of nvme_ctrl_list Use the core chrdev code to set up the link between the character device and the nvme controller. This allows us to get rid of the global list of all controllers, and also ensures that we have both a reference to the controller and the transport module before the open method of the character device is called. 
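The chardev wiring follows the stock cdev-plus-device pattern, roughly as below (made-up foo_* names; simplified from the real change in the diff):

struct foo_ctrl {
	struct device dev;
	struct cdev cdev;
};

static int foo_add_chardev(struct foo_ctrl *ctrl,
			   const struct file_operations *fops,
			   struct module *owner)
{
	cdev_init(&ctrl->cdev, fops);
	ctrl->cdev.owner = owner;	/* pins the transport module across open() */
	/*
	 * Link the cdev and the device; ->open() can then recover the
	 * controller via container_of(inode->i_cdev, struct foo_ctrl, cdev)
	 * without any global list or lookup lock.
	 */
	return cdev_device_add(&ctrl->cdev, &ctrl->dev);
}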
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke --- drivers/nvme/host/core.c | 76 ++++++++++-------------------------------------- drivers/nvme/host/nvme.h | 3 +- 2 files changed, 18 insertions(+), 61 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3a97daa163f66..a56a1e0432e7b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -52,9 +52,6 @@ static u8 nvme_max_retries = 5; module_param_named(max_retries, nvme_max_retries, byte, 0644); MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); -static int nvme_char_major; -module_param(nvme_char_major, int, 0); - static unsigned long default_ps_max_latency_us = 100000; module_param(default_ps_max_latency_us, ulong, 0644); MODULE_PARM_DESC(default_ps_max_latency_us, @@ -71,11 +68,8 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); struct workqueue_struct *nvme_wq; EXPORT_SYMBOL_GPL(nvme_wq); -static LIST_HEAD(nvme_ctrl_list); -static DEFINE_SPINLOCK(dev_list_lock); - static DEFINE_IDA(nvme_instance_ida); - +static dev_t nvme_chr_devt; static struct class *nvme_class; static __le32 nvme_get_log_dw10(u8 lid, size_t size) @@ -1031,20 +1025,12 @@ static int nvme_open(struct block_device *bdev, fmode_t mode) if (!kref_get_unless_zero(&ns->kref)) return -ENXIO; - if (!try_module_get(ns->ctrl->ops->module)) { - kref_put(&ns->kref, nvme_free_ns); - return -ENXIO; - } - return 0; } static void nvme_release(struct gendisk *disk, fmode_t mode) { - struct nvme_ns *ns = disk->private_data; - - module_put(ns->ctrl->ops->module); - nvme_put_ns(ns); + nvme_put_ns(disk->private_data); } static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) @@ -1902,33 +1888,12 @@ EXPORT_SYMBOL_GPL(nvme_init_identify); static int nvme_dev_open(struct inode *inode, struct file *file) { - struct nvme_ctrl *ctrl; - int instance = iminor(inode); - int ret = -ENODEV; - - spin_lock(&dev_list_lock); - list_for_each_entry(ctrl, &nvme_ctrl_list, node) { - if (ctrl->instance != instance) - continue; - - if (!ctrl->admin_q) { - ret = -EWOULDBLOCK; - break; - } - if (!kobject_get_unless_zero(&ctrl->device->kobj)) - break; - file->private_data = ctrl; - ret = 0; - break; - } - spin_unlock(&dev_list_lock); - - return ret; -} + struct nvme_ctrl *ctrl = + container_of(inode->i_cdev, struct nvme_ctrl, cdev); -static int nvme_dev_release(struct inode *inode, struct file *file) -{ - nvme_put_ctrl(file->private_data); + if (!ctrl->admin_q) + return -EWOULDBLOCK; + file->private_data = ctrl; return 0; } @@ -1992,7 +1957,6 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, static const struct file_operations nvme_dev_fops = { .owner = THIS_MODULE, .open = nvme_dev_open, - .release = nvme_dev_release, .unlocked_ioctl = nvme_dev_ioctl, .compat_ioctl = nvme_dev_ioctl, }; @@ -2703,11 +2667,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { - device_del(ctrl->device); - - spin_lock(&dev_list_lock); - list_del(&ctrl->node); - spin_unlock(&dev_list_lock); + cdev_device_del(&ctrl->cdev, ctrl->device); } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); @@ -2750,7 +2710,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, device_initialize(&ctrl->ctrl_device); ctrl->device = &ctrl->ctrl_device; - ctrl->device->devt = MKDEV(nvme_char_major, ctrl->instance); + ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); ctrl->device->class = nvme_class; 
ctrl->device->parent = ctrl->dev; ctrl->device->groups = nvme_dev_attr_groups; @@ -2759,16 +2719,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); if (ret) goto out_release_instance; - ret = device_add(ctrl->device); + + cdev_init(&ctrl->cdev, &nvme_dev_fops); + ctrl->cdev.owner = ops->module; + ret = cdev_device_add(&ctrl->cdev, ctrl->device); if (ret) goto out_free_name; ida_init(&ctrl->ns_ida); - spin_lock(&dev_list_lock); - list_add_tail(&ctrl->node, &nvme_ctrl_list); - spin_unlock(&dev_list_lock); - /* * Initialize latency tolerance controls. The sysfs files won't * be visible to userspace unless the device actually supports APST. @@ -2909,12 +2868,9 @@ int __init nvme_core_init(void) if (!nvme_wq) return -ENOMEM; - result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", - &nvme_dev_fops); + result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); if (result < 0) goto destroy_wq; - else if (result > 0) - nvme_char_major = result; nvme_class = class_create(THIS_MODULE, "nvme"); if (IS_ERR(nvme_class)) { @@ -2925,7 +2881,7 @@ int __init nvme_core_init(void) return 0; unregister_chrdev: - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_wq: destroy_workqueue(nvme_wq); return result; @@ -2934,7 +2890,7 @@ destroy_wq: void nvme_core_exit(void) { class_destroy(nvme_class); - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_workqueue(nvme_wq); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ae60d8342e601..1bb2bc165e543 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -15,6 +15,7 @@ #define _NVME_H #include +#include #include #include #include @@ -134,7 +135,7 @@ struct nvme_ctrl { struct mutex namespaces_mutex; struct device ctrl_device; struct device *device; /* char device */ - struct list_head node; + struct cdev cdev; struct ida ns_ida; struct work_struct reset_work; -- cgit v1.2.3 From 999ada28713d7bcf4a2c70cbab47b08cd95f2cf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 17:09:31 +0200 Subject: nvme: check for a live controller in nvme_dev_open This is a much more sensible check than just the admin queue. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a56a1e0432e7b..df525ab42fcd1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1891,7 +1891,7 @@ static int nvme_dev_open(struct inode *inode, struct file *file) struct nvme_ctrl *ctrl = container_of(inode->i_cdev, struct nvme_ctrl, cdev); - if (!ctrl->admin_q) + if (ctrl->state != NVME_CTRL_LIVE) return -EWOULDBLOCK; file->private_data = ctrl; return 0; -- cgit v1.2.3 From 3b3387620780fc9699021c85bdce5cb45a763d41 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:06 -0700 Subject: nvme: add duplicate_connect option Add the "duplicate_connect" boolean option (presence means true). Default is false. When false, the transport should validate whether a new controller request is targeted for the same host transport addressing and target transport addressing as an existing controller. If so, the new controller request should be rejected. 
When true, the callee is explicitly requesting a duplicate controller connection to be made and the new request should be attempted. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 7 ++++++- drivers/nvme/host/fabrics.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 8bca36a469245..4a83137b02684 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -548,6 +548,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, { NVMF_OPT_HOST_ID, "hostid=%s" }, + { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, { NVMF_OPT_ERR, NULL } }; @@ -566,6 +567,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->nr_io_queues = num_online_cpus(); opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; opts->kato = NVME_DEFAULT_KATO; + opts->duplicate_connect = false; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -742,6 +744,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, goto out; } break; + case NVMF_OPT_DUP_CONNECT: + opts->duplicate_connect = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -823,7 +828,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ - NVMF_OPT_HOST_ID) + NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index bf33663218cd2..a7590cc59ba26 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -57,6 +57,7 @@ enum { NVMF_OPT_HOST_TRADDR = 1 << 10, NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, NVMF_OPT_HOST_ID = 1 << 12, + NVMF_OPT_DUP_CONNECT = 1 << 13, }; /** @@ -96,6 +97,7 @@ struct nvmf_ctrl_options { unsigned int nr_io_queues; unsigned int reconnect_delay; bool discovery_nqn; + bool duplicate_connect; unsigned int kato; struct nvmf_host *host; int max_reconnects; -- cgit v1.2.3 From 991231dc48ae9bcdd363ee231f10beb771d455c1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:07 -0700 Subject: nvme: add helper to compare options to controller Adds a helper function that compares the host and subsytem specified in a connect options list vs a controller. 
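A transport would typically use the helper along these lines before creating a new association (sketch only; the foo_* list, lock and type names are made up, and the rdma/fc patches that follow are the real users):

static bool foo_existing_controller(struct nvmf_ctrl_options *opts)
{
	struct foo_ctrl *ctrl;
	bool found = false;

	mutex_lock(&foo_ctrl_mutex);
	list_for_each_entry(ctrl, &foo_ctrl_list, list) {
		found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts);
		if (found)
			break;
	}
	mutex_unlock(&foo_ctrl_mutex);

	return found;
}

/* in the create path: */
	if (!opts->duplicate_connect && foo_existing_controller(opts))
		return ERR_PTR(-EALREADY);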
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a7590cc59ba26..42232e731f19f 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -133,6 +133,18 @@ struct nvmf_transport_ops { struct nvmf_ctrl_options *opts); }; +static inline bool +nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + if (strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || + strcmp(opts->host->nqn, ctrl->opts->host->nqn) || + memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t))) + return false; + + return true; +} + int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); -- cgit v1.2.3 From 36e835f2432d8f16845307b5fb7e88d1e5af041d Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:09 -0700 Subject: nvme-rdma: add support for duplicate_connect option Adds support for the duplicate_connect option. When set to true, checks whether there's an existing controller via the same target address (traddr), target port (trsvcid), and if specified, host address (host_traddr). Fails the connection request if there is an existing controller. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 62b58f9b7d00e..32b0a9ef26e67 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1851,6 +1851,83 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reinit_request = nvme_rdma_reinit_request, }; +static inline bool +__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + char *stdport = __stringify(NVME_RDMA_IP_PORT); + + + if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) || + strcmp(opts->traddr, ctrl->ctrl.opts->traddr)) + return false; + + if (opts->mask & NVMF_OPT_TRSVCID && + ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid)) + return false; + } else if (opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(opts->trsvcid, stdport)) + return false; + } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(stdport, ctrl->ctrl.opts->trsvcid)) + return false; + } + /* else, it's a match as both have stdport. Fall to next checks */ + + /* + * checking the local address is rough. In most cases, one + * is not specified and the host port is selected by the stack. + * + * Assume no match if: + * local address is specified and address is not the same + * local address is not specified but remote is, or vice versa + * (admin using specific host_traddr when it matters). + */ + if (opts->mask & NVMF_OPT_HOST_TRADDR && + ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr)) + return false; + } else if (opts->mask & NVMF_OPT_HOST_TRADDR || + ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) + return false; + /* + * if neither controller had an host port specified, assume it's + * a match as everything else matched. 
+ */ + + return true; +} + +/* + * Fails a connection request if it matches an existing controller + * (association) with the same tuple: + * + * + * if local address is not specified in the request, it will match an + * existing controller with all the other parameters the same and no + * local port address specified as well. + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. + */ +static bool +nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + bool found = false; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { + found = __nvme_rdma_options_match(ctrl, opts); + if (found) + break; + } + mutex_unlock(&nvme_rdma_ctrl_mutex); + + return found; +} + static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { @@ -1887,6 +1964,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, } } + if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) { + ret = -EALREADY; + goto out_free_ctrl; + } + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) -- cgit v1.2.3 From 56d5f4f108efd4e439d2320738e2b894af0920bb Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:08 -0700 Subject: nvme-fc: add support for duplicate_connect option Adds support for the duplicate_connect option. When set to true, checks whether there's an existing controller via the same host port and target port for the same host (hostnqn, hostid) to the same subsystem. Fails the connection request if an existing controller. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index aa9aec6923bb7..d12775d129835 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2792,6 +2792,33 @@ static const struct blk_mq_ops nvme_fc_admin_mq_ops = { }; +/* + * Fails a controller request if it matches an existing controller + * (association) with the same tuple: + * + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. 
+ */ +static bool +nvme_fc_existing_controller(struct nvme_fc_rport *rport, + struct nvmf_ctrl_options *opts) +{ + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + bool found = false; + + spin_lock_irqsave(&rport->lock, flags); + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts); + if (found) + break; + } + spin_unlock_irqrestore(&rport->lock, flags); + + return found; +} + static struct nvme_ctrl * nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, struct nvme_fc_lport *lport, struct nvme_fc_rport *rport) @@ -2806,6 +2833,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_fail; } + if (!opts->duplicate_connect && + nvme_fc_existing_controller(rport, opts)) { + ret = -EALREADY; + goto out_fail; + } + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); if (!ctrl) { ret = -ENOMEM; -- cgit v1.2.3 From ecad0d2cb8a7997afdc95031ee3328b997aba5c4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 23 Oct 2017 15:11:36 -0700 Subject: nvme-fc: remove NVME_FC_MAX_SEGMENTS The define is an arbitrary limit to the io size on the initiator, capping the io to 1MB-4KB. Remove the define from the transport. I/O size will solely be limited by the LLDD sg limits. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 6 ++---- include/linux/nvme-fc-driver.h | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index d12775d129835..b600c07803cf3 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2435,7 +2435,6 @@ static int nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - u32 segs; int ret; bool changed; @@ -2486,9 +2485,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_disconnect_admin_queue; - segs = min_t(u32, NVME_FC_MAX_SEGMENTS, - ctrl->lport->ops->max_sgl_segments); - ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9); + ctrl->ctrl.max_hw_sectors = + (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); ret = nvme_init_identify(&ctrl->ctrl); if (ret) diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 4ea03b9a5c8cb..2be4db3539379 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -102,8 +102,6 @@ enum nvmefc_fcp_datadir { }; -#define NVME_FC_MAX_SEGMENTS 256 - /** * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport * to LLDD in order to perform a NVME FCP IO operation. -- cgit v1.2.3 From 4e9b6f20828ac880dbc1fa2fdbafae779473d1af Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 19 Oct 2017 10:00:48 -0700 Subject: block: Fix a race between blk_cleanup_queue() and timeout handling Make sure that if the timeout timer fires after a queue has been marked "dying" that the affected requests are finished. 
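The ordering in blk_sync_queue() is what closes the race (the comments here are explanatory, the two calls are from the patch below):

	del_timer_sync(&q->timeout);		/* the timer can no longer queue new timeout work */
	cancel_work_sync(&q->timeout_work);	/* wait for work the timer queued before it was stopped */

With the early blk_queue_enter() bail-out removed, blk_timeout_work() then runs to completion even on a dying queue and finishes the affected requests.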
Reported-by: chenxiang (M) Fixes: commit 287922eb0b18 ("block: defer timeouts to a workqueue") Signed-off-by: Bart Van Assche Tested-by: chenxiang (M) Cc: Christoph Hellwig Cc: Keith Busch Cc: Hannes Reinecke Cc: Ming Lei Cc: Johannes Thumshirn Cc: Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk-timeout.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index e8e149ccc86b7..bb4fce694a60c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -333,6 +333,7 @@ EXPORT_SYMBOL(blk_stop_queue); void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); + cancel_work_sync(&q->timeout_work); if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; @@ -844,6 +845,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, NULL); INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index e3e9c9771d361..764ecf9aeb305 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -134,8 +134,6 @@ void blk_timeout_work(struct work_struct *work) struct request *rq, *tmp; int next_set = 0; - if (blk_queue_enter(q, true)) - return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -145,7 +143,6 @@ void blk_timeout_work(struct work_struct *work) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); - blk_queue_exit(q); } /** -- cgit v1.2.3 From d59b23795933678c9638fd20c942d2b4f3cd6185 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 30 Oct 2017 14:46:31 -0700 Subject: bcache: only permit to recovery read error when cache device is clean When bcache does read I/Os, for example in writeback or writethrough mode, if a read request on the cache device fails, bcache will try to recover the request by reading from the cached device. If the data on the cached device is not synced with the cache device, the requester will get stale data. For a critical storage system like a database, providing stale data from recovery may result in application-level data corruption, which is unacceptable. With this patch, for a failed read request in writeback or writethrough mode, recovery of a recoverable read request only happens when the cache device is clean. That is to say, all data on the cached device is up to date. For other cache modes in bcache, read requests will never hit cached_dev_read_error(), so they don't need this patch. Please note, because the cache mode can be switched arbitrarily at run time, a writethrough mode might have been switched from a writeback mode. Therefore checking dc->has_dirty in writethrough mode still makes sense. Changelog: V4: Fix parens error pointed out by Michael Lyle. v3: By response from Kent Overstreet, who thinks recovering stale data is a bug to fix and an option to permit it is unnecessary, the sysfs file is removed in this version. v2: rename sysfs entry from allow_stale_data_on_failure to allow_stale_data_on_failure, and fix the confusing commit log. v1: initial patch posted.
[small change to patch comment spelling by mlyle] Signed-off-by: Coly Li Signed-off-by: Michael Lyle Reported-by: Arne Wolf Reviewed-by: Michael Lyle Cc: Kent Overstreet Cc: Nix Cc: Kai Krakow Cc: Eric Wheeler Cc: Junhui Tang Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 163a17a808741..886e4b6643f1b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -705,8 +705,16 @@ static void cached_dev_read_error(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - if (s->recoverable) { + /* + * If cache device is dirty (dc->has_dirty is non-zero), then + * recovery a failed read request from cached device may get a + * stale data back. So read failure recovery is only permitted + * when cache device is clean. + */ + if (s->recoverable && + (dc && !atomic_read(&dc->has_dirty))) { /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); -- cgit v1.2.3 From 3b304d24a718ae779ee9c7f2014dd3b2d0893b70 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 30 Oct 2017 14:46:32 -0700 Subject: bcache: convert cached_dev.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable cached_dev.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Reviewed-by: Michael Lyle Signed-off-by: Elena Reshetova Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 7 ++++--- drivers/md/bcache/super.c | 6 +++--- drivers/md/bcache/writeback.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d77c4829c4972..363ea6256b393 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -184,6 +184,7 @@ #include #include #include +#include #include #include @@ -296,7 +297,7 @@ struct cached_dev { struct semaphore sb_write_mutex; /* Refcount on the cache set. Always nonzero when we're caching. 
*/ - atomic_t count; + refcount_t count; struct work_struct detach; /* @@ -805,13 +806,13 @@ do { \ static inline void cached_dev_put(struct cached_dev *dc) { - if (atomic_dec_and_test(&dc->count)) + if (refcount_dec_and_test(&dc->count)) schedule_work(&dc->detach); } static inline bool cached_dev_get(struct cached_dev *dc) { - if (!atomic_inc_not_zero(&dc->count)) + if (!refcount_inc_not_zero(&dc->count)) return false; /* Paired with the mb in cached_dev_attach */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 72c3b2929ef02..46134c45c6f69 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -902,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w) closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); - BUG_ON(atomic_read(&dc->count)); + BUG_ON(refcount_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -1029,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) * dc->c must be set before dc->count != 0 - paired with the mb in * cached_dev_get() */ - atomic_set(&dc->count, 1); + refcount_set(&dc->count, 1); /* Block writeback thread, but spawn it */ down_write(&dc->writeback_lock); @@ -1041,7 +1041,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); - atomic_inc(&dc->count); + refcount_inc(&dc->count); bch_writeback_queue(dc); } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 34bcf49d737bd..7d25bff37a9bf 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -91,7 +91,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) { if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); + refcount_inc(&dc->count); if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); -- cgit v1.2.3 From d44c2f9e7cc0041f0cd88df1fe7a1fceb713ab14 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Mon, 30 Oct 2017 14:46:33 -0700 Subject: bcache: update bucket_in_use in real time bucket_in_use is updated in gc thread which triggered by invalidating or writing sectors_to_gc dirty data, It's a long interval. Therefore, when we use it to compare with the threshold, it is often not timely, which leads to inaccurate judgment and often results in bucket depletion. We have send a patch before, by the means of updating bucket_in_use periodically In gc thread, which Coly thought that would lead high latency, In this patch, we add avail_nbuckets to record the count of available buckets, and we calculate bucket_in_use when alloc or free bucket in real time. 
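The bookkeeping then reduces to a single expression that is re-evaluated on every bucket allocation and free; as a worked example with made-up numbers, nbuckets = 1000 and avail_nbuckets = 250 gives in_use = 75 (percent):

	stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;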
[edited by ML: eliminated some whitespace errors] Signed-off-by: Tang Junhui Signed-off-by: Michael Lyle Reviewed-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 10 ++++++++++ drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 17 ++++++++++------- drivers/md/bcache/btree.h | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 4c40870e99f5a..8c5a626343d4d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -442,6 +442,11 @@ out: b->prio = INITIAL_PRIO; } + if (ca->set->avail_nbuckets > 0) { + ca->set->avail_nbuckets--; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } + return r; } @@ -449,6 +454,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); + + if (ca->set->avail_nbuckets < ca->set->nbuckets) { + ca->set->avail_nbuckets++; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } } void bch_bucket_free(struct cache_set *c, struct bkey *k) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 363ea6256b393..e274082330dcd 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -581,6 +581,7 @@ struct cache_set { uint8_t need_gc; struct gc_stat gc_stats; size_t nbuckets; + size_t avail_nbuckets; struct task_struct *gc_thread; /* Where in the btree gc currently is */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 866dcf78ff8e6..d8865e6ead370 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1240,6 +1240,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) __bch_btree_mark_key(c, level, k); } +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) +{ + stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; +} + static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; @@ -1651,9 +1656,8 @@ static void btree_gc_start(struct cache_set *c) mutex_unlock(&c->bucket_lock); } -static size_t bch_btree_gc_finish(struct cache_set *c) +static void bch_btree_gc_finish(struct cache_set *c) { - size_t available = 0; struct bucket *b; struct cache *ca; unsigned i; @@ -1690,6 +1694,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c) } rcu_read_unlock(); + c->avail_nbuckets = 0; for_each_cache(ca, c, i) { uint64_t *i; @@ -1711,18 +1716,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c) BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) - available++; + c->avail_nbuckets++; } } mutex_unlock(&c->bucket_lock); - return available; } static void bch_btree_gc(struct cache_set *c) { int ret; - unsigned long available; struct gc_stat stats; struct closure writes; struct btree_op op; @@ -1745,14 +1748,14 @@ static void bch_btree_gc(struct cache_set *c) pr_warn("gc failed!"); } while (ret); - available = bch_btree_gc_finish(c); + bch_btree_gc_finish(c); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); stats.data <<= 9; - stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; + bch_update_bucket_in_use(c, &stats); memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); trace_bcache_gc_end(c); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 73da1f5626cb8..4073aca09a498 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -305,5 
+305,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *); struct keybuf_key *bch_keybuf_next(struct keybuf *); struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, struct bkey *, keybuf_pred_fn *); - +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); #endif -- cgit v1.2.3 From c157313791a999646901b3e3c6888514ebc36d62 Mon Sep 17 00:00:00 2001 From: "tang.junhui" Date: Mon, 30 Oct 2017 14:46:34 -0700 Subject: bcache: fix wrong cache_misses statistics Currently, Cache missed IOs are identified by s->cache_miss, but actually, there are many situations that missed IOs are not assigned a value for s->cache_miss in cached_dev_cache_miss(), for example, a bypassed IO (s->iop.bypass = 1), or the cache_bio allocate failed. In these situations, it will go to out_put or out_submit, and s->cache_miss is null, which leads bch_mark_cache_accounting() to treat this IO as a hit IO. [ML: applied by 3-way merge] Signed-off-by: tang.junhui Reviewed-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 886e4b6643f1b..597dd1e87beab 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -470,6 +470,7 @@ struct search { unsigned recoverable:1; unsigned write:1; unsigned read_dirty_data:1; + unsigned cache_missed:1; unsigned long start_time; @@ -656,6 +657,7 @@ static inline struct search *search_alloc(struct bio *bio, s->orig_bio = bio; s->cache_miss = NULL; + s->cache_missed = 0; s->d = d; s->recoverable = 1; s->write = op_is_write(bio_op(bio)); @@ -775,7 +777,7 @@ static void cached_dev_read_done_bh(struct closure *cl) struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s->iop.c, s->d, - !s->cache_miss, s->iop.bypass); + !s->cache_missed, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); if (s->iop.status) @@ -794,6 +796,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; + s->cache_missed = 1; + if (s->cache_miss || s->iop.bypass) { miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; -- cgit v1.2.3 From 330a4db89d39a6b43f36da16824eaa7a7509d34d Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Mon, 30 Oct 2017 14:46:35 -0700 Subject: bcache: explicitly destroy mutex while exiting mutex_destroy does nothing most of time, but it's better to call it to make the code future proof and it also has some meaning for like mutex debug. As Coly pointed out in a previous review, bcache_exit() may not be able to handle all the references properly if userspace registers cache and backing devices right before bch_debug_init runs and bch_debug_init failes later. So not exposing userspace interface until everything is ready to avoid that issue. 
Signed-off-by: Liang Chen Reviewed-by: Michael Lyle Reviewed-by: Coly Li Reviewed-by: Eric Wheeler Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 46134c45c6f69..b4d28928dec59 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2095,6 +2095,7 @@ static void bcache_exit(void) if (bcache_major) unregister_blkdev(bcache_major, "bcache"); unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); } static int __init bcache_init(void) @@ -2113,14 +2114,15 @@ static int __init bcache_init(void) bcache_major = register_blkdev(0, "bcache"); if (bcache_major < 0) { unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); return bcache_major; } if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || - sysfs_create_files(bcache_kobj, files) || bch_request_init() || - bch_debug_init(bcache_kobj)) + bch_debug_init(bcache_kobj) || + sysfs_create_files(bcache_kobj, files)) goto err; return 0; -- cgit v1.2.3 From 0e7d3a8d4ec3f662faa7ef7f2957459b33f2481d Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Tue, 31 Oct 2017 14:15:02 -0600 Subject: MAINTAINERS: Remove Rafael from Opal maintainers. He is no longer working on storage. Signed-off-by: Scott Bauer Signed-off-by: Jens Axboe --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c0bbcf1c81884..6e9343af6bbfa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12041,7 +12041,6 @@ F: drivers/mmc/host/sdhci-spear.c SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER M: Scott Bauer M: Jonathan Derrick -M: Rafael Antognolli L: linux-block@vger.kernel.org S: Supported F: block/sed* -- cgit v1.2.3 From 5e3d02bbafad38975099b5848f5ebadedcf7bb7e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:25 +0800 Subject: blk-mq-sched: dispatch from scheduler IFF progress is made in ->dispatch When the hw queue is busy, we shouldn't take requests from the scheduler queue any more, otherwise it is difficult to do IO merge. This patch fixes the awful IO performance on some SCSI devices(lpfc, qla2xxx, ...) when mq-deadline/kyber is used by not taking requests if hw queue is busy. 
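In effect the dispatch path becomes the following loop (simplified sketch; the exact change is in the diff below):

	/*
	 * Keep pulling requests from the scheduler only while the driver
	 * accepts everything that has already been handed to it.
	 */
	do {
		rq = e->type->ops.mq.dispatch_request(hctx);
		if (!rq)
			break;
		list_add(&rq->queuelist, &rq_list);
	} while (blk_mq_dispatch_rq_list(q, &rq_list));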
Reviewed-by: Omar Sandoval Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4ab69435708c2..eca011fdfa0ed 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -94,7 +94,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool did_work = false; + bool do_sched_dispatch = true; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -125,18 +125,18 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - did_work = blk_mq_dispatch_rq_list(q, &rq_list); + do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list); } else if (!has_sched_dispatch) { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list); } /* - * We want to dispatch from the scheduler if we had no work left - * on the dispatch list, OR if we did have work but weren't able - * to make progress. + * We want to dispatch from the scheduler if there was nothing + * on the dispatch list or we were able to dispatch from the + * dispatch list. */ - if (!did_work && has_sched_dispatch) { + if (do_sched_dispatch && has_sched_dispatch) { do { struct request *rq; -- cgit v1.2.3 From caf8eb0d604a0eaeb8111eb4d36853a6d08eebe7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:26 +0800 Subject: blk-mq-sched: move actual dispatching into one helper So that it becomes easy to support to dispatch from sw queue in the following patch. No functional change. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Suggested-by: Christoph Hellwig # for simplifying dispatch logic Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index eca011fdfa0ed..be29ba8494085 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,12 +89,26 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } +static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct elevator_queue *e = q->elevator; + LIST_HEAD(rq_list); + + do { + struct request *rq = e->type->ops.mq.dispatch_request(hctx); + + if (!rq) + break; + list_add(&rq->queuelist, &rq_list); + } while (blk_mq_dispatch_rq_list(q, &rq_list)); +} + void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool do_sched_dispatch = true; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -122,30 +136,21 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. + * + * We want to dispatch from the scheduler if there was nothing + * on the dispatch list or we were able to dispatch from the + * dispatch list. 
*/ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list); - } else if (!has_sched_dispatch) { + if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch) + blk_mq_do_dispatch_sched(hctx); + } else if (has_sched_dispatch) { + blk_mq_do_dispatch_sched(hctx); + } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list); } - - /* - * We want to dispatch from the scheduler if there was nothing - * on the dispatch list or we were able to dispatch from the - * dispatch list. - */ - if (do_sched_dispatch && has_sched_dispatch) { - do { - struct request *rq; - - rq = e->type->ops.mq.dispatch_request(hctx); - if (!rq) - break; - list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list)); - } } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, -- cgit v1.2.3 From 7930d0a00ff5dbcc80f793d1a7a6b8de4e591f1a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:27 +0800 Subject: sbitmap: introduce __sbitmap_for_each_set() For blk-mq, we need to be able to iterate software queues starting from any queue in a round robin fashion, so introduce this helper. Reviewed-by: Omar Sandoval Cc: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 64 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index a1904aadbc450..0dcc60e820de7 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb); */ bool sbitmap_any_bit_clear(const struct sbitmap *sb); +#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) +#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) + typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** - * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. @@ -222,35 +226,61 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. */ -static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, - void *data) +static inline void __sbitmap_for_each_set(struct sbitmap *sb, + unsigned int start, + sb_for_each_fn fn, void *data) { - unsigned int i; + unsigned int index; + unsigned int nr; + unsigned int scanned = 0; - for (i = 0; i < sb->map_nr; i++) { - struct sbitmap_word *word = &sb->map[i]; - unsigned int off, nr; + if (start >= sb->depth) + start = 0; + index = SB_NR_TO_INDEX(sb, start); + nr = SB_NR_TO_BIT(sb, start); - if (!word->word) - continue; + while (scanned < sb->depth) { + struct sbitmap_word *word = &sb->map[index]; + unsigned int depth = min_t(unsigned int, word->depth - nr, + sb->depth - scanned); - nr = 0; - off = i << sb->shift; + scanned += depth; + if (!word->word) + goto next; + + /* + * On the first iteration of the outer loop, we need to add the + * bit offset back to the size of the word for find_next_bit(). + * On all other iterations, nr is zero, so this is a noop. 
+ */ + depth += nr; while (1) { - nr = find_next_bit(&word->word, word->depth, nr); - if (nr >= word->depth) + nr = find_next_bit(&word->word, depth, nr); + if (nr >= depth) break; - - if (!fn(sb, off + nr, data)) + if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } +next: + nr = 0; + if (++index >= sb->map_nr) + index = 0; } } -#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) -#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) +/** + * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @sb: Bitmap to iterate over. + * @fn: Callback. Should return true to continue or false to break early. + * @data: Pointer to pass to callback. + */ +static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, + void *data) +{ + __sbitmap_for_each_set(sb, 0, fn, data); +} static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) -- cgit v1.2.3 From 63ba8e31c3ac6393b07c6e18538814a730478766 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:28 +0800 Subject: block: kyber: check if there are requests in ctx in kyber_has_work() There may be request in sw queue, and not fetched to domain queue yet, so check it in kyber_has_work(). Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index db5bfc6342d34..b4df317c29169 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -649,7 +649,7 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) if (!list_empty_careful(&khd->rqs[i])) return true; } - return false; + return sbitmap_any_bit_set(&hctx->ctx_map); } #define KYBER_LAT_SHOW_STORE(op) \ -- cgit v1.2.3 From de1482974080ec9ef414bf048b2646b246b63f6e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:29 +0800 Subject: blk-mq: introduce .get_budget and .put_budget in blk_mq_ops For SCSI devices, there is often a per-request-queue depth, which needs to be respected before queuing one request. Currently blk-mq always dequeues the request first, then calls .queue_rq() to dispatch the request to lld. One obvious issue with this approach is that I/O merging may not be successful, because when the per-request-queue depth can't be respected, .queue_rq() has to return BLK_STS_RESOURCE, and then this request has to stay in hctx->dispatch list. This means it never gets a chance to be merged with other IO. This patch introduces .get_budget and .put_budget callback in blk_mq_ops, then we can try to get reserved budget first before dequeuing request. If the budget for queueing I/O can't be satisfied, we don't need to dequeue request at all. Hence the request can be left in the IO scheduler queue, for more merging opportunities. 
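For illustration, a driver with a simple per-device queue depth could wire up the new callbacks roughly as in the sketch below. The foo_* names, struct foo_device and its in_flight counter are hypothetical and not part of this series; SCSI's actual implementation follows in a later patch:

	#include <linux/blk-mq.h>
	#include <linux/atomic.h>

	struct foo_device {
		atomic_t	in_flight;	/* requests currently owned by the driver */
		int		queue_depth;	/* per-device limit to respect */
	};

	/* reserve one unit of budget before blk-mq dequeues a request for us */
	static blk_status_t foo_get_budget(struct blk_mq_hw_ctx *hctx)
	{
		struct foo_device *foo = hctx->queue->queuedata;

		if (atomic_inc_return(&foo->in_flight) > foo->queue_depth) {
			atomic_dec(&foo->in_flight);
			/* the request stays in the scheduler, so it can still be merged */
			return BLK_STS_RESOURCE;
		}
		return BLK_STS_OK;
	}

	/* release budget that was reserved but not consumed by ->queue_rq() */
	static void foo_put_budget(struct blk_mq_hw_ctx *hctx)
	{
		struct foo_device *foo = hctx->queue->queuedata;

		atomic_dec(&foo->in_flight);
	}

	static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
					 const struct blk_mq_queue_data *bd)
	{
		/*
		 * Hypothetical: hand bd->rq to the hardware. From this point the
		 * budget belongs to the request and is dropped on its completion.
		 */
		return BLK_STS_OK;
	}

	static const struct blk_mq_ops foo_mq_ops = {
		.get_budget	= foo_get_budget,
		.put_budget	= foo_put_budget,
		.queue_rq	= foo_queue_rq,
	};

As the new comment in blk-mq.h notes, once ->queue_rq() has run it is the driver's responsibility to release the reserved budget.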
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 55 +++++++++++++++++++++++++++++++++++++++++--------- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 43 ++++++++++++++++++++++++++++++++++----- block/blk-mq.h | 20 +++++++++++++++++- include/linux/blk-mq.h | 11 ++++++++++ 5 files changed, 114 insertions(+), 17 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index be29ba8494085..8e525e66a0d97 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,31 +89,57 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } -static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +/* return true if hctx need to run again */ +static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; LIST_HEAD(rq_list); do { - struct request *rq = e->type->ops.mq.dispatch_request(hctx); + struct request *rq; + blk_status_t ret; - if (!rq) + if (e->type->ops.mq.has_work && + !e->type->ops.mq.has_work(hctx)) break; + + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) + return true; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } else if (ret != BLK_STS_OK) { + blk_mq_end_request(rq, ret); + continue; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list)); + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + return false; } -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +/* return true if hw queue need to be run again */ +bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; LIST_HEAD(rq_list); + bool run_queue = false; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) - return; + return false; hctx->run++; @@ -143,14 +169,23 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list) && has_sched_dispatch) - blk_mq_do_dispatch_sched(hctx); + if (blk_mq_dispatch_rq_list(q, &rq_list, false) && + has_sched_dispatch) + run_queue = blk_mq_do_dispatch_sched(hctx); } else if (has_sched_dispatch) { - blk_mq_do_dispatch_sched(hctx); + run_queue = blk_mq_do_dispatch_sched(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(q, &rq_list); + blk_mq_dispatch_rq_list(q, &rq_list, false); } + + if (run_queue && !blk_mq_sched_needs_restart(hctx) && + !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) { + blk_mq_sched_mark_restart_hctx(hctx); + return true; + } + + return false; } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 9267d0b7c1978..2434061cc5b75 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -22,7 +22,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); -void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int 
blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); diff --git a/block/blk-mq.c b/block/blk-mq.c index 40cba1b1978f2..dcb467369999e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1048,7 +1048,8 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) return true; } -bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) +bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq; @@ -1057,6 +1058,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) if (list_empty(list)) return false; + WARN_ON(!list_is_singular(list) && got_budget); + /* * Now process all the entries, sending them to the driver. */ @@ -1074,16 +1077,30 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. */ - if (!blk_mq_dispatch_wait_add(hctx)) + if (!blk_mq_dispatch_wait_add(hctx)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); break; + } /* * It's possible that a tag was freed in the window * between the allocation failure and adding the * hardware queue to the wait queue. */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) + if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (got_budget) + blk_mq_put_dispatch_budget(hctx); + break; + } + } + + if (!got_budget) { + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) break; + if (ret != BLK_STS_OK) + goto fail_rq; } list_del_init(&rq->queuelist); @@ -1111,6 +1128,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) break; } + fail_rq: if (unlikely(ret != BLK_STS_OK)) { errors++; blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1169,6 +1187,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; + bool run_queue; /* * We should be running this queue from one of the CPUs that @@ -1185,15 +1204,18 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_sched_dispatch_requests(hctx); + run_queue = blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { might_sleep(); srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - blk_mq_sched_dispatch_requests(hctx); + run_queue = blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } + + if (run_queue) + blk_mq_run_hw_queue(hctx, true); } /* @@ -1582,6 +1604,13 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) { + blk_mq_put_driver_tag(rq); + goto insert; + } else if (ret != BLK_STS_OK) + goto fail_rq; + new_cookie = request_to_qc_t(hctx, rq); /* @@ -1598,6 +1627,7 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, __blk_mq_requeue_request(rq); goto insert; default: + fail_rq: *cookie = BLK_QC_T_NONE; blk_mq_end_request(rq, ret); return; @@ -2582,6 +2612,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->ops->queue_rq) return -EINVAL; + if (!set->ops->get_budget ^ !set->ops->put_budget) + return -EINVAL; + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth to %u\n", 
BLK_MQ_MAX_DEPTH); diff --git a/block/blk-mq.h b/block/blk-mq.h index ef15b3414da54..e413b732374e6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -30,7 +30,7 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *); +bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, @@ -137,4 +137,22 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->put_budget) + q->mq_ops->put_budget(hctx); +} + +static inline blk_status_t blk_mq_get_dispatch_budget( + struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + if (q->mq_ops->get_budget) + return q->mq_ops->get_budget(hctx); + return BLK_STS_OK; +} + #endif diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f5..901457df3d64d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -90,6 +90,8 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -111,6 +113,15 @@ struct blk_mq_ops { */ queue_rq_fn *queue_rq; + /* + * Reserve budget before queue request, once .queue_rq is + * run, it is driver's responsibility to release the + * reserved budget. Also we have to handle failure case + * of .get_budget for avoiding I/O deadlock. + */ + get_budget_fn *get_budget; + put_budget_fn *put_budget; + /* * Called on request timeout */ -- cgit v1.2.3 From b347689ffbca745ac457ee27400ce1affd571c6f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:30 +0800 Subject: blk-mq-sched: improve dispatching from sw queue SCSI devices use host-wide tagset, and the shared driver tag space is often quite big. However, there is also a queue depth for each lun( .cmd_per_lun), which is often small, for example, on both lpfc and qla2xxx, .cmd_per_lun is just 3. So lots of requests may stay in sw queue, and we always flush all belonging to same hw queue and dispatch them all to driver. Unfortunately it is easy to cause queue busy because of the small .cmd_per_lun. Once these requests are flushed out, they have to stay in hctx->dispatch, and no bio merge can happen on these requests, and sequential IO performance is harmed. This patch introduces blk_mq_dequeue_from_ctx for dequeuing a request from a sw queue, so that we can dispatch them in scheduler's way. We can then avoid dequeueing too many requests from sw queue, since we don't flush ->dispatch completely. This patch improves dispatching from sw queue by using the .get_budget and .put_budget callbacks. 
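For reference, the round-robin walk over the software queues relies on the __sbitmap_for_each_set() helper introduced earlier in this series. A minimal, hypothetical usage sketch (find_next_pending() and its callback are illustrative only, not kernel code) could look like this:

	#include <linux/sbitmap.h>

	struct first_set {
		unsigned int	bitnr;
		bool		found;
	};

	static bool record_first_set(struct sbitmap *sb, unsigned int bitnr, void *data)
	{
		struct first_set *fs = data;

		fs->bitnr = bitnr;
		fs->found = true;
		return false;	/* stop the iteration at the first set bit */
	}

	/*
	 * Find the first set bit at or after 'prev + 1', wrapping around, so that
	 * successive callers service the bits (i.e. the sw queues) round-robin.
	 */
	static bool find_next_pending(struct sbitmap *sb, unsigned int prev,
				      unsigned int *bitnr)
	{
		struct first_set fs = { .found = false };

		__sbitmap_for_each_set(sb, prev + 1, record_first_set, &fs);
		if (fs.found)
			*bitnr = fs.bitnr;
		return fs.found;
	}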
Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-- block/blk-mq.c | 39 ++++++++++++++++++++++++++ block/blk-mq.h | 2 ++ include/linux/blk-mq.h | 2 ++ 4 files changed, 114 insertions(+), 3 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 8e525e66a0d97..df8581bb0a370 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -128,6 +128,61 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) return false; } +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + unsigned idx = ctx->index_hw; + + if (++idx == hctx->nr_ctx) + idx = 0; + + return hctx->ctxs[idx]; +} + +/* return true if hctx need to run again */ +static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + LIST_HEAD(rq_list); + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + + do { + struct request *rq; + blk_status_t ret; + + if (!sbitmap_any_bit_set(&hctx->ctx_map)) + break; + + ret = blk_mq_get_dispatch_budget(hctx); + if (ret == BLK_STS_RESOURCE) + return true; + + rq = blk_mq_dequeue_from_ctx(hctx, ctx); + if (!rq) { + blk_mq_put_dispatch_budget(hctx); + break; + } else if (ret != BLK_STS_OK) { + blk_mq_end_request(rq, ret); + continue; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ + list_add(&rq->queuelist, &rq_list); + + /* round robin for fair dispatch */ + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); + + } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + + WRITE_ONCE(hctx->dispatch_from, ctx); + + return false; +} + /* return true if hw queue need to be run again */ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { @@ -169,11 +224,24 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list, false) && - has_sched_dispatch) - run_queue = blk_mq_do_dispatch_sched(hctx); + if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { + if (has_sched_dispatch) + run_queue = blk_mq_do_dispatch_sched(hctx); + else + run_queue = blk_mq_do_dispatch_ctx(hctx); + } } else if (has_sched_dispatch) { run_queue = blk_mq_do_dispatch_sched(hctx); + } else if (q->mq_ops->get_budget) { + /* + * If we need to get budget before queuing request, we + * dequeue request one by one from sw queue for avoiding + * to mess up I/O merge when dispatch runs out of resource. + * + * TODO: get more budgets, and dequeue more requests in + * one time. 
+ */ + run_queue = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); diff --git a/block/blk-mq.c b/block/blk-mq.c index dcb467369999e..097ca3ece716e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -914,6 +914,45 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) } EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); +struct dispatch_rq_data { + struct blk_mq_hw_ctx *hctx; + struct request *rq; +}; + +static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, + void *data) +{ + struct dispatch_rq_data *dispatch_data = data; + struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + spin_lock(&ctx->lock); + if (unlikely(!list_empty(&ctx->rq_list))) { + dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + list_del_init(&dispatch_data->rq->queuelist); + if (list_empty(&ctx->rq_list)) + sbitmap_clear_bit(sb, bitnr); + } + spin_unlock(&ctx->lock); + + return !dispatch_data->rq; +} + +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start) +{ + unsigned off = start ? start->index_hw : 0; + struct dispatch_rq_data data = { + .hctx = hctx, + .rq = NULL, + }; + + __sbitmap_for_each_set(&hctx->ctx_map, off, + dispatch_rq_from_ctx, &data); + + return data.rq; +} + static inline unsigned int queued_to_index(unsigned int queued) { if (!queued) diff --git a/block/blk-mq.h b/block/blk-mq.h index e413b732374e6..522b420dedc07 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start); /* * Internal helpers for allocating/freeing the request map diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 901457df3d64d..e5e6becd57d3c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -30,6 +30,8 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; + struct blk_mq_ctx *dispatch_from; + struct blk_mq_ctx **ctxs; unsigned int nr_ctx; -- cgit v1.2.3 From aeec77629a4ac6f8c248f3a82e80d4170a881f22 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:31 +0800 Subject: scsi: allow passing in null rq to scsi_prep_state_check() In the following patch, we will implement scsi_get_budget() which need to call scsi_prep_state_check() when rq isn't dequeued yet. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9cf6a80fe2975..d159bb0857144 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1301,7 +1301,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) /* * If the devices is blocked we defer normal commands. */ - if (!(req->rq_flags & RQF_PREEMPT)) + if (req && !(req->rq_flags & RQF_PREEMPT)) ret = BLKPREP_DEFER; break; default: @@ -1310,7 +1310,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) * special commands. In particular any user initiated * command is not allowed. 
*/ - if (!(req->rq_flags & RQF_PREEMPT)) + if (req && !(req->rq_flags & RQF_PREEMPT)) ret = BLKPREP_KILL; break; } -- cgit v1.2.3 From 0df21c86bdbfd17dec9ab898312af9bfb74d5d86 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:32 +0800 Subject: scsi: implement .get_budget and .put_budget for blk-mq We need to tell blk-mq to reserve resources before queuing one request, so implement these two callbacks. Then blk-mq can avoid to dequeue request too early, and IO merging can be improved a lot. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 75 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d159bb0857144..6f10afaca25b2 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1946,25 +1946,32 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) { - struct request *req = bd->rq; - struct request_queue *q = req->q; + struct request_queue *q = hctx->queue; + struct scsi_device *sdev = q->queuedata; + struct Scsi_Host *shost = sdev->host; + + atomic_dec(&shost->host_busy); + if (scsi_target(sdev)->can_queue > 0) + atomic_dec(&scsi_target(sdev)->target_busy); + atomic_dec(&sdev->device_busy); + put_device(&sdev->sdev_gendev); +} + +static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); blk_status_t ret; - int reason; - ret = prep_to_mq(scsi_prep_state_check(sdev, req)); - if (ret != BLK_STS_OK) - goto out; + ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); + if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK) + return ret; - ret = BLK_STS_RESOURCE; if (!get_device(&sdev->sdev_gendev)) goto out; - if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; if (!scsi_target_queue_ready(shost, sdev)) @@ -1972,10 +1979,38 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (!scsi_host_queue_ready(q, shost, sdev)) goto out_dec_target_busy; + return BLK_STS_OK; + +out_dec_target_busy: + if (scsi_target(sdev)->can_queue > 0) + atomic_dec(&scsi_target(sdev)->target_busy); +out_dec_device_busy: + atomic_dec(&sdev->device_busy); +out_put_device: + put_device(&sdev->sdev_gendev); +out: + return BLK_STS_RESOURCE; +} + +static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *req = bd->rq; + struct request_queue *q = req->q; + struct scsi_device *sdev = q->queuedata; + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); + blk_status_t ret; + int reason; + + ret = prep_to_mq(scsi_prep_state_check(sdev, req)); + if (ret != BLK_STS_OK) + goto out_put_budget; + + ret = BLK_STS_RESOURCE; if (!(req->rq_flags & RQF_DONTPREP)) { ret = prep_to_mq(scsi_mq_prep_fn(req)); if (ret != BLK_STS_OK) - goto out_dec_host_busy; + goto out_put_budget; req->rq_flags |= RQF_DONTPREP; } else { blk_mq_start_request(req); @@ -1993,21 +2028,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (reason) { scsi_set_blocked(cmd, reason); ret = BLK_STS_RESOURCE; - goto out_dec_host_busy; + goto out_put_budget; } return BLK_STS_OK; -out_dec_host_busy: - atomic_dec(&shost->host_busy); 
-out_dec_target_busy: - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); -out_dec_device_busy: - atomic_dec(&sdev->device_busy); -out_put_device: - put_device(&sdev->sdev_gendev); -out: +out_put_budget: + scsi_mq_put_budget(hctx); switch (ret) { case BLK_STS_OK: break; @@ -2211,6 +2238,8 @@ struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev) } static const struct blk_mq_ops scsi_mq_ops = { + .get_budget = scsi_mq_get_budget, + .put_budget = scsi_mq_put_budget, .queue_rq = scsi_queue_rq, .complete = scsi_softirq_done, .timeout = scsi_timeout, -- cgit v1.2.3 From 358a3a6bccb74da9d63a26b2dd5f09f1e9970e0b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Oct 2017 12:43:29 +0800 Subject: blk-mq: don't handle TAG_SHARED in restart Now restart is used in the following cases, and TAG_SHARED is for SCSI only. 1) .get_budget() returns BLK_STS_RESOURCE - if a target/host level resource isn't satisfied, this SCSI device is added to shost->starved_list, and the whole queue is rerun (via SCSI's built-in RESTART) in scsi_end_request() after any request initiated from this host/target completes. Note that host level resources can't be an issue for blk-mq at all. - the same is true if a queue level resource isn't satisfied. - if there is no outstanding request on this queue, then SCSI's RESTART can't work (and neither can blk-mq's), so the queue is run after SCSI_QUEUE_DELAY, and finally all starved sdevs are handled by SCSI's RESTART when the request finishes 2) scsi_dispatch_cmd() returns BLK_STS_RESOURCE - if there is no in-progress request on this queue, the queue is run after SCSI_QUEUE_DELAY - otherwise, SCSI's RESTART covers the rerun. 3) blk_mq_get_driver_tag() failed - BLK_MQ_S_TAG_WAITING covers the cross-queue RESTART for driver tag allocation. In short, SCSI's built-in RESTART is enough to cover the queue rerun, and we don't need to pay special attention to TAG_SHARED wrt. restart. In my test on scsi_debug (8 LUNs), this patch improves IOPS by 20% ~ 30% when running I/O on these 8 LUNs concurrently. Also, Roman Pen reported that the current RESTART is very expensive, especially when there are lots of LUNs attached to one host; in his test, RESTART cut IOPS in half.
Fixes: https://marc.info/?l=linux-kernel&m=150832216727524&w=2 Fixes: 6d8c6c0f97ad ("blk-mq: Restart a single queue if tag sets are shared") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 78 +++------------------------------------------------- 1 file changed, 4 insertions(+), 74 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index df8581bb0a370..daab27feb6539 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,25 +68,17 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return false; - - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; + return; - if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); - } else - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); - return true; + return; } - - return false; } /* return true if hctx need to run again */ @@ -385,68 +377,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } -/** - * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list - * @pos: loop cursor. - * @skip: the list element that will not be examined. Iteration starts at - * @skip->next. - * @head: head of the list to examine. This list must have at least one - * element, namely @skip. - * @member: name of the list_head structure within typeof(*pos). - */ -#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ - for ((pos) = (skip); \ - (pos = (pos)->member.next != (head) ? list_entry_rcu( \ - (pos)->member.next, typeof(*pos), member) : \ - list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ - (pos) != (skip); ) - -/* - * Called after a driver tag has been freed to check whether a hctx needs to - * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware - * queues in a round-robin fashion if the tag set of @hctx is shared with other - * hardware queues. - */ -void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) -{ - struct blk_mq_tags *const tags = hctx->tags; - struct blk_mq_tag_set *const set = hctx->queue->tag_set; - struct request_queue *const queue = hctx->queue, *q; - struct blk_mq_hw_ctx *hctx2; - unsigned int i, j; - - if (set->flags & BLK_MQ_F_TAG_SHARED) { - /* - * If this is 0, then we know that no hardware queues - * have RESTART marked. We're done. - */ - if (!atomic_read(&queue->shared_hctx_restart)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu_rr(q, queue, &set->tag_list, - tag_set_list) { - queue_for_each_hw_ctx(q, hctx2, i) - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - goto done; - } - j = hctx->queue_num + 1; - for (i = 0; i < queue->nr_hw_queues; i++, j++) { - if (j == queue->nr_hw_queues) - j = 0; - hctx2 = queue->queue_hw_ctx[j]; - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - break; - } -done: - rcu_read_unlock(); - } else { - blk_mq_sched_restart_hctx(hctx); - } -} - /* * Add flush/fua to the queue. If we fail getting a driver tag, then * punt to the requeue list. 
Requeue will re-invoke us from a context -- cgit v1.2.3 From 1f460b63d4b37f504d8d0affc2cd492eb005ea97 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 27 Oct 2017 12:43:30 +0800 Subject: blk-mq: don't restart queue when .get_budget returns BLK_STS_RESOURCE SCSI restarts its queue in scsi_end_request() automatically, so we don't need to handle this case in blk-mq. In particular, no request will have been dequeued in this case, so we needn't worry about an IO hang caused by restart vs. dispatch. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 45 ++++++++++++++++++++------------------------- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 8 ++------ 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index daab27feb6539..7775f6b12fa94 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -81,8 +81,12 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) } } -/* return true if hctx need to run again */ -static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + */ +static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; @@ -98,7 +102,7 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) ret = blk_mq_get_dispatch_budget(hctx); if (ret == BLK_STS_RESOURCE) - return true; + break; rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) { @@ -116,8 +120,6 @@ static bool blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) */ list_add(&rq->queuelist, &rq_list); } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); - - return false; } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, @@ -131,8 +133,12 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, return hctx->ctxs[idx]; } -/* return true if hctx need to run again */ -static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ */ +static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); @@ -147,7 +153,7 @@ static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) ret = blk_mq_get_dispatch_budget(hctx); if (ret == BLK_STS_RESOURCE) - return true; + break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { @@ -171,22 +177,19 @@ static bool blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); WRITE_ONCE(hctx->dispatch_from, ctx); - - return false; } /* return true if hw queue need to be run again */ -bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; LIST_HEAD(rq_list); - bool run_queue = false; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) - return false; + return; hctx->run++; @@ -218,12 +221,12 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) blk_mq_sched_mark_restart_hctx(hctx); if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { if (has_sched_dispatch) - run_queue = blk_mq_do_dispatch_sched(hctx); + blk_mq_do_dispatch_sched(hctx); else - run_queue = blk_mq_do_dispatch_ctx(hctx); + blk_mq_do_dispatch_ctx(hctx); } } else if (has_sched_dispatch) { - run_queue = blk_mq_do_dispatch_sched(hctx); + blk_mq_do_dispatch_sched(hctx); } else if (q->mq_ops->get_budget) { /* * If we need to get budget before queuing request, we @@ -233,19 +236,11 @@ bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * TODO: get more budgets, and dequeue more requests in * one time. 
*/ - run_queue = blk_mq_do_dispatch_ctx(hctx); + blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list, false); } - - if (run_queue && !blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) { - blk_mq_sched_mark_restart_hctx(hctx); - return true; - } - - return false; } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 2434061cc5b75..9267d0b7c1978 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -22,7 +22,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); -bool blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); diff --git a/block/blk-mq.c b/block/blk-mq.c index 097ca3ece716e..e4d2490f4e7ee 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1226,7 +1226,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; - bool run_queue; /* * We should be running this queue from one of the CPUs that @@ -1243,18 +1242,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - run_queue = blk_mq_sched_dispatch_requests(hctx); + blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { might_sleep(); srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - run_queue = blk_mq_sched_dispatch_requests(hctx); + blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); } - - if (run_queue) - blk_mq_run_hw_queue(hctx, true); } /* -- cgit v1.2.3 From 2a750166a5bee7fda186c12269e37ba52b26c017 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Oct 2017 09:02:19 -0700 Subject: block: Rework drivers/cdrom/Makefile Instead of referring from inside drivers/cdrom/Makefile to all the drivers that use this driver, let these drivers select the cdrom driver. This change makes the cdrom build code follow the approach that is used for most other drivers, namely refer from the higher layers to the lower layer instead of from the lower layer to the higher layers. Reviewed-by: Hannes Reinecke Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 12 ++++++++++++ drivers/block/paride/Kconfig | 1 + drivers/cdrom/Makefile | 15 ++------------- drivers/ide/Kconfig | 1 + drivers/scsi/Kconfig | 1 + 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 4a438b8abe27a..b998b3c9ea860 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -67,9 +67,20 @@ config AMIGA_Z2RAM To compile this driver as a module, choose M here: the module will be called z2ram. +config CDROM + tristate "CD-ROM driver" + help + A CD-ROM is a pre-pressed optical compact disc which contains + data. The name is an acronym which stands for "Compact Disc + Read-Only Memory". This driver implements functionality that is + shared by all CD-ROM drivers, e.g. the IDE CD-ROM driver and the + SCSI CD-ROM driver. For a list of the ioctls implemented by this + driver, see also Documentation/ioctl/cdrom.txt. 
+ config GDROM tristate "SEGA Dreamcast GD-ROM drive" depends on SH_DREAMCAST + select CDROM select BLK_SCSI_REQUEST # only for the generic cdrom code help A standard SEGA Dreamcast comes with a modified CD ROM drive called a @@ -347,6 +358,7 @@ config BLK_DEV_RAM_DAX config CDROM_PKTCDVD tristate "Packet writing on CD/DVD media (DEPRECATED)" depends on !UML + select CDROM select BLK_SCSI_REQUEST help Note: This driver is deprecated and will be removed from the diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig index 3a15247942e41..1d5057f5080ba 100644 --- a/drivers/block/paride/Kconfig +++ b/drivers/block/paride/Kconfig @@ -25,6 +25,7 @@ config PARIDE_PD config PARIDE_PCD tristate "Parallel port ATAPI CD-ROMs" depends on PARIDE + select CDROM select BLK_SCSI_REQUEST # only for the generic cdrom code ---help--- This option enables the high-level driver for ATAPI CD-ROM devices diff --git a/drivers/cdrom/Makefile b/drivers/cdrom/Makefile index 8ffde4f8ab9ae..7f3f43cc22574 100644 --- a/drivers/cdrom/Makefile +++ b/drivers/cdrom/Makefile @@ -1,13 +1,2 @@ -# Makefile for the kernel cdrom device drivers. -# -# 30 Jan 1998, Michael Elizabeth Chastain, -# Rewritten to use lists instead of if-statements. - -# Each configuration option enables a list of files. - -obj-$(CONFIG_BLK_DEV_IDECD) += cdrom.o -obj-$(CONFIG_BLK_DEV_SR) += cdrom.o -obj-$(CONFIG_PARIDE_PCD) += cdrom.o -obj-$(CONFIG_CDROM_PKTCDVD) += cdrom.o - -obj-$(CONFIG_GDROM) += gdrom.o cdrom.o +obj-$(CONFIG_CDROM) += cdrom.o +obj-$(CONFIG_GDROM) += gdrom.o diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index c99a25c075bcb..7b92c591e4c11 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -118,6 +118,7 @@ config BLK_DEV_DELKIN config BLK_DEV_IDECD tristate "Include IDE/ATAPI CDROM support" select IDE_ATAPI + select CDROM ---help--- If you have a CD-ROM drive using the ATAPI protocol, say Y. ATAPI is a newer protocol used by IDE CD-ROM and TAPE drives, similar to the diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 41366339b9501..cca0b03c2eadf 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -131,6 +131,7 @@ config CHR_DEV_OSST config BLK_DEV_SR tristate "SCSI CDROM support" depends on SCSI + select CDROM ---help--- If you want to use a CD or DVD drive attached to your computer by SCSI, FireWire, USB or ATAPI, say Y and read the SCSI-HOWTO -- cgit v1.2.3 From 71c691fd06cc2625966620c93ce21bdcce32ed95 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 28 Sep 2017 09:56:31 -0700 Subject: nvme-fc: avoid workqueue flush stalls There's no need to wait for the full nvme_wq, which is now shared, to flush. flush only the delete_work item. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b600c07803cf3..50cc17e99475e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2693,7 +2693,7 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) nvme_get_ctrl(&ctrl->ctrl); ret = __nvme_fc_del_ctrl(ctrl); if (!ret) - flush_workqueue(nvme_wq); + flush_work(&ctrl->delete_work); nvme_put_ctrl(&ctrl->ctrl); return ret; -- cgit v1.2.3 From 29c09648734b94e42802f021658a47361169403b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:28 +0200 Subject: nvme-fc: merge __nvme_fc_schedule_delete_work into __nvme_fc_del_ctrl No need to have two functions doing the same thing. 
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 50cc17e99475e..827e9efbe5564 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2663,22 +2663,14 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) nvme_put_ctrl(&ctrl->ctrl); } -static bool -__nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return true; - - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return true; - - return false; -} - static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl) { - return __nvme_fc_schedule_delete_work(ctrl) ? -EBUSY : 0; + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + if (!queue_work(nvme_wq, &ctrl->delete_work)) + return -EBUSY; + return 0; } /* @@ -2724,7 +2716,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.nr_reconnects); - WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); + WARN_ON(__nvme_fc_del_ctrl(ctrl)); } } -- cgit v1.2.3 From c5017e85705bfea721732e153305d1988ff965c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:29 +0200 Subject: nvme: move controller deletion to common code Move the ->delete_work and the associated helpers to common code instead of duplicating them in every driver. This also adds the missing reference get/put for the loop driver. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 38 ++++++++++++++++++++++++++++++++++- drivers/nvme/host/fabrics.c | 2 +- drivers/nvme/host/fc.c | 42 +++++---------------------------------- drivers/nvme/host/nvme.h | 5 ++++- drivers/nvme/host/rdma.c | 45 ++++++------------------------------------ drivers/nvme/target/loop.c | 48 +++++++++------------------------------------ 6 files changed, 62 insertions(+), 118 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index df525ab42fcd1..d835ac05bbf7b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -97,6 +97,41 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) return ret; } +static void nvme_delete_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, delete_work); + + ctrl->ops->delete_ctrl(ctrl); +} + +int nvme_delete_ctrl(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + if (!queue_work(nvme_wq, &ctrl->delete_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_delete_ctrl); + +int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +{ + int ret = 0; + + /* + * Keep a reference until the work is flushed since ->delete_ctrl + * can free the controller. 
+ */ + nvme_get_ctrl(ctrl); + ret = nvme_delete_ctrl(ctrl); + if (!ret) + flush_work(&ctrl->delete_work); + nvme_put_ctrl(ctrl); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); + static blk_status_t nvme_error_status(struct request *req) { switch (nvme_req(req)->status & 0x7ff) { @@ -2122,7 +2157,7 @@ static ssize_t nvme_sysfs_delete(struct device *dev, struct nvme_ctrl *ctrl = dev_get_drvdata(dev); if (device_remove_file_self(dev, attr)) - ctrl->ops->delete_ctrl(ctrl); + nvme_delete_ctrl_sync(ctrl); return count; } static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); @@ -2702,6 +2737,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->scan_work, nvme_scan_work); INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); + INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); if (ret < 0) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 4a83137b02684..a4967de5bb255 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -887,7 +887,7 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) "controller returned incorrect NQN: \"%s\".\n", ctrl->subnqn); up_read(&nvmf_transports_rwsem); - ctrl->ops->delete_ctrl(ctrl); + nvme_delete_ctrl_sync(ctrl); return ERR_PTR(-EINVAL); } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 827e9efbe5564..a7bdb17de29d4 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -157,7 +157,6 @@ struct nvme_fc_ctrl { struct blk_mq_tag_set admin_tag_set; struct blk_mq_tag_set tag_set; - struct work_struct delete_work; struct delayed_work connect_work; struct kref ref; @@ -223,7 +222,6 @@ static struct device *fc_udev_device; /* *********************** FC-NVME Port Management ************************ */ -static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *); static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, struct nvme_fc_queue *, unsigned int); @@ -662,7 +660,7 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) /* tear down all associations to the remote port */ list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) - __nvme_fc_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); spin_unlock_irqrestore(&rport->lock, flags); @@ -2636,10 +2634,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) } static void -nvme_fc_delete_ctrl_work(struct work_struct *work) +nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { - struct nvme_fc_ctrl *ctrl = - container_of(work, struct nvme_fc_ctrl, delete_work); + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); @@ -2663,34 +2660,6 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) nvme_put_ctrl(&ctrl->ctrl); } -static int -__nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return -EBUSY; - return 0; -} - -/* - * Request from nvme core layer to delete the controller - */ -static int -nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - int ret; - - nvme_get_ctrl(&ctrl->ctrl); - ret = __nvme_fc_del_ctrl(ctrl); - if (!ret) - flush_work(&ctrl->delete_work); - nvme_put_ctrl(&ctrl->ctrl); - - return ret; -} - static void nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) 
{ @@ -2716,7 +2685,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.nr_reconnects); - WARN_ON(__nvme_fc_del_ctrl(ctrl)); + WARN_ON(nvme_delete_ctrl(&ctrl->ctrl)); } } @@ -2748,7 +2717,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .reg_write32 = nvmf_reg_write32, .free_ctrl = nvme_fc_nvme_ctrl_freed, .submit_async_event = nvme_fc_submit_async_event, - .delete_ctrl = nvme_fc_del_nvme_ctrl, + .delete_ctrl = nvme_fc_delete_ctrl, .get_address = nvmf_get_address, .reinit_request = nvme_fc_reinit_request, }; @@ -2851,7 +2820,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, get_device(ctrl->dev); kref_init(&ctrl->ref); - INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); spin_lock_init(&ctrl->lock); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1bb2bc165e543..35d9cee515f1b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -138,6 +138,7 @@ struct nvme_ctrl { struct cdev cdev; struct ida ns_ida; struct work_struct reset_work; + struct work_struct delete_work; struct opal_dev *opal_dev; @@ -236,7 +237,7 @@ struct nvme_ctrl_ops { int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); - int (*delete_ctrl)(struct nvme_ctrl *ctrl); + void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); int (*reinit_request)(void *data, struct request *rq); }; @@ -339,6 +340,8 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_delete_ctrl(struct nvme_ctrl *ctrl); +int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 32b0a9ef26e67..5175b465997d0 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -105,7 +105,6 @@ struct nvme_rdma_ctrl { /* other member variables */ struct blk_mq_tag_set tag_set; - struct work_struct delete_work; struct work_struct err_work; struct nvme_rdma_qe async_event_sqe; @@ -913,7 +912,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_info(ctrl->ctrl.device, "Removing controller...\n"); - queue_work(nvme_wq, &ctrl->delete_work); + queue_work(nvme_wq, &ctrl->ctrl.delete_work); } } @@ -1764,41 +1763,10 @@ static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) nvme_put_ctrl(&ctrl->ctrl); } -static void nvme_rdma_del_ctrl_work(struct work_struct *work) +static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { - struct nvme_rdma_ctrl *ctrl = container_of(work, - struct nvme_rdma_ctrl, delete_work); - - nvme_stop_ctrl(&ctrl->ctrl); - nvme_rdma_remove_ctrl(ctrl); -} - -static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return -EBUSY; - - return 0; -} - -static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) -{ - struct 
nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - int ret = 0; - - /* - * Keep a reference until all work is flushed since - * __nvme_rdma_del_ctrl can free the ctrl mem - */ - nvme_get_ctrl(&ctrl->ctrl); - ret = __nvme_rdma_del_ctrl(ctrl); - if (!ret) - flush_work(&ctrl->delete_work); - nvme_put_ctrl(&ctrl->ctrl); - return ret; + nvme_stop_ctrl(ctrl); + nvme_rdma_remove_ctrl(to_rdma_ctrl(ctrl)); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) @@ -1846,7 +1814,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reg_write32 = nvmf_reg_write32, .free_ctrl = nvme_rdma_free_ctrl, .submit_async_event = nvme_rdma_submit_async_event, - .delete_ctrl = nvme_rdma_del_ctrl, + .delete_ctrl = nvme_rdma_delete_ctrl, .get_address = nvmf_get_address, .reinit_request = nvme_rdma_reinit_request, }; @@ -1977,7 +1945,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, INIT_DELAYED_WORK(&ctrl->reconnect_work, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); - INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ @@ -2081,7 +2048,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) dev_info(ctrl->ctrl.device, "Removing ctrl: NQN \"%s\", addr %pISp\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - __nvme_rdma_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); } mutex_unlock(&nvme_rdma_ctrl_mutex); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f83e925fe64a6..7f9f3fc3fb2a8 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -53,7 +53,6 @@ struct nvme_loop_ctrl { struct nvme_ctrl ctrl; struct nvmet_ctrl *target_ctrl; - struct work_struct delete_work; }; static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) @@ -439,41 +438,13 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_loop_destroy_admin_queue(ctrl); } -static void nvme_loop_del_ctrl_work(struct work_struct *work) +static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl) { - struct nvme_loop_ctrl *ctrl = container_of(work, - struct nvme_loop_ctrl, delete_work); - - nvme_stop_ctrl(&ctrl->ctrl); - nvme_remove_namespaces(&ctrl->ctrl); - nvme_loop_shutdown_ctrl(ctrl); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); -} - -static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return -EBUSY; - - return 0; -} - -static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); - int ret; - - ret = __nvme_loop_del_ctrl(ctrl); - if (ret) - return ret; - - flush_work(&ctrl->delete_work); - - return 0; + nvme_stop_ctrl(ctrl); + nvme_remove_namespaces(ctrl); + nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl)); + nvme_uninit_ctrl(ctrl); + nvme_put_ctrl(ctrl); } static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) @@ -483,7 +454,7 @@ static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) mutex_lock(&nvme_loop_ctrl_mutex); list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { if (ctrl->ctrl.cntlid == nctrl->cntlid) - __nvme_loop_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); } mutex_unlock(&nvme_loop_ctrl_mutex); } @@ -539,7 +510,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .reg_write32 = 
nvmf_reg_write32, .free_ctrl = nvme_loop_free_ctrl, .submit_async_event = nvme_loop_submit_async_event, - .delete_ctrl = nvme_loop_del_ctrl, + .delete_ctrl = nvme_loop_delete_ctrl_host, }; static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) @@ -601,7 +572,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, ctrl->ctrl.opts = opts; INIT_LIST_HEAD(&ctrl->list); - INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, @@ -731,7 +701,7 @@ static void __exit nvme_loop_cleanup_module(void) mutex_lock(&nvme_loop_ctrl_mutex); list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) - __nvme_loop_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); mutex_unlock(&nvme_loop_ctrl_mutex); flush_workqueue(nvme_wq); -- cgit v1.2.3 From e9bc25874c0bde47b65c58ccd01e339a603a7f40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:30 +0200 Subject: nvme-rdma: remove nvme_rdma_remove_ctrl It is only used in two places, and some of the work done by it will be taken into common code soon. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5175b465997d0..a3521b852ea85 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1755,18 +1755,13 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_rdma_destroy_admin_queue(ctrl, shutdown); } -static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) -{ - nvme_remove_namespaces(&ctrl->ctrl); - nvme_rdma_shutdown_ctrl(ctrl, true); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); -} - static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { nvme_stop_ctrl(ctrl); - nvme_rdma_remove_ctrl(to_rdma_ctrl(ctrl)); + nvme_remove_namespaces(ctrl); + nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); + nvme_uninit_ctrl(ctrl); + nvme_put_ctrl(ctrl); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) @@ -1802,7 +1797,10 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) out_fail: dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - nvme_rdma_remove_ctrl(ctrl); + nvme_remove_namespaces(&ctrl->ctrl); + nvme_rdma_shutdown_ctrl(ctrl, true); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { -- cgit v1.2.3 From 6cd53d14aaa006b5543f06fbf5e1680ce61c6c6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:31 +0200 Subject: nvme: consolidate common code from ->reset_work No change in behavior except that the FC code cancels two work items a little later now. 
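For reference, the consolidated delete path in core.c ends up looking roughly like this (a sketch assembled from the hunks below, not a verbatim copy of the file):

static void nvme_delete_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, delete_work);

	nvme_stop_ctrl(ctrl);
	nvme_remove_namespaces(ctrl);
	ctrl->ops->delete_ctrl(ctrl);	/* transport-specific teardown only */
	nvme_uninit_ctrl(ctrl);
	nvme_put_ctrl(ctrl);
}

The per-transport ->delete_ctrl callbacks are left with only the pieces that genuinely differ per transport.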
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 ++++ drivers/nvme/host/fc.c | 13 ------------- drivers/nvme/host/rdma.c | 4 ---- drivers/nvme/target/loop.c | 4 ---- 4 files changed, 4 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d835ac05bbf7b..4fa748c9a3f6d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,7 +102,11 @@ static void nvme_delete_ctrl_work(struct work_struct *work) struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, delete_work); + nvme_stop_ctrl(ctrl); + nvme_remove_namespaces(ctrl); ctrl->ops->delete_ctrl(ctrl); + nvme_uninit_ctrl(ctrl); + nvme_put_ctrl(ctrl); } int nvme_delete_ctrl(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index a7bdb17de29d4..e447b532b9ee8 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2640,24 +2640,11 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); - nvme_stop_ctrl(&ctrl->ctrl); - nvme_remove_namespaces(&ctrl->ctrl); /* * kill the association on the link side. this will block * waiting for io to terminate */ nvme_fc_delete_association(ctrl); - - /* - * tear down the controller - * After the last reference on the nvme ctrl is removed, - * the transport nvme_fc_nvme_ctrl_freed() callback will be - * invoked. From there, the transport will tear down it's - * logical queues and association. - */ - nvme_uninit_ctrl(&ctrl->ctrl); - - nvme_put_ctrl(&ctrl->ctrl); } static void diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a3521b852ea85..ed6e05018a922 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1757,11 +1757,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { - nvme_stop_ctrl(ctrl); - nvme_remove_namespaces(ctrl); nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); - nvme_uninit_ctrl(ctrl); - nvme_put_ctrl(ctrl); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 7f9f3fc3fb2a8..bc95c6ed531af 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -440,11 +440,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl) { - nvme_stop_ctrl(ctrl); - nvme_remove_namespaces(ctrl); nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl)); - nvme_uninit_ctrl(ctrl); - nvme_put_ctrl(ctrl); } static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) -- cgit v1.2.3 From 12fa1304badde26c5cf495d5af48df22a31c3a52 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 29 Oct 2017 14:21:01 +0200 Subject: nvme-rdma: reuse nvme_delete_ctrl when reconnect attempts expire instead of just queueing delete work Reported-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ed6e05018a922..03644ecf68d29 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -912,7 +912,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_info(ctrl->ctrl.device, "Removing 
controller...\n"); - queue_work(nvme_wq, &ctrl->ctrl.delete_work); + nvme_delete_ctrl(&ctrl->ctrl); } } -- cgit v1.2.3 From 4054637c9b4fbe9feef0cf6f2516ef00d8053560 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 29 Oct 2017 14:21:02 +0200 Subject: nvme: flush reset_work before safely continuing with delete operation Prevent racing controller reset and delete flows. reset_work must not ever self-requeue so flushing it suffices. Reported-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/fc.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4fa748c9a3f6d..003314eb6341a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,6 +102,7 @@ static void nvme_delete_ctrl_work(struct work_struct *work) struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, delete_work); + flush_work(&ctrl->reset_work); nvme_stop_ctrl(ctrl); nvme_remove_namespaces(ctrl); ctrl->ops->delete_ctrl(ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e447b532b9ee8..6a025a8d8c32c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2638,7 +2638,6 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); /* * kill the association on the link side. this will block -- cgit v1.2.3 From 44c6ec77e12c387aaba420b30a54b94966f0d9e8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:14 -0700 Subject: nvme-fc: change ctlr state assignments during reset/reconnect Clean up some of the controller state checks and add the RESETTING->RECONNECTING state transition. Specifically: - the movement of the RESETTING state change and schedule of reset_work to core doesn't work with nvme_fc_error_recovery setting state to RECONNECTING before attempting to reset. Remove the state change as the reset request does it. - In the rare cases where an error occurs right as we're transitioning to LIVE, defer the controller start actions. - In error handling on teardown of associations while performing initial controller creation - avoid quiesce calls on the admin_q. They are unneeded. - Add the RESETTING->RECONNECTING transition in the reset handler.
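As an illustration of the deferred start, the tail of nvme_fc_create_association() reduces to roughly the following after this patch (a sketch derived from the hunks below):

	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);

	ctrl->ctrl.nr_reconnects = 0;

	if (changed)
		nvme_start_ctrl(&ctrl->ctrl);	/* start only if we really made it to LIVE */

	return 0;	/* Success */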
Signed-off-by: James Smart Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6a025a8d8c32c..e37c69f7921d3 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1884,13 +1884,6 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: resetting controller\n", ctrl->cnum); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { - dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: error_recovery: Couldn't change state " - "to RECONNECTING\n", ctrl->cnum); - return; - } - nvme_reset_ctrl(&ctrl->ctrl); } @@ -2528,11 +2521,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); ctrl->ctrl.nr_reconnects = 0; - nvme_start_ctrl(&ctrl->ctrl); + if (changed) + nvme_start_ctrl(&ctrl->ctrl); return 0; /* Success */ @@ -2600,7 +2593,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) * use blk_mq_tagset_busy_itr() and the transport routine to * terminate the exchanges. */ - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + if (ctrl->ctrl.state != NVME_CTRL_NEW) + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -2649,12 +2643,8 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) static void nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) { - /* If we are resetting/deleting then do nothing */ - if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { - WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || - ctrl->ctrl.state == NVME_CTRL_LIVE); + if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) return; - } dev_info(ctrl->ctrl.device, "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", @@ -2683,9 +2673,17 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) int ret; nvme_stop_ctrl(&ctrl->ctrl); + /* will block will waiting for io to terminate */ nvme_fc_delete_association(ctrl); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: error_recovery: Couldn't change state " + "to RECONNECTING\n", ctrl->cnum); + return; + } + ret = nvme_fc_create_association(ctrl); if (ret) nvme_fc_reconnect_or_delete(ctrl, ret); -- cgit v1.2.3 From ac7fe82b6fcf77e757e88005c33b8147c1b7b73f Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:15 -0700 Subject: nvme-fc: add a dev_loss_tmo field to the remoteport Add a dev_loss_tmo value, paralleling the SCSI FC transport, for device connectivity loss. The transport initializes the value in the nvme_fc_register_remoteport() call. If the value is not set, a default of 60s is set. Add a new routine to the api, nvme_fc_set_remoteport_devloss() routine, which allows the lldd to dynamically update the value on an existing remoteport. 
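A minimal sketch of how an LLDD would use the new field and routine follows; the wwnn/wwpn/nport_id variables and the 30 and 120 second values are made-up placeholders, not part of this patch:

	struct nvme_fc_port_info pinfo = {
		.node_name	= wwnn,
		.port_name	= wwpn,
		.port_id	= nport_id,
		.dev_loss_tmo	= 30,	/* 0 here selects the 60s default */
	};

	ret = nvme_fc_register_remoteport(localport, &pinfo, &remoteport);
	...
	/* later, adjust the timeout on the existing remoteport */
	ret = nvme_fc_set_remoteport_devloss(remoteport, 120);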
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 31 +++++++++++++++++++++++++++++++ include/linux/nvme-fc-driver.h | 11 +++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e37c69f7921d3..25479d3031fad 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -45,6 +45,8 @@ enum nvme_fc_queue_flags { #define NVMEFC_QUEUE_DELAY 3 /* ms units */ +#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */ + struct nvme_fc_queue { struct nvme_fc_ctrl *ctrl; struct device *dev; @@ -585,6 +587,11 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, newrec->remoteport.port_id = pinfo->port_id; newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; newrec->remoteport.port_num = idx; + /* a registration value of dev_loss_tmo=0 results in the default */ + if (pinfo->dev_loss_tmo) + newrec->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; + else + newrec->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; spin_lock_irqsave(&nvme_fc_lock, flags); list_add_tail(&newrec->endp_list, &lport->endp_list); @@ -688,6 +695,30 @@ nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) } EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); +int +nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr, + u32 dev_loss_tmo) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + + if (portptr->port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + return -EINVAL; + } + + /* a dev_loss_tmo of 0 (immediate) is allowed to be set */ + rport->remoteport.dev_loss_tmo = dev_loss_tmo; + + spin_unlock_irqrestore(&rport->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss); + /* *********************** FC-NVME DMA Handling **************************** */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 2be4db3539379..496ff759f84c6 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -40,6 +40,8 @@ * @node_name: FC WWNN for the port * @port_name: FC WWPN for the port * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. Used only on a remoteport. * * Initialization values for dynamic port fields: * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must @@ -50,6 +52,7 @@ struct nvme_fc_port_info { u64 port_name; u32 port_role; u32 port_id; + u32 dev_loss_tmo; }; @@ -200,6 +203,9 @@ enum nvme_fc_obj_state { * The length of the buffer corresponds to the local_priv_sz * value specified in the nvme_fc_port_template supplied by * the LLDD. + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. To modify, lldd must call + * nvme_fc_set_remoteport_devloss(). * * Fields with dynamic values. Values may change base on link state. LLDD * may reference fields directly to change them. 
Initialized by the @@ -257,10 +263,9 @@ struct nvme_fc_remote_port { u32 port_role; u64 node_name; u64 port_name; - struct nvme_fc_local_port *localport; - void *private; + u32 dev_loss_tmo; /* dynamic fields */ u32 port_id; @@ -446,6 +451,8 @@ int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); +int nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *remoteport, + u32 dev_loss_tmo); /* -- cgit v1.2.3 From 96e24801056467c6483f68ef71d642fb925c3100 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:16 -0700 Subject: nvme-fc: check connectivity before initiating reconnects Check remoteport connectivity before initiating reconnects Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 25479d3031fad..b64cc10024520 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -808,7 +808,6 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, dma_unmap_sg(dev, sg, nents, dir); } - /* *********************** FC-NVME LS Handling **************************** */ static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *); @@ -2462,6 +2461,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ++ctrl->ctrl.nr_reconnects; + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return -ENODEV; + /* * Create the admin queue */ @@ -2682,6 +2684,10 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) ctrl->cnum, status); if (nvmf_should_reconnect(&ctrl->ctrl)) { + /* Only schedule the reconnect if the remote port is online */ + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return; + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); @@ -2715,12 +2721,15 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) return; } - ret = nvme_fc_create_association(ctrl); - if (ret) - nvme_fc_reconnect_or_delete(ctrl, ret); - else - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { + ret = nvme_fc_create_association(ctrl); + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller reset complete\n", + ctrl->cnum); + } } static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { -- cgit v1.2.3 From 3cec7f9de448131778a1428100621fd371db75f4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:13 -0700 Subject: nvme: allow controller RESETTING to RECONNECTING transition Transport will typically transition from LIVE to RESETTING when initially performing a reset or recovering from an error. Adding this transition allows a transport to transition to RECONNECTING when it checks/waits for connectivity then creates new transport connections and reinits the controller. 
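In nvme_change_ctrl_state() this amounts to accepting one more old state when the requested new state is RECONNECTING; the resulting switch arm reads roughly as follows (sketch, see the hunk below):

	case NVME_CTRL_RECONNECTING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:	/* newly allowed transition */
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;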
Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 003314eb6341a..07b9f4e1c283d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -241,6 +241,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_RECONNECTING: switch (old_state) { case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: changed = true; /* FALLTHRU */ default: -- cgit v1.2.3 From 2b632970da4f288e6dbdc826c34fbf74f40ec94c Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:17 -0700 Subject: nvme-fc: add dev_loss_tmo timeout and remoteport resume support When a remoteport is unregistered (connectivity lost), the following actions are taken: - the remoteport is marked DELETED - the time when dev_loss_tmo would expire is set in the remoteport - all controllers on the remoteport are reset. After a controller resets, it will stall in a RECONNECTING state waiting for one of the following: - the controller will continue to attempt reconnect per max_retries and reconnect_delay. As there is no remoteport connectivity, the reconnect attempt will immediately fail. If max reconnects has not been reached, a new reconnect_delay timer will be scheduled. If the current time plus another reconnect_delay exceeds when dev_loss_tmo expires on the remote port, then the reconnect_delay will be shortened to schedule no later than when dev_loss_tmo expires. If max reconnect attempts are reached (e.g. ctrl_loss_tmo reached) or dev_loss_tmo is exceeded without connectivity, the controller is deleted. - the remoteport is re-registered prior to dev_loss_tmo expiring. The resume of the remoteport will immediately attempt to reconnect each of its suspended controllers. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke [hch: updated to use nvme_delete_ctrl] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 278 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 239 insertions(+), 39 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b64cc10024520..113c30be7276b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -138,6 +138,7 @@ struct nvme_fc_rport { struct nvme_fc_lport *lport; spinlock_t lock; struct kref ref; + unsigned long dev_loss_end; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ enum nvme_fcctrl_flags { @@ -528,6 +529,102 @@ nvme_fc_rport_get(struct nvme_fc_rport *rport) return kref_get_unless_zero(&rport->ref); } +static void +nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl) +{ + switch (ctrl->ctrl.state) { + case NVME_CTRL_NEW: + case NVME_CTRL_RECONNECTING: + /* + * As all reconnects were suppressed, schedule a + * connect. + */ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: connectivity re-established. " + "Attempting reconnect\n", ctrl->cnum); + + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0); + break; + + case NVME_CTRL_RESETTING: + /* + * Controller is already in the process of terminating the + * association. No need to do anything further. The reconnect + * step will naturally occur after the reset completes.
+ */ + break; + + default: + /* no action to take - let it delete */ + break; + } +} + +static struct nvme_fc_rport * +nvme_fc_attach_to_suspended_rport(struct nvme_fc_lport *lport, + struct nvme_fc_port_info *pinfo) +{ + struct nvme_fc_rport *rport; + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + list_for_each_entry(rport, &lport->endp_list, endp_list) { + if (rport->remoteport.node_name != pinfo->node_name || + rport->remoteport.port_name != pinfo->port_name) + continue; + + if (!nvme_fc_rport_get(rport)) { + rport = ERR_PTR(-ENOLCK); + goto out_done; + } + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + spin_lock_irqsave(&rport->lock, flags); + + /* has it been unregistered */ + if (rport->remoteport.port_state != FC_OBJSTATE_DELETED) { + /* means lldd called us twice */ + spin_unlock_irqrestore(&rport->lock, flags); + nvme_fc_rport_put(rport); + return ERR_PTR(-ESTALE); + } + + rport->remoteport.port_state = FC_OBJSTATE_ONLINE; + rport->dev_loss_end = 0; + + /* + * kick off a reconnect attempt on all associations to the + * remote port. A successful reconnects will resume i/o. + */ + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) + nvme_fc_resume_controller(ctrl); + + spin_unlock_irqrestore(&rport->lock, flags); + + return rport; + } + + rport = NULL; + +out_done: + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return rport; +} + +static inline void +__nvme_fc_set_dev_loss_tmo(struct nvme_fc_rport *rport, + struct nvme_fc_port_info *pinfo) +{ + if (pinfo->dev_loss_tmo) + rport->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; + else + rport->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; +} + /** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME @@ -554,22 +651,45 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, unsigned long flags; int ret, idx; + if (!nvme_fc_lport_get(lport)) { + ret = -ESHUTDOWN; + goto out_reghost_failed; + } + + /* + * look to see if there is already a remoteport that is waiting + * for a reconnect (within dev_loss_tmo) with the same WWN's. + * If so, transition to it and reconnect. 
+ */ + newrec = nvme_fc_attach_to_suspended_rport(lport, pinfo); + + /* found an rport, but something about its state is bad */ + if (IS_ERR(newrec)) { + ret = PTR_ERR(newrec); + goto out_lport_put; + + /* found existing rport, which was resumed */ + } else if (newrec) { + nvme_fc_lport_put(lport); + __nvme_fc_set_dev_loss_tmo(newrec, pinfo); + nvme_fc_signal_discovery_scan(lport, newrec); + *portptr = &newrec->remoteport; + return 0; + } + + /* nothing found - allocate a new remoteport struct */ + newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz), GFP_KERNEL); if (!newrec) { ret = -ENOMEM; - goto out_reghost_failed; - } - - if (!nvme_fc_lport_get(lport)) { - ret = -ESHUTDOWN; - goto out_kfree_rport; + goto out_lport_put; } idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; - goto out_lport_put; + goto out_kfree_rport; } INIT_LIST_HEAD(&newrec->endp_list); @@ -587,11 +707,7 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, newrec->remoteport.port_id = pinfo->port_id; newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; newrec->remoteport.port_num = idx; - /* a registration value of dev_loss_tmo=0 results in the default */ - if (pinfo->dev_loss_tmo) - newrec->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; - else - newrec->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; + __nvme_fc_set_dev_loss_tmo(newrec, pinfo); spin_lock_irqsave(&nvme_fc_lock, flags); list_add_tail(&newrec->endp_list, &lport->endp_list); @@ -602,10 +718,10 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, *portptr = &newrec->remoteport; return 0; -out_lport_put: - nvme_fc_lport_put(lport); out_kfree_rport: kfree(newrec); +out_lport_put: + nvme_fc_lport_put(lport); out_reghost_failed: *portptr = NULL; return ret; @@ -636,6 +752,58 @@ restart: return 0; } +static void +nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) +{ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller connectivity lost. Awaiting " + "Reconnect", ctrl->cnum); + + switch (ctrl->ctrl.state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + /* + * Schedule a controller reset. The reset will terminate the + * association and schedule the reconnect timer. Reconnects + * will be attempted until either the ctlr_loss_tmo + * (max_retries * connect_delay) expires or the remoteport's + * dev_loss_tmo expires. + */ + if (nvme_reset_ctrl(&ctrl->ctrl)) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: Couldn't schedule reset. " + "Deleting controller.\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } + break; + + case NVME_CTRL_RECONNECTING: + /* + * The association has already been terminated and the + * controller is attempting reconnects. No need to do anything + * futher. Reconnects will be attempted until either the + * ctlr_loss_tmo (max_retries * connect_delay) expires or the + * remoteport's dev_loss_tmo expires. + */ + break; + + case NVME_CTRL_RESETTING: + /* + * Controller is already in the process of terminating the + * association. No need to do anything further. The reconnect + * step will kick in naturally after the association is + * terminated. 
+ */ + break; + + case NVME_CTRL_DELETING: + default: + /* no action to take - let it delete */ + break; + } +} + /** * nvme_fc_unregister_remoteport - transport entry point called by an * LLDD to deregister/remove a previously @@ -665,15 +833,31 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) } portptr->port_state = FC_OBJSTATE_DELETED; - /* tear down all associations to the remote port */ - list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) - nvme_delete_ctrl(&ctrl->ctrl); + rport->dev_loss_end = jiffies + (portptr->dev_loss_tmo * HZ); + + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + /* if dev_loss_tmo==0, dev loss is immediate */ + if (!portptr->dev_loss_tmo) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: controller connectivity lost. " + "Deleting controller.\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } else + nvme_fc_ctrl_connectivity_loss(ctrl); + } spin_unlock_irqrestore(&rport->lock, flags); nvme_fc_abort_lsops(rport); + /* + * release the reference, which will allow, if all controllers + * go away, which should only occur after dev_loss_tmo occurs, + * for the rport to be torn down. + */ nvme_fc_rport_put(rport); + return 0; } EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); @@ -700,7 +884,6 @@ nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr, u32 dev_loss_tmo) { struct nvme_fc_rport *rport = remoteport_to_rport(portptr); - struct nvme_fc_ctrl *ctrl; unsigned long flags; spin_lock_irqsave(&rport->lock, flags); @@ -2676,28 +2859,43 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) static void nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) { + struct nvme_fc_rport *rport = ctrl->rport; + struct nvme_fc_remote_port *portptr = &rport->remoteport; + unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ; + bool recon = true; + if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) return; - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", - ctrl->cnum, status); + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", + ctrl->cnum, status); + else if (time_after_eq(jiffies, rport->dev_loss_end)) + recon = false; - if (nvmf_should_reconnect(&ctrl->ctrl)) { - /* Only schedule the reconnect if the remote port is online */ - if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) - return; + if (recon && nvmf_should_reconnect(&ctrl->ctrl)) { + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: Reconnect attempt in %ld " + "seconds\n", + ctrl->cnum, recon_delay / HZ); + else if (time_after(jiffies + recon_delay, rport->dev_loss_end)) + recon_delay = rport->dev_loss_end - jiffies; - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_wq, &ctrl->connect_work, - ctrl->ctrl.opts->reconnect_delay * HZ); + queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay); } else { - dev_warn(ctrl->ctrl.device, + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.nr_reconnects); + else + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: dev_loss_tmo (%d) expired " + "while waiting for remoteport connectivity. 
" + "Removing controller\n", ctrl->cnum, + portptr->dev_loss_tmo); WARN_ON(nvme_delete_ctrl(&ctrl->ctrl)); } } @@ -2721,15 +2919,17 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) return; } - if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) ret = nvme_fc_create_association(ctrl); - if (ret) - nvme_fc_reconnect_or_delete(ctrl, ret); - else - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: controller reset complete\n", - ctrl->cnum); - } + else + ret = -ENOTCONN; + + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller reset complete\n", + ctrl->cnum); } static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { -- cgit v1.2.3 From a96d4bd867129e6124e3cd1006fcbcfea3341179 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 27 Oct 2017 13:12:49 -0700 Subject: nvmet: fix fatal_err_work deadlock Below is a stack trace for an issue that was reported. What's happening is that the nvmet layer had it's controller kato timeout fire, which causes it to schedule its fatal error handler via the fatal_err_work element. The error handler is invoked, which calls the transport delete_ctrl() entry point, and as the transport tears down the controller, nvmet_sq_destroy ends up doing the final put on the ctlr causing it to enter its free routine. The ctlr free routine does a cancel_work_sync() on fatal_err_work element, which then does a flush_work and wait_for_completion. But, as the wait is in the context of the work element being flushed, its in a catch-22 and the thread hangs. [ 326.903131] nvmet: ctrl 1 keep-alive timer (15 seconds) expired! [ 326.909832] nvmet: ctrl 1 fatal error occurred! [ 327.643100] lpfc 0000:04:00.0: 0:6313 NVMET Defer ctx release xri x114 flg x2 [ 494.582064] INFO: task kworker/0:2:243 blocked for more than 120 seconds. [ 494.589638] Not tainted 4.14.0-rc1.James+ #1 [ 494.594986] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 494.603718] kworker/0:2 D 0 243 2 0x80000000 [ 494.609839] Workqueue: events nvmet_fatal_error_handler [nvmet] [ 494.616447] Call Trace: [ 494.619177] __schedule+0x28d/0x890 [ 494.623070] schedule+0x36/0x80 [ 494.626571] schedule_timeout+0x1dd/0x300 [ 494.631044] ? dequeue_task_fair+0x592/0x840 [ 494.635810] ? pick_next_task_fair+0x23b/0x5c0 [ 494.640756] wait_for_completion+0x121/0x180 [ 494.645521] ? wake_up_q+0x80/0x80 [ 494.649315] flush_work+0x11d/0x1a0 [ 494.653206] ? wake_up_worker+0x30/0x30 [ 494.657484] __cancel_work_timer+0x10b/0x190 [ 494.662249] cancel_work_sync+0x10/0x20 [ 494.666525] nvmet_ctrl_put+0xa3/0x100 [nvmet] [ 494.671482] nvmet_sq_:q+0x64/0xd0 [nvmet] [ 494.676540] nvmet_fc_delete_target_queue+0x202/0x220 [nvmet_fc] [ 494.683245] nvmet_fc_delete_target_assoc+0x6d/0xc0 [nvmet_fc] [ 494.689743] nvmet_fc_delete_ctrl+0x137/0x1a0 [nvmet_fc] [ 494.695673] nvmet_fatal_error_handler+0x30/0x40 [nvmet] [ 494.701589] process_one_work+0x149/0x360 [ 494.706064] worker_thread+0x4d/0x3c0 [ 494.710148] kthread+0x109/0x140 [ 494.713751] ? rescuer_thread+0x380/0x380 [ 494.718214] ? kthread_park+0x60/0x60 Correct by having the fc transport convert to a different workq context for the actual controller teardown which may call the cancel_work_sync. 
Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 58e010bdda3ea..5f2570f5dfa90 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -150,6 +150,7 @@ struct nvmet_fc_tgt_assoc { struct list_head a_list; struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; struct kref ref; + struct work_struct del_work; }; @@ -232,6 +233,7 @@ static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport); static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_fcp_iod *fod); +static void nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc); /* *********************** FC-NVME DMA Handling **************************** */ @@ -802,6 +804,16 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport, return NULL; } +static void +nvmet_fc_delete_assoc(struct work_struct *work) +{ + struct nvmet_fc_tgt_assoc *assoc = + container_of(work, struct nvmet_fc_tgt_assoc, del_work); + + nvmet_fc_delete_target_assoc(assoc); + nvmet_fc_tgt_a_put(assoc); +} + static struct nvmet_fc_tgt_assoc * nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) { @@ -826,6 +838,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) assoc->a_id = idx; INIT_LIST_HEAD(&assoc->a_list); kref_init(&assoc->ref); + INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc); while (needrandom) { get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID); @@ -1118,8 +1131,7 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) nvmet_fc_tgtport_put(tgtport); if (found_ctrl) { - nvmet_fc_delete_target_assoc(assoc); - nvmet_fc_tgt_a_put(assoc); + schedule_work(&assoc->del_work); return; } -- cgit v1.2.3 From 3639efef8fb128f99ae38bf603189639efc3850d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 20 Oct 2017 16:18:03 -0600 Subject: nvme: Remove unused headers Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7735571ffc9ab..32c413ec818c9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -13,7 +13,6 @@ */ #include -#include #include #include #include @@ -26,12 +25,9 @@ #include #include #include -#include #include -#include #include #include -#include #include #include "nvme.h" -- cgit v1.2.3 From c091fbe9a26ee116d99e2c0ed010afb957a10365 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Nov 2017 12:19:32 +0100 Subject: block: fix CDROM dependency on BLK_DEV After the cdrom cleanup, I get randconfig warnings for some configurations: warning: (BLK_DEV_IDECD && BLK_DEV_SR) selects CDROM which has unmet direct dependencies (BLK_DEV) This adds an explicit BLK_DEV dependency for both drivers. The other drivers that select 'CDROM' already have this and don't need a change. 
Fixes: 2a750166a5be ("block: Rework drivers/cdrom/Makefile") Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/ide/Kconfig | 1 + drivers/scsi/Kconfig | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 7b92c591e4c11..cf1fb3fb5d26f 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -117,6 +117,7 @@ config BLK_DEV_DELKIN config BLK_DEV_IDECD tristate "Include IDE/ATAPI CDROM support" + depends on BLK_DEV select IDE_ATAPI select CDROM ---help--- diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index cca0b03c2eadf..7669553180058 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -130,7 +130,7 @@ config CHR_DEV_OSST config BLK_DEV_SR tristate "SCSI CDROM support" - depends on SCSI + depends on SCSI && BLK_DEV select CDROM ---help--- If you want to use a CD or DVD drive attached to your computer -- cgit v1.2.3 From 474f5da2354e9fd5376b968e4c5a0f1807dc19e8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Nov 2017 12:42:00 +0100 Subject: skd: use ktime_get_real_seconds() Like many storage drivers, skd uses an unsigned 32-bit number for interchanging the current time with the firmware. This will overflow in y2106 and is otherwise safe. However, the get_seconds() function is generally considered deprecated since the behavior is different between 32-bit and 64-bit architectures, and using it may indicate a bigger problem. To annotate that we've thought about this, let's add a comment here and migrate to the ktime_get_real_seconds() function that consistently returns a 64-bit number. Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 7cedb4295e9d3..802ab9f7a8c11 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -1967,7 +1967,8 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev) break; case FIT_MTD_CMD_LOG_HOST_ID: - skdev->connect_time_stamp = get_seconds(); + /* hardware interface overflows in y2106 */ + skdev->connect_time_stamp = (u32)ktime_get_real_seconds(); data = skdev->connect_time_stamp & 0xFFFF; mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data); SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); -- cgit v1.2.3 From a806c6c81e6c0d07c8a8b2643bad4a37a196d681 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Thu, 2 Nov 2017 18:07:44 +0900 Subject: nvme: comment typo fixed in clearing AER a tiny typo fixed in a comment. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 07b9f4e1c283d..64744355aa88e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2637,7 +2637,7 @@ static void nvme_fw_act_work(struct work_struct *work) return; nvme_start_queues(ctrl); - /* read FW slot informationi to clear the AER*/ + /* read FW slot information to clear the AER */ nvme_get_fw_slot_info(ctrl); } -- cgit v1.2.3 From 8977f563845bdd61fc61c4dfb399270b7d5667c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:48 +0300 Subject: block: move REQ_NOWAIT This flag should be before the operation-specific REQ_NOUNMAP bit. 
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3385c89f402e2..c4c6c8bced2e1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -224,11 +224,11 @@ enum req_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ + __REQ_NOWAIT, /* Don't wait if request will block */ /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ - __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_NR_BITS, /* stops here */ }; @@ -245,9 +245,9 @@ enum req_flag_bits { #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) +#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) -#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From 96222bcc732d0504363dc772637c50e53b4bd41e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:49 +0300 Subject: block: add REQ_DRV bit Set aside a bit in the request/bio flags for driver use. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c4c6c8bced2e1..1b04085255fb3 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -229,6 +229,9 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ + /* for driver use */ + __REQ_DRV, + __REQ_NR_BITS, /* stops here */ }; @@ -249,6 +252,8 @@ enum req_flag_bits { #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) +#define REQ_DRV (1ULL << __REQ_DRV) + #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From f421e1d9ade4e1b88183e54425cf50e390d16a7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:50 +0300 Subject: block: provide a direct_make_request helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper allows reinserting a bio into a new queue without much overhead, but requires all queue limits to be the same for the upper and lower queues, and it does not provide any recursion preventions. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 35 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index bb4fce694a60c..96aaa3a419a37 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2243,6 +2243,40 @@ out: } EXPORT_SYMBOL(generic_make_request); +/** + * direct_make_request - hand a buffer directly to its device driver for I/O + * @bio: The bio describing the location in memory and on the device. + * + * This function behaves like generic_make_request(), but does not protect + * against recursion. 
Must only be used if the called driver is known + * to not call generic_make_request (or direct_make_request) again from + * its make_request function. (Calling direct_make_request again from + * a workqueue is perfectly fine as that doesn't recurse). + */ +blk_qc_t direct_make_request(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + bool nowait = bio->bi_opf & REQ_NOWAIT; + blk_qc_t ret; + + if (!generic_make_request_checks(bio)) + return BLK_QC_T_NONE; + + if (unlikely(blk_queue_enter(q, nowait))) { + if (nowait && !blk_queue_dying(q)) + bio->bi_status = BLK_STS_AGAIN; + else + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + + ret = q->make_request_fn(q, bio); + blk_queue_exit(q); + return ret; +} +EXPORT_SYMBOL_GPL(direct_make_request); + /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72637028f3c9f..eda3d25c0f689 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -920,6 +920,7 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); +extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); -- cgit v1.2.3 From ef71de8b15d891b27b8c983a9a8972b11cb4576a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:51 +0300 Subject: block: add a blk_steal_bios helper This helper allows stealing the uncompleted bios from a request so that they can be reissued on another path. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 21 +++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 23 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 96aaa3a419a37..68cfe6780a9b7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2769,6 +2769,27 @@ struct request *blk_fetch_request(struct request_queue *q) } EXPORT_SYMBOL(blk_fetch_request); +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. + */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); + /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eda3d25c0f689..fddda6a1f9b58 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1094,6 +1094,8 @@ extern struct request *blk_peek_request(struct request_queue *q); extern void blk_start_request(struct request *rq); extern struct request *blk_fetch_request(struct request_queue *q); +void blk_steal_bios(struct bio_list *list, struct request *rq); + /* * Request completion related functions.
* -- cgit v1.2.3 From 517bf3c306bad4fe0da631f90ae2ec40924dee2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:52 +0300 Subject: block: don't look at the struct device dev_t in disk_devt The hidden gendisks introduced in the next patch need to keep the dev field in their struct device empty so that udev won't try to create block device nodes for them. To support that rewrite disk_devt to look at the major and first_minor fields in the gendisk itself instead of looking into the struct device. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 4 ---- include/linux/genhd.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index dd305c65ffb05..1174d24e405ee 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -649,10 +649,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) return; } disk_to_dev(disk)->devt = devt; - - /* ->major and ->first_minor aren't supposed to be - * dereferenced from here on, but set them just in case. - */ disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ea652bfcd6756..5c0ed5db33c2d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -234,7 +234,7 @@ static inline bool disk_part_scan_enabled(struct gendisk *disk) static inline dev_t disk_devt(struct gendisk *disk) { - return disk_to_dev(disk)->devt; + return MKDEV(disk->major, disk->first_minor); } static inline dev_t part_devt(struct hd_struct *part) -- cgit v1.2.3 From 8ddcd653257c18a669fcb75ee42c37054908e0d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:53 +0300 Subject: block: introduce GENHD_FL_HIDDEN With this flag a driver can create a gendisk that can be used for I/O submission inside the kernel, but which is not registered as user facing block device. This will be useful for the NVMe multipath implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 68 +++++++++++++++++++++++++++++++++++++-------------- include/linux/genhd.h | 1 + 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 1174d24e405ee..835e907e6e015 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,6 +585,11 @@ static void register_disk(struct device *parent, struct gendisk *disk) */ pm_runtime_set_memalloc_noio(ddev, true); + if (disk->flags & GENHD_FL_HIDDEN) { + dev_set_uevent_suppress(ddev, 0); + return; + } + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); @@ -616,6 +621,11 @@ exit: while ((part = disk_part_iter_next(&piter))) kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); disk_part_iter_exit(&piter); + + err = sysfs_create_link(&ddev->kobj, + &disk->queue->backing_dev_info->dev->kobj, + "bdi"); + WARN_ON(err); } /** @@ -630,7 +640,6 @@ exit: */ void device_add_disk(struct device *parent, struct gendisk *disk) { - struct backing_dev_info *bdi; dev_t devt; int retval; @@ -639,7 +648,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) * parameters make sense. 
*/ WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT)); + WARN_ON(!disk->minors && + !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); disk->flags |= GENHD_FL_UP; @@ -648,18 +658,26 @@ void device_add_disk(struct device *parent, struct gendisk *disk) WARN_ON(1); return; } - disk_to_dev(disk)->devt = devt; disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); disk_alloc_events(disk); - /* Register BDI before referencing it from bdev */ - bdi = disk->queue->backing_dev_info; - bdi_register_owner(bdi, disk_to_dev(disk)); - - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Don't let hidden disks show up in /proc/partitions, + * and don't bother scanning for partitions either. + */ + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_NO_PART_SCAN; + } else { + /* Register BDI before referencing it from bdev */ + disk_to_dev(disk)->devt = devt; + bdi_register_owner(disk->queue->backing_dev_info, + disk_to_dev(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, + exact_match, exact_lock, disk); + } register_disk(parent, disk); blk_register_queue(disk); @@ -669,10 +687,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk) */ WARN_ON_ONCE(!blk_get_queue(disk->queue)); - retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, - "bdi"); - WARN_ON(retval); - disk_add_events(disk); blk_integrity_add(disk); } @@ -701,7 +715,8 @@ void del_gendisk(struct gendisk *disk) set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; - sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + if (!(disk->flags & GENHD_FL_HIDDEN)) + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); if (disk->queue) { /* * Unregister bdi before releasing device numbers (as they can @@ -712,13 +727,15 @@ void del_gendisk(struct gendisk *disk) } else { WARN_ON(1); } - blk_unregister_region(disk_devt(disk), disk->minors); + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + blk_unregister_region(disk_devt(disk), disk->minors); + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + } part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; - - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -781,6 +798,10 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } + if (unlikely(disk->flags & GENHD_FL_HIDDEN)) { + put_disk(disk); + disk = NULL; + } return disk; } EXPORT_SYMBOL(get_gendisk); @@ -1024,6 +1045,15 @@ static ssize_t disk_removable_show(struct device *dev, (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } +static ssize_t disk_hidden_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_HIDDEN ? 
1 : 0)); +} + static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1061,6 +1091,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); +static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); @@ -1085,6 +1116,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, + &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5c0ed5db33c2d..93aae3476f58b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -140,6 +140,7 @@ struct hd_struct { #define GENHD_FL_NATIVE_CAPACITY 128 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 #define GENHD_FL_NO_PART_SCAN 512 +#define GENHD_FL_HIDDEN 1024 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ -- cgit v1.2.3 From ea435e1b9392a33deceaea2a16ebaa3397bead93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:54 +0300 Subject: block: add a poll_fn callback to struct request_queue That we we can also poll non blk-mq queues. Mostly needed for the NVMe multipath code, but could also be useful elsewhere. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 11 +++++++++++ block/blk-mq.c | 14 +++++--------- drivers/nvme/target/io-cmd.c | 2 +- fs/block_dev.c | 4 ++-- fs/direct-io.c | 2 +- fs/iomap.c | 2 +- include/linux/blkdev.h | 4 +++- mm/page_io.c | 2 +- 8 files changed, 25 insertions(+), 16 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 68cfe6780a9b7..395bfb10d6581 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2321,6 +2321,17 @@ blk_qc_t submit_bio(struct bio *bio) } EXPORT_SYMBOL(submit_bio); +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + if (!q->poll_fn || !blk_qc_t_valid(cookie)) + return false; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + return q->poll_fn(q, cookie); +} +EXPORT_SYMBOL_GPL(blk_poll); + /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for new the queue limits diff --git a/block/blk-mq.c b/block/blk-mq.c index e4d2490f4e7ee..95ea5889b8252 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -37,6 +37,7 @@ #include "blk-wbt.h" #include "blk-mq-sched.h" +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -2499,6 +2500,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); blk_queue_make_request(q, blk_mq_make_request); + if (q->mq_ops->poll) + q->poll_fn = blk_mq_poll; /* * Do this after blk_queue_make_request() overrides it... 
@@ -2961,20 +2964,14 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) return false; } -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_mq_hw_ctx *hctx; - struct blk_plug *plug; struct request *rq; - if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; - plug = current->plug; - if (plug) - blk_flush_plug_list(plug, false); - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; if (!blk_qc_t_is_internal(cookie)) rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); @@ -2992,7 +2989,6 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) return __blk_mq_poll(hctx, rq); } -EXPORT_SYMBOL_GPL(blk_mq_poll); static int __init blk_mq_init(void) { diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 0d4c23dc45324..db632818777dd 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -94,7 +94,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) cookie = submit_bio(bio); - blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie); + blk_poll(bdev_get_queue(req->ns->bdev), cookie); } static void nvmet_execute_flush(struct nvmet_req *req) diff --git a/fs/block_dev.c b/fs/block_dev.c index 07ddccd178017..4afa4d5ff969e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -237,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -402,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/direct-io.c b/fs/direct-io.c index 62cf812ed0e58..d2bc339cb1e98 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -486,7 +486,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/iomap.c b/fs/iomap.c index 8194d30bdca08..4241bac905b19 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1049,7 +1049,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || - !blk_mq_poll(dio->submit.last_queue, + !blk_poll(dio->submit.last_queue, dio->submit.cookie)) io_schedule(); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fddda6a1f9b58..225617dd0a3f4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -266,6 +266,7 @@ struct blk_queue_ctx; typedef void (request_fn_proc) (struct request_queue *q); typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); +typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); @@ -408,6 +409,7 @@ struct request_queue { request_fn_proc *request_fn; 
make_request_fn *make_request_fn; + poll_q_fn *poll_fn; prep_rq_fn *prep_rq_fn; unprep_rq_fn *unprep_rq_fn; softirq_done_fn *softirq_done_fn; @@ -975,7 +977,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { diff --git a/mm/page_io.c b/mm/page_io.c index 21502d341a67c..ff04de630c465 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -407,7 +407,7 @@ int swap_readpage(struct page *page, bool do_poll) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_mq_poll(disk->queue, qc)) + if (!blk_poll(disk->queue, qc)) break; } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From a116895fc7b61de4f26548c9b635a06515ec7563 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Nov 2017 11:00:03 -0600 Subject: cdrom: hide CONFIG_CDROM menu selection We don't need to expose this. The point is that drivers select the uniform CDROM layer, if they need it, the user should not have to make a conscious decision on whether to include this separately or not. Fixes: 2a750166a5be ("block: Rework drivers/cdrom/Makefile") Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index b998b3c9ea860..95e678716b5e9 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -68,14 +68,7 @@ config AMIGA_Z2RAM module will be called z2ram. config CDROM - tristate "CD-ROM driver" - help - A CD-ROM is a pre-pressed optical compact disc which contains - data. The name is an acronym which stands for "Compact Disc - Read-Only Memory". This driver implements functionality that is - shared by all CD-ROM drivers, e.g. the IDE CD-ROM driver and the - SCSI CD-ROM driver. For a list of the ioctls implemented by this - driver, see also Documentation/ioctl/cdrom.txt. + tristate config GDROM tristate "SEGA Dreamcast GD-ROM drive" -- cgit v1.2.3 From c2e82a234873d905260a3bd0aca9577e85619bda Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Fri, 22 Sep 2017 23:36:28 +0800 Subject: blk-mq: fix nr_requests wrong value when modify it from sysfs if blk-mq use "none" io scheduler, nr_request get a wrong value when input a number > tag_set->queue_depth. blk_mq_tag_update_depth will get the smaller one min(nr, set->queue_depth), and then q->nr_request get a wrong value. Reproduce: echo none > /sys/block/nvme0n1/queue/scheduler echo 1000000 > /sys/block/nvme0n1/queue/nr_requests cat /sys/block/nvme0n1/queue/nr_requests 1000000 Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 95ea5889b8252..9eea67ce82d90 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2743,8 +2743,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * queue depth. This is similar to what the old code would do. 
*/ if (!hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, - min(nr, set->queue_depth), + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, -- cgit v1.2.3 From 21e768b442bb587123ea924620e74d6e5655d717 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 16 Oct 2017 16:32:26 -0700 Subject: blk-mq: Make blk_mq_get_request() error path less confusing blk_mq_get_tag() can modify data->ctx. This means that in the error path of blk_mq_get_request() data->ctx should be passed to blk_mq_put_ctx() instead of local_ctx. Note: since blk_mq_put_ctx() ignores its argument, this patch does not change any functionality. References: commit 1ad43c0078b7 ("blk-mq: don't leak preempt counter/q_usage_counter when allocating rq failed") Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Bart Van Assche Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9eea67ce82d90..13cdccef543ce 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -337,12 +337,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; - struct blk_mq_ctx *local_ctx = NULL; + bool put_ctx_on_error = false; blk_queue_enter_live(q); data->q = q; - if (likely(!data->ctx)) - data->ctx = local_ctx = blk_mq_get_ctx(q); + if (likely(!data->ctx)) { + data->ctx = blk_mq_get_ctx(q); + put_ctx_on_error = true; + } if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->ctx->cpu); if (op & REQ_NOWAIT) @@ -361,8 +363,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, tag = blk_mq_get_tag(data); if (tag == BLK_MQ_TAG_FAIL) { - if (local_ctx) { - blk_mq_put_ctx(local_ctx); + if (put_ctx_on_error) { + blk_mq_put_ctx(data->ctx); data->ctx = NULL; } blk_queue_exit(q); -- cgit v1.2.3 From e4f36b249b4d4e7599f1cf0c8fb50f196e52677e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Oct 2017 16:45:23 +0200 Subject: block: fix peeking requests during PM We need to look for an active PM request until the next softbarrier instead of looking for the first non-PM request. Otherwise any cause of request reordering might starve the PM request(s). 
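Illustrative sketch (not part of the patch): a minimal user-space model of the new lookup order. Type and function names are invented for the example; the real code walks q->queue_head and checks RQF_PM / RQF_SOFTBARRIER on struct request. The walk returns the first request the PM state allows and stops at a softbarrier, so a PM request queued behind normal requests is still found while resuming.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    enum rpm_status { RPM_ACTIVE, RPM_RESUMING, RPM_SUSPENDING, RPM_SUSPENDED };

    struct rq {
        bool pm;           /* models RQF_PM */
        bool softbarrier;  /* models RQF_SOFTBARRIER */
        struct rq *next;
    };

    static bool pm_allow(enum rpm_status st, const struct rq *rq)
    {
        switch (st) {
        case RPM_RESUMING:
        case RPM_SUSPENDING:
            return rq->pm;      /* only PM requests may run */
        case RPM_SUSPENDED:
            return false;       /* nothing runs while suspended */
        default:
            return true;        /* RPM_ACTIVE: everything runs */
        }
    }

    /* Walk up to the first softbarrier instead of only peeking at the head. */
    static struct rq *peek(enum rpm_status st, struct rq *head)
    {
        for (struct rq *rq = head; rq; rq = rq->next) {
            if (pm_allow(st, rq))
                return rq;
            if (rq->softbarrier)
                break;
        }
        return NULL;
    }

    int main(void)
    {
        struct rq pm_rq = { .pm = true };
        struct rq normal = { .next = &pm_rq };

        /* While resuming, the PM request behind a normal one is still found. */
        printf("%s\n", peek(RPM_RESUMING, &normal) == &pm_rq ? "PM rq found" : "starved");
        return 0;
    }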
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 395bfb10d6581..223f32826e62d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2511,20 +2511,22 @@ void blk_account_io_done(struct request *req) * Don't process normal requests when queue is suspended * or in the process of suspending/resuming */ -static struct request *blk_pm_peek_request(struct request_queue *q, - struct request *rq) +static bool blk_pm_allow_request(struct request *rq) { - if (q->dev && (q->rpm_status == RPM_SUSPENDED || - (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) - return NULL; - else - return rq; + switch (rq->q->rpm_status) { + case RPM_RESUMING: + case RPM_SUSPENDING: + return rq->rq_flags & RQF_PM; + case RPM_SUSPENDED: + return false; + } + + return true; } #else -static inline struct request *blk_pm_peek_request(struct request_queue *q, - struct request *rq) +static bool blk_pm_allow_request(struct request *rq) { - return rq; + return true; } #endif @@ -2572,9 +2574,12 @@ static struct request *elv_next_request(struct request_queue *q) WARN_ON_ONCE(q->mq_ops); while (1) { - if (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - return rq; + list_for_each_entry(rq, &q->queue_head, queuelist) { + if (blk_pm_allow_request(rq)) + return rq; + + if (rq->rq_flags & RQF_SOFTBARRIER) + break; } /* @@ -2625,10 +2630,6 @@ struct request *blk_peek_request(struct request_queue *q) WARN_ON_ONCE(q->mq_ops); while ((rq = elv_next_request(q)) != NULL) { - rq = blk_pm_peek_request(q, rq); - if (!rq) - break; - if (!(rq->rq_flags & RQF_STARTED)) { /* * This is the first time the device driver -- cgit v1.2.3 From 826a70a08b1210bbfdbda812ab43eb986e25b5c2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 4 Nov 2017 09:55:34 +0800 Subject: SCSI: don't get target/host busy_count in scsi_mq_get_budget() It is very expensive to atomic_inc/atomic_dec the host wide counter of host->busy_count, and it should have been avoided via blk-mq's mechanism of getting driver tag, which uses the more efficient way of sbitmap queue. Also we don't check atomic_read(&sdev->device_busy) in scsi_mq_get_budget() and don't run queue if the counter becomes zero, so IO hang may be caused if all requests are completed just before the current SCSI device is added to shost->starved_list. 
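Illustrative sketch (not part of the patch): the reshuffled path keeps only the per-device count in the budget callback and takes the target/host counts from ->queue_rq(), releasing whatever was actually taken, in reverse order, on failure. A toy model with plain integers standing in for the SCSI busy counters:

    #include <stdbool.h>
    #include <stdio.h>

    static int device_busy, target_busy, host_busy;

    static bool take(int *busy, int limit)
    {
        if (*busy >= limit)
            return false;
        (*busy)++;
        return true;
    }

    /* get_budget(): device-level check only, no host-wide atomics. */
    static bool get_budget(int dev_limit)
    {
        return take(&device_busy, dev_limit);
    }

    /* queue_rq(): target and host checks, unwound in reverse on failure. */
    static bool queue_rq(int tgt_limit, int host_limit)
    {
        if (!take(&target_busy, tgt_limit))
            goto out_put_budget;
        if (!take(&host_busy, host_limit))
            goto out_dec_target;
        return true;

    out_dec_target:
        target_busy--;
    out_put_budget:
        device_busy--;      /* put_budget() */
        return false;
    }

    int main(void)
    {
        if (get_budget(1) && !queue_rq(1, 0))   /* host full: dispatch fails */
            printf("unwound: %d %d %d\n", device_busy, target_busy, host_busy);
        return 0;
    }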
Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Reported-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 6f10afaca25b2..22a7e4c47207a 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1950,11 +1950,7 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - struct Scsi_Host *shost = sdev->host; - atomic_dec(&shost->host_busy); - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); atomic_dec(&sdev->device_busy); put_device(&sdev->sdev_gendev); } @@ -1963,7 +1959,6 @@ static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - struct Scsi_Host *shost = sdev->host; blk_status_t ret; ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); @@ -1974,18 +1969,9 @@ static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) goto out; if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; - if (!scsi_target_queue_ready(shost, sdev)) - goto out_dec_device_busy; - if (!scsi_host_queue_ready(q, shost, sdev)) - goto out_dec_target_busy; return BLK_STS_OK; -out_dec_target_busy: - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); -out_dec_device_busy: - atomic_dec(&sdev->device_busy); out_put_device: put_device(&sdev->sdev_gendev); out: @@ -1998,6 +1984,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct request_queue *q = req->q; struct scsi_device *sdev = q->queuedata; + struct Scsi_Host *shost = sdev->host; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); blk_status_t ret; int reason; @@ -2007,10 +1994,15 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_put_budget; ret = BLK_STS_RESOURCE; + if (!scsi_target_queue_ready(shost, sdev)) + goto out_put_budget; + if (!scsi_host_queue_ready(q, shost, sdev)) + goto out_dec_target_busy; + if (!(req->rq_flags & RQF_DONTPREP)) { ret = prep_to_mq(scsi_mq_prep_fn(req)); if (ret != BLK_STS_OK) - goto out_put_budget; + goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; } else { blk_mq_start_request(req); @@ -2028,11 +2020,16 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (reason) { scsi_set_blocked(cmd, reason); ret = BLK_STS_RESOURCE; - goto out_put_budget; + goto out_dec_host_busy; } return BLK_STS_OK; +out_dec_host_busy: + atomic_dec(&shost->host_busy); +out_dec_target_busy: + if (scsi_target(sdev)->can_queue > 0) + atomic_dec(&scsi_target(sdev)->target_busy); out_put_budget: scsi_mq_put_budget(hctx); switch (ret) { -- cgit v1.2.3 From 88022d7201e96b43f1754b0358fc6bcd8dbdcde1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 5 Nov 2017 02:21:12 +0800 Subject: blk-mq: don't handle failure in .get_budget It is enough to just check if we can get the budget via .get_budget(). And we don't need to deal with device state change in .get_budget(). For SCSI, one issue to be fixed is that we have to call scsi_mq_uninit_cmd() to free allocated ressources if SCSI device fails to handle the request. And it isn't enough to simply call blk_mq_end_request() to do that if this request is marked as RQF_DONTPREP. 
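Illustrative sketch (not part of the patch): after this change the budget callback has a plain yes/no contract, so the dispatcher no longer has to map a status code onto "stop dispatching" versus "fail the request". Hypothetical names, modelled in user space:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for struct blk_mq_ops: only the budget hooks. */
    struct ops {
        bool (*get_budget)(void *hctx);
        void (*put_budget)(void *hctx);
    };

    /* Mirrors the new blk_mq_get_dispatch_budget(): true when no callback
     * is provided, otherwise whatever the driver answers. */
    static bool get_dispatch_budget(const struct ops *ops, void *hctx)
    {
        return ops->get_budget ? ops->get_budget(hctx) : true;
    }

    static bool busy(void *hctx) { (void)hctx; return false; }

    int main(void)
    {
        struct ops none = { 0 };
        struct ops scsi_like = { .get_budget = busy };

        printf("no callback: %d\n", get_dispatch_budget(&none, NULL));
        printf("device busy: %d\n", get_dispatch_budget(&scsi_like, NULL));
        return 0;
    }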
Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 14 ++------------ block/blk-mq.c | 17 ++++------------- block/blk-mq.h | 5 ++--- drivers/scsi/scsi_lib.c | 11 +++-------- include/linux/blk-mq.h | 2 +- 5 files changed, 12 insertions(+), 37 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 7775f6b12fa94..13a27d4d1671d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -94,23 +94,18 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - blk_status_t ret; if (e->type->ops.mq.has_work && !e->type->ops.mq.has_work(hctx)) break; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; - } else if (ret != BLK_STS_OK) { - blk_mq_end_request(rq, ret); - continue; } /* @@ -146,22 +141,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - blk_status_t ret; if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) + if (!blk_mq_get_dispatch_budget(hctx)) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; - } else if (ret != BLK_STS_OK) { - blk_mq_end_request(rq, ret); - continue; } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 13cdccef543ce..c9fa4b294664a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1137,13 +1137,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } } - if (!got_budget) { - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) - break; - if (ret != BLK_STS_OK) - goto fail_rq; - } + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + break; list_del_init(&rq->queuelist); @@ -1170,7 +1165,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, break; } - fail_rq: if (unlikely(ret != BLK_STS_OK)) { errors++; blk_mq_end_request(rq, BLK_STS_IOERR); @@ -1642,12 +1636,10 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_driver_tag(rq, NULL, false)) goto insert; - ret = blk_mq_get_dispatch_budget(hctx); - if (ret == BLK_STS_RESOURCE) { + if (!blk_mq_get_dispatch_budget(hctx)) { blk_mq_put_driver_tag(rq); goto insert; - } else if (ret != BLK_STS_OK) - goto fail_rq; + } new_cookie = request_to_qc_t(hctx, rq); @@ -1665,7 +1657,6 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, __blk_mq_requeue_request(rq); goto insert; default: - fail_rq: *cookie = BLK_QC_T_NONE; blk_mq_end_request(rq, ret); return; diff --git a/block/blk-mq.h b/block/blk-mq.h index 522b420dedc07..f97aceff76e91 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -147,14 +147,13 @@ static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) q->mq_ops->put_budget(hctx); } -static inline blk_status_t blk_mq_get_dispatch_budget( - struct blk_mq_hw_ctx *hctx) +static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; if (q->mq_ops->get_budget) return q->mq_ops->get_budget(hctx); - return BLK_STS_OK; + return true; } #endif diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 22a7e4c47207a..286ea983c9e35 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1955,27 +1955,22 @@ static void 
scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) put_device(&sdev->sdev_gendev); } -static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - blk_status_t ret; - - ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); - if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK) - return ret; if (!get_device(&sdev->sdev_gendev)) goto out; if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; - return BLK_STS_OK; + return true; out_put_device: put_device(&sdev->sdev_gendev); out: - return BLK_STS_RESOURCE; + return false; } static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f2e3079eecdd8..674641527da79 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -92,7 +92,7 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); -typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); -- cgit v1.2.3 From e84010732225c4c7c3464ee1169d395751c3adfa Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Tue, 17 Oct 2017 23:56:21 +0800 Subject: blkcg: add sanity check for blkcg policy operations blkcg policy should keep cpd/pd's alloc_fn and free_fn in pairs, otherwise policy would register fail. Reviewed-by: Johannes Thumshirn Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e7ec676043b1c..4117524ca45bc 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1419,6 +1419,11 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (i >= BLKCG_MAX_POLS) goto err_unlock; + /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + goto err_unlock; + /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; -- cgit v1.2.3 From 6d6f167ce74158903e7fc20dfbecf89c71aa1c00 Mon Sep 17 00:00:00 2001 From: Jianchao Wang Date: Thu, 2 Nov 2017 23:24:32 +0800 Subject: blk-mq: put the driver tag of nxt rq before first one is requeued When freeing the driver tag of the next rq with an I/O scheduler configured, we get the first entry of the list. However, this can race with requeue of a request, and we end up getting the wrong request from the head of the list. Free the driver tag of next rq before the failed one is requeued in the failure branch of queue_rq callback. 
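Illustrative sketch (not part of the patch): a simplified, serialized toy model of why the ordering matters. Requeueing puts the failed request back at the head of the dispatch list, so the "free the next request's borrowed tag" step has to look at the head before that requeue happens; names below are invented.

    #include <stdio.h>
    #include <string.h>

    #define MAX 4
    static const char *list[MAX];
    static int count;

    static void add_head(const char *rq)
    {
        memmove(&list[1], &list[0], count * sizeof(list[0]));
        list[0] = rq;
        count++;
    }

    static const char *first_entry(void)
    {
        return count ? list[0] : "(empty)";
    }

    int main(void)
    {
        add_head("nxt");   /* next request, already holds a driver tag */

        /* Old order: requeue the failed request first ... */
        add_head("cur");
        /* ... then free "the first entry's" tag: that is now cur, not nxt. */
        printf("old order frees tag of %s (wrong)\n", first_entry());

        /* New order: note nxt while it is still the head, free its tag,
         * and only then requeue cur. */
        printf("new order frees tag of nxt (correct)\n");
        return 0;
    }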
Signed-off-by: Jianchao Wang Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c9fa4b294664a..6eacc1dea8b74 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1094,7 +1094,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool got_budget) { struct blk_mq_hw_ctx *hctx; - struct request *rq; + struct request *rq, *nxt; int errors, queued; if (list_empty(list)) @@ -1151,14 +1151,20 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (list_empty(list)) bd.last = true; else { - struct request *nxt; - nxt = list_first_entry(list, struct request, queuelist); bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); } ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_STS_RESOURCE) { + /* + * If an I/O scheduler has been configured and we got a + * driver tag for the next request already, free it again. + */ + if (!list_empty(list)) { + nxt = list_first_entry(list, struct request, queuelist); + blk_mq_put_driver_tag(nxt); + } blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); @@ -1181,13 +1187,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * that is where we will continue on next queue run. */ if (!list_empty(list)) { - /* - * If an I/O scheduler has been configured and we got a driver - * tag for the next request already, free it again. - */ - rq = list_first_entry(list, struct request, queuelist); - blk_mq_put_driver_tag(rq); - spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); -- cgit v1.2.3 From 9c71c83c857e7a84a5be5a56ea88da7098f51db8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:33 +0800 Subject: blk-flush: don't run queue for requests bypassing flush blk_insert_flush() should only insert request since run queue always follows it. In case of bypassing flush, we don't need to run queue because every blk_insert_flush() follows one run queue. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 4938bec8cfef9..81bd1a8430431 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -463,7 +463,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) - blk_mq_sched_insert_request(rq, false, true, false, false); + blk_mq_sched_insert_request(rq, false, false, false, false); else list_add_tail(&rq->queuelist, &q->queue_head); return; -- cgit v1.2.3 From b0850297c749ea79a5717d597931366b3d7f4b09 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:34 +0800 Subject: block: pass 'run_queue' to blk_mq_request_bypass_insert Block flush need this function without running the queue, so add a parameter controlling whether we run it or not. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 5 +++-- block/blk-mq.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 223f32826e62d..b8d1aa2d1008f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2397,7 +2397,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. 
*/ - blk_mq_request_bypass_insert(rq); + blk_mq_request_bypass_insert(rq, true); return BLK_STS_OK; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 6eacc1dea8b74..021562bd5d2c3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1492,7 +1492,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ -void blk_mq_request_bypass_insert(struct request *rq) +void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) { struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); @@ -1501,7 +1501,8 @@ void blk_mq_request_bypass_insert(struct request *rq) list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); - blk_mq_run_hw_queue(hctx, false); + if (run_queue) + blk_mq_run_hw_queue(hctx, false); } void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, diff --git a/block/blk-mq.h b/block/blk-mq.h index f97aceff76e91..9fffec0ad8e9d 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -56,7 +56,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, */ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head); -void blk_mq_request_bypass_insert(struct request *rq); +void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); -- cgit v1.2.3 From 598906f814280762157629ba8833bf5cb11def74 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:35 +0800 Subject: blk-flush: use blk_mq_request_bypass_insert() In the following patch, we will use RQF_FLUSH_SEQ to decide: 1) if the flag isn't set, the flush rq need to be inserted via blk_insert_flush() 2) otherwise, the flush rq need to be dispatched directly since it is in flush machinery now. So we use blk_mq_request_bypass_insert() for requests of bypassing flush machinery, just like the legacy path did. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 81bd1a8430431..a9773d2075ac6 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -463,7 +463,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) - blk_mq_sched_insert_request(rq, false, false, false, false); + blk_mq_request_bypass_insert(rq, false); else list_add_tail(&rq->queuelist, &q->queue_head); return; -- cgit v1.2.3 From a6a252e6491443c1c18eab7e254daee63d4a7a04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:36 +0800 Subject: blk-mq-sched: decide how to handle flush rq via RQF_FLUSH_SEQ In case of IO scheduler we always pre-allocate one driver tag before calling blk_insert_flush(), and flush request will be marked as RQF_FLUSH_SEQ once it is in flush machinery. So if RQF_FLUSH_SEQ isn't set, we call blk_insert_flush() to handle the request, otherwise the flush request is dispatched to ->dispatch list directly. This is a preparation patch for not preallocating a driver tag for flush requests, and for not treating flush requests as a special case. This is similar to what the legacy path does. 
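Illustrative sketch (not part of the patch): the routing decision in one place. The flag values below are placeholders, not the kernel's; only the branch structure mirrors the new blk_mq_sched_insert_request()/bypass logic.

    #include <stdbool.h>
    #include <stdio.h>

    #define RQF_FLUSH_SEQ  0x1   /* placeholder values */
    #define REQ_PREFLUSH   0x2
    #define REQ_FUA        0x4

    static const char *route(unsigned rq_flags, unsigned cmd_flags)
    {
        bool op_is_flush = cmd_flags & (REQ_PREFLUSH | REQ_FUA);

        /* Not yet in the flush machinery: hand it to blk_insert_flush(). */
        if (!(rq_flags & RQF_FLUSH_SEQ) && op_is_flush)
            return "blk_insert_flush()";
        /* Already sequenced by the flush machinery: bypass the scheduler. */
        if (rq_flags & RQF_FLUSH_SEQ)
            return "hctx->dispatch list";
        return "I/O scheduler / sw queue";
    }

    int main(void)
    {
        printf("new PREFLUSH rq : %s\n", route(0, REQ_PREFLUSH));
        printf("flush-seq rq    : %s\n", route(RQF_FLUSH_SEQ, 0));
        printf("plain write     : %s\n", route(0, 0));
        return 0;
    }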
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 13a27d4d1671d..e7094f44afafc 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -345,21 +345,23 @@ void blk_mq_sched_request_inserted(struct request *rq) EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, + bool has_sched, struct request *rq) { - if (rq->tag == -1) { + /* dispatch flush rq directly */ + if (rq->rq_flags & RQF_FLUSH_SEQ) { + spin_lock(&hctx->lock); + list_add(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + return true; + } + + if (has_sched) { rq->rq_flags |= RQF_SORTED; - return false; + WARN_ON(rq->tag != -1); } - /* - * If we already have a real request tag, send directly to - * the dispatch list. - */ - spin_lock(&hctx->lock); - list_add(&rq->queuelist, &hctx->dispatch); - spin_unlock(&hctx->lock); - return true; + return false; } /* @@ -385,12 +387,13 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { + /* flush rq in flush machinery need to be dispatched directly */ + if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { blk_mq_sched_insert_flush(hctx, rq, can_block); return; } - if (e && blk_mq_sched_bypass_insert(hctx, rq)) + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; if (e && e->type->ops.mq.insert_requests) { @@ -428,7 +431,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q, list_for_each_entry_safe(rq, next, list, queuelist) { if (WARN_ON_ONCE(rq->tag != -1)) { list_del_init(&rq->queuelist); - blk_mq_sched_bypass_insert(hctx, rq); + blk_mq_sched_bypass_insert(hctx, true, rq); } } } -- cgit v1.2.3 From 244c65a3ccaa06fd15cc940315606674d3108b2f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 4 Nov 2017 12:39:57 -0600 Subject: blk-mq: move blk_mq_put_driver_tag*() into blk-mq.h We need this helper to put the driver tag for flush rq, since we will not share tag in the flush request sequence in the following patch in case that I/O scheduler is applied. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 32 -------------------------------- block/blk-mq.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 021562bd5d2c3..14f6886fbec80 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -996,38 +996,6 @@ done: return rq->tag != -1; } -static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); - rq->tag = -1; - - if (rq->rq_flags & RQF_MQ_INFLIGHT) { - rq->rq_flags &= ~RQF_MQ_INFLIGHT; - atomic_dec(&hctx->nr_active); - } -} - -static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - if (rq->tag == -1 || rq->internal_tag == -1) - return; - - __blk_mq_put_driver_tag(hctx, rq); -} - -static void blk_mq_put_driver_tag(struct request *rq) -{ - struct blk_mq_hw_ctx *hctx; - - if (rq->tag == -1 || rq->internal_tag == -1) - return; - - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); - __blk_mq_put_driver_tag(hctx, rq); -} - /* * If we fail getting a driver tag because all the driver tags are already * assigned and on the dispatch list, BUT the first entry does not have a diff --git a/block/blk-mq.h b/block/blk-mq.h index 9fffec0ad8e9d..2502f40ccdc07 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -2,6 +2,7 @@ #define INT_BLK_MQ_H #include "blk-stat.h" +#include "blk-mq-tag.h" struct blk_mq_tag_set; @@ -156,4 +157,36 @@ static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) return true; } +static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = -1; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} + +static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + __blk_mq_put_driver_tag(hctx, rq); +} + +static inline void blk_mq_put_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + __blk_mq_put_driver_tag(hctx, rq); +} + #endif -- cgit v1.2.3 From 923218f6166a84688973acdc39094f3bee1e9ad4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Nov 2017 23:24:38 +0800 Subject: blk-mq: don't allocate driver tag upfront for flush rq The idea behind it is simple: 1) for none scheduler, driver tag has to be borrowed for flush rq, otherwise we may run out of tag, and that causes an IO hang. And get/put driver tag is actually noop for none, so reordering tags isn't necessary at all. 2) for a real I/O scheduler, we need not allocate a driver tag upfront for flush rq. It works just fine to follow the same approach as normal requests: allocate driver tag for each rq just before calling ->queue_rq(). One driver visible change is that the driver tag isn't shared in the flush request sequence. That won't be a problem, since we always do that in legacy path. Then flush rq need not be treated specially wrt. get/put driver tag. This cleans up the code - for instance, reorder_tags_to_front() can be removed, and we needn't worry about request ordering in dispatch list for avoiding I/O deadlock. Also we have to put the driver tag before requeueing. 
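Illustrative sketch (not part of the patch): how the flush request ends up tagged after this change. With no scheduler it still borrows the driver tag of the request that triggered the flush; with a scheduler it only inherits the internal (scheduler) tag and gets a driver tag just before ->queue_rq(), like any other request. Field names below are simplified.

    #include <stdbool.h>
    #include <stdio.h>

    struct rq { int tag; int internal_tag; };

    static void setup_flush_rq(struct rq *flush_rq, const struct rq *first_rq,
                               bool has_elevator)
    {
        if (!has_elevator) {
            /* none: borrow the driver tag; the two can't be in flight at once */
            flush_rq->tag = first_rq->tag;
            flush_rq->internal_tag = -1;
        } else {
            /* scheduler: carry the scheduler tag only; the driver tag is
             * allocated at dispatch time, as for normal requests */
            flush_rq->tag = -1;
            flush_rq->internal_tag = first_rq->internal_tag;
        }
    }

    int main(void)
    {
        struct rq first = { .tag = 12, .internal_tag = 34 };
        struct rq flush;

        setup_flush_rq(&flush, &first, false);
        printf("none scheduler: tag=%d internal=%d\n", flush.tag, flush.internal_tag);
        setup_flush_rq(&flush, &first, true);
        printf("with scheduler: tag=%d internal=%d\n", flush.tag, flush.internal_tag);
        return 0;
    }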
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 35 ++++++++++++++++++++++++++--------- block/blk-mq-sched.c | 42 +++++------------------------------------- block/blk-mq.c | 41 ++++++----------------------------------- 3 files changed, 37 insertions(+), 81 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index a9773d2075ac6..f17170675917b 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); + flush_rq->tag = -1; + } else { + blk_mq_put_driver_tag_hctx(hctx, flush_rq); + flush_rq->internal_tag = -1; + } } running = &fq->flush_queue[fq->flush_running_idx]; @@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) blk_rq_init(q, flush_rq); /* - * Borrow tag from the first request since they can't - * be in flight at the same time. And acquire the tag's - * ownership for flush req. + * In case of none scheduler, borrow tag from the first request + * since they can't be in flight at the same time. And acquire + * the tag's ownership for flush req. + * + * In case of IO scheduler, flush rq need to borrow scheduler tag + * just for cheating put/get driver tag. */ if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; flush_rq->mq_ctx = first_rq->mq_ctx; - flush_rq->tag = first_rq->tag; - fq->orig_rq = first_rq; - hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + if (!q->elevator) { + fq->orig_rq = first_rq; + flush_rq->tag = first_rq->tag; + hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); + blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + } else { + flush_rq->internal_tag = first_rq->internal_tag; + } } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; @@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) hctx = blk_mq_map_queue(q, ctx->cpu); + if (q->elevator) { + WARN_ON(rq->tag < 0); + blk_mq_put_driver_tag_hctx(hctx, rq); + } + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e7094f44afafc..01a43fed6b8ce 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -356,29 +356,12 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } - if (has_sched) { + if (has_sched) rq->rq_flags |= RQF_SORTED; - WARN_ON(rq->tag != -1); - } return false; } -/* - * Add flush/fua to the queue. If we fail getting a driver tag, then - * punt to the requeue list. Requeue will re-invoke us from a context - * that's safe to block from. 
- */ -static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool can_block) -{ - if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { - blk_insert_flush(rq); - blk_mq_run_hw_queue(hctx, true); - } else - blk_mq_add_to_requeue_list(rq, false, true); -} - void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { @@ -389,10 +372,12 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, /* flush rq in flush machinery need to be dispatched directly */ if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { - blk_mq_sched_insert_flush(hctx, rq, can_block); - return; + blk_insert_flush(rq); + goto run; } + WARN_ON(e && (rq->tag != -1)); + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; @@ -419,23 +404,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); struct elevator_queue *e = hctx->queue->elevator; - if (e) { - struct request *rq, *next; - - /* - * We bypass requests that already have a driver tag assigned, - * which should only be flushes. Flushes are only ever inserted - * as single requests, so we shouldn't ever hit the - * WARN_ON_ONCE() below (but let's handle it just in case). - */ - list_for_each_entry_safe(rq, next, list, queuelist) { - if (WARN_ON_ONCE(rq->tag != -1)) { - list_del_init(&rq->queuelist); - blk_mq_sched_bypass_insert(hctx, true, rq); - } - } - } - if (e && e->type->ops.mq.insert_requests) e->type->ops.mq.insert_requests(hctx, list, false); else diff --git a/block/blk-mq.c b/block/blk-mq.c index 14f6886fbec80..c501cbd0de93f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -653,6 +653,8 @@ static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_put_driver_tag(rq); + trace_block_rq_requeue(q, rq); wbt_requeue(q->rq_wb, &rq->issue_stat); blk_mq_sched_requeue_request(rq); @@ -996,30 +998,6 @@ done: return rq->tag != -1; } -/* - * If we fail getting a driver tag because all the driver tags are already - * assigned and on the dispatch list, BUT the first entry does not have a - * tag, then we could deadlock. For that case, move entries with assigned - * driver tags to the front, leaving the set of tagged requests in the - * same order, and the untagged set in the same order. - */ -static bool reorder_tags_to_front(struct list_head *list) -{ - struct request *rq, *tmp, *first = NULL; - - list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { - if (rq == first) - break; - if (rq->tag != -1) { - list_move(&rq->queuelist, list); - if (!first) - first = rq; - } - } - - return first != NULL; -} - static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { @@ -1080,9 +1058,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { - if (!queued && reorder_tags_to_front(list)) - continue; - /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. 
@@ -1133,7 +1108,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, nxt = list_first_entry(list, struct request, queuelist); blk_mq_put_driver_tag(nxt); } - blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -1698,13 +1672,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (unlikely(is_flush_fua)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - if (q->elevator) { - blk_mq_sched_insert_request(rq, false, true, true, - true); - } else { - blk_insert_flush(rq); - blk_mq_run_hw_queue(data.hctx, true); - } + + /* bypass scheduler for flush rq */ + blk_insert_flush(rq); + blk_mq_run_hw_queue(data.hctx, true); } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL; -- cgit v1.2.3 From ff57dc94faec023abc267cdc45766fccff497557 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 6 Nov 2017 16:11:57 -0500 Subject: nbd: wait uninterruptible for the dead timeout If we have a pending signal or the user kills their application then it'll bring down the whole device, which is less than awesome. Instead wait uninterruptible for the dead timeout so we're sure we gave it our best shot. Fixes: 560bc4b39952 ("nbd: handle dead connections") Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3684e21d543f2..b94a735bfe3c0 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -715,9 +715,9 @@ static int wait_for_reconnect(struct nbd_device *nbd) return 0; if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) return 0; - wait_event_interruptible_timeout(config->conn_wait, - atomic_read(&config->live_connections), - config->dead_conn_timeout); + wait_event_timeout(config->conn_wait, + atomic_read(&config->live_connections), + config->dead_conn_timeout); return atomic_read(&config->live_connections); } -- cgit v1.2.3 From 6a468d5990ecd1c2d07dd85f8633bbdd0ba61c40 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 6 Nov 2017 16:11:58 -0500 Subject: nbd: don't start req until after the dead connection logic We can end up sleeping for a while waiting for the dead timeout, which means we could get the per request timer to fire. We did handle this case, but if the dead timeout happened right after we submitted we'd either tear down the connection or possibly requeue as we're handling an error and race with the endio which can lead to panics and other hilarity. Fixes: 560bc4b39952 ("nbd: handle dead connections") Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b94a735bfe3c0..95cab69d9c8be 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -289,15 +289,6 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, cmd->status = BLK_STS_TIMEOUT; return BLK_EH_HANDLED; } - - /* If we are waiting on our dead timer then we could get timeout - * callbacks for our request. For this we just want to reset the timer - * and let the queue side take care of everything. 
- */ - if (!completion_done(&cmd->send_complete)) { - nbd_config_put(nbd); - return BLK_EH_RESET_TIMER; - } config = nbd->config; if (config->num_connections > 1) { @@ -732,6 +723,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) if (!refcount_inc_not_zero(&nbd->config_refs)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); + blk_mq_start_request(req); return -EINVAL; } config = nbd->config; @@ -740,6 +732,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) dev_err_ratelimited(disk_to_dev(nbd->disk), "Attempted send on invalid socket\n"); nbd_config_put(nbd); + blk_mq_start_request(req); return -EINVAL; } cmd->status = BLK_STS_OK; @@ -763,6 +756,7 @@ again: */ sock_shutdown(nbd); nbd_config_put(nbd); + blk_mq_start_request(req); return -EIO; } goto again; @@ -773,6 +767,7 @@ again: * here so that it gets put _after_ the request that is already on the * dispatch list. */ + blk_mq_start_request(req); if (unlikely(nsock->pending && nsock->pending != req)) { blk_mq_requeue_request(req, true); ret = 0; @@ -785,10 +780,10 @@ again: ret = nbd_send_cmd(nbd, cmd, index); if (ret == -EAGAIN) { dev_err_ratelimited(disk_to_dev(nbd->disk), - "Request send failed trying another connection\n"); + "Request send failed, requeueing\n"); nbd_mark_nsock_dead(nbd, nsock, 1); - mutex_unlock(&nsock->tx_lock); - goto again; + blk_mq_requeue_request(req, true); + ret = 0; } out: mutex_unlock(&nsock->tx_lock); @@ -812,7 +807,6 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, * done sending everything over the wire. */ init_completion(&cmd->send_complete); - blk_mq_start_request(bd->rq); /* We can be called directly from the user space process, which means we * could possibly have signals pending so our sendmsg will fail. In -- cgit v1.2.3 From e8815241173ebba4ccad490d4907b354b795fa5c Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Tue, 7 Nov 2017 22:23:01 +0900 Subject: null_blk: fix default values in documentation null_blk.c has initial value of (1) nr_devices as 1. (2) completion_nsec as 10,000ns, not 10.000ns. documentation should be updated for fixes above. Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- Documentation/block/null_blk.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index b7161c8e70f2e..fcf7f8487d8ee 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -38,7 +38,7 @@ gb=[Size in GB]: Default: 250GB bs=[Block size (in bytes)]: Default: 512 bytes The block size reported to the system. -nr_devices=[Number of devices]: Default: 2 +nr_devices=[Number of devices]: Default: 1 Number of block devices instantiated. They are instantiated as /dev/nullb0, etc. @@ -52,7 +52,7 @@ irqmode=[0-2]: Default: 1-Soft-irq 2: Timer: Waits a specific period (completion_nsec) for each IO before completion. -completion_nsec=[ns]: Default: 10.000ns +completion_nsec=[ns]: Default: 10,000ns Combined with irqmode=2 (timer). The time each completion event must wait. submit_queues=[1..nr_cpus]: -- cgit v1.2.3 From bf9fc98b736b8a3837040d5ce3a7caf04fa0859c Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Tue, 7 Nov 2017 09:25:37 -0700 Subject: null_blk: add an usage for shared tags in documentation Add usage explanation for a shared_tags, introduced by commit: 82f402fefa50 ("null_blk: add support for shared tags") Signed-off-by: Minwoo Im Reworded slightly. 
Signed-off-by: Jens Axboe --- Documentation/block/null_blk.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index fcf7f8487d8ee..733927a7b501a 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -77,3 +77,8 @@ use_lightnvm=[0/1]: Default: 0 no_sched=[0/1]: Default: 0 0: nullb* use default blk-mq io scheduler. 1: nullb* doesn't use io scheduler. + +shared_tags=[0/1]: Default: 0 + 0: Tag set is not shared. + 1: Tag set shared between devices for blk-mq. Only makes sense with + nr_devices > 1, otherwise there's no tag set to share. -- cgit v1.2.3 From 1f2cac107c591c24b60b115d6050adc213d10fc0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Nov 2017 09:13:48 -0700 Subject: blktrace: fix unlocked access to init/start-stop/teardown sg.c calls into the blktrace functions without holding the proper queue mutex for doing setup, start/stop, or teardown. Add internal unlocked variants, and export the ones that do the proper locking. Fixes: 6da127ad0918 ("blktrace: Add blktrace ioctls to SCSI generic devices") Tested-by: Dmitry Vyukov Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 58 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 45a3928544ce5..ea57dd94b2b26 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -336,7 +336,7 @@ static void blk_trace_cleanup(struct blk_trace *bt) blk_unregister_tracepoints(); } -int blk_trace_remove(struct request_queue *q) +static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; @@ -349,6 +349,17 @@ int blk_trace_remove(struct request_queue *q) return 0; } + +int blk_trace_remove(struct request_queue *q) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_remove(q); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_remove); static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, @@ -550,9 +561,8 @@ err: return ret; } -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) +static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; int ret; @@ -571,6 +581,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } return 0; } + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_setup(q, name, dev, bdev, arg); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) @@ -607,7 +630,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, } #endif -int blk_trace_startstop(struct request_queue *q, int start) +static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; struct blk_trace *bt = q->blk_trace; @@ -646,6 +669,17 @@ int blk_trace_startstop(struct request_queue *q, int start) return ret; } + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_startstop(q, start); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_startstop); /* @@ -676,7 +710,7 @@ int 
blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: @@ -687,10 +721,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) case BLKTRACESTART: start = 1; case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); + ret = __blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); + ret = __blk_trace_remove(q); break; default: ret = -ENOTTY; @@ -708,10 +742,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { + mutex_lock(&q->blk_trace_mutex); + if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); + __blk_trace_startstop(q, 0); + __blk_trace_remove(q); } + + mutex_unlock(&q->blk_trace_mutex); } #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From a6da0024ffc19e0d47712bb5ca4fd083f76b07df Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Nov 2017 09:16:09 -0700 Subject: blktrace: fix unlocked registration of tracepoints We need to ensure that tracepoints are registered and unregistered with the users of them. The existing atomic count isn't enough for that. Add a lock around the tracepoints, so we serialize access to them. This fixes cases where we have multiple users setting up and tearing down tracepoints, like this: CPU: 0 PID: 2995 Comm: syzkaller857118 Not tainted 4.14.0-rc5-next-20171018+ #36 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x41c kernel/panic.c:183 __warn+0x1c4/0x1e0 kernel/panic.c:546 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:177 do_trap_no_signal arch/x86/kernel/traps.c:211 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:260 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:297 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:310 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:tracepoint_add_func kernel/tracepoint.c:210 [inline] RIP: 0010:tracepoint_probe_register_prio+0x397/0x9a0 kernel/tracepoint.c:283 RSP: 0018:ffff8801d1d1f6c0 EFLAGS: 00010293 RAX: ffff8801d22e8540 RBX: 00000000ffffffef RCX: ffffffff81710f07 RDX: 0000000000000000 RSI: ffffffff85b679c0 RDI: ffff8801d5f19818 RBP: ffff8801d1d1f7c8 R08: ffffffff81710c10 R09: 0000000000000004 R10: ffff8801d1d1f6b0 R11: 0000000000000003 R12: ffffffff817597f0 R13: 0000000000000000 R14: 00000000ffffffff R15: ffff8801d1d1f7a0 tracepoint_probe_register+0x2a/0x40 kernel/tracepoint.c:304 register_trace_block_rq_insert include/trace/events/block.h:191 [inline] blk_register_tracepoints+0x1e/0x2f0 kernel/trace/blktrace.c:1043 do_blk_trace_setup+0xa10/0xcf0 kernel/trace/blktrace.c:542 blk_trace_setup+0xbd/0x180 kernel/trace/blktrace.c:564 sg_ioctl+0xc71/0x2d90 drivers/scsi/sg.c:1089 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x444339 RSP: 002b:00007ffe05bb5b18 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000006d66c0 RCX: 0000000000444339 RDX: 000000002084cf90 RSI: 00000000c0481273 RDI: 0000000000000009 RBP: 0000000000000082 
R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000206 R12: ffffffffffffffff R13: 00000000c0481273 R14: 0000000000000000 R15: 0000000000000000 since we can now run these in parallel. Ensure that the exported helpers for doing this are grabbing the queue trace mutex. Reported-by: Steven Rostedt Tested-by: Dmitry Vyukov Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ea57dd94b2b26..206e0e2ace53d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = { }; /* Global reference count of probes */ -static atomic_t blk_probes_ref = ATOMIC_INIT(0); +static DEFINE_MUTEX(blk_probe_mutex); +static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); @@ -329,11 +330,26 @@ static void blk_trace_free(struct blk_trace *bt) kfree(bt); } +static void get_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (++blk_probes_ref == 1) + blk_register_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void put_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (!--blk_probes_ref) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + static void blk_trace_cleanup(struct blk_trace *bt) { blk_trace_free(bt); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); + put_probe_ref(); } static int __blk_trace_remove(struct request_queue *q) @@ -549,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (cmpxchg(&q->blk_trace, NULL, bt)) goto err; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); ret = 0; err: @@ -1596,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - + put_probe_ref(); blk_trace_free(bt); return 0; } @@ -1629,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q, if (cmpxchg(&q->blk_trace, NULL, bt)) goto free_bt; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); return 0; free_bt: -- cgit v1.2.3 From e54b064cb24c8268252336ccdd6523e08c0dcbde Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:51 +0300 Subject: nvme: move the dying queue check from cancel to completion With multipath we don't want a hard DNR bit on a request that is cancelled by a controller reset, but instead want to be able to retry it on another patch. To archive this don't always set the DNR bit when the queue is dying in nvme_cancel_request, but defer that decision to nvme_req_needs_retry. Note that it applies to any command there and not just cancelled commands, but one the queue is dying that is the right thing to do anyway. 
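For illustration only: with the DNR bit no longer forced at cancellation time, an upper layer can distinguish "never retry" from "retry elsewhere". A minimal sketch of that decision; the failover helpers named here are assumptions for illustration, not code from this series:

	/* Sketch: completion-side handling in a multipath-aware caller. */
	if (nvme_req(req)->status & NVME_SC_DNR)
		fail_request(req);		/* controller forbids any retry */
	else
		retry_on_other_path(req);	/* hypothetical failover helper */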
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 64744355aa88e..a9c7ad304ec5d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -172,6 +172,8 @@ static inline bool nvme_req_needs_retry(struct request *req) return false; if (nvme_req(req)->retries >= nvme_max_retries) return false; + if (blk_queue_dying(req->q)) + return false; return true; } @@ -189,18 +191,13 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq); void nvme_cancel_request(struct request *req, void *data, bool reserved) { - int status; - if (!blk_mq_request_started(req)) return; dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); - status = NVME_SC_ABORT_REQ; - if (blk_queue_dying(req->q)) - status |= NVME_SC_DNR; - nvme_req(req)->status = status; + nvme_req(req)->status = NVME_SC_ABORT_REQ; blk_mq_complete_request(req); } -- cgit v1.2.3 From b5be3b392998a5f3763dfbdb1866f81d8ee631c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:52 +0300 Subject: nvme: always unregister the integrity profile in __nvme_revalidate_disk This is safe because the queue is always frozen when we revalidate, and it simplifies both the existing code as well as the multipath implementation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a9c7ad304ec5d..da5357c66a0f2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1081,29 +1081,6 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } #ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, - u16 bs) -{ - struct nvme_ns *ns = disk->private_data; - u16 old_ms = ns->ms; - u8 pi_type = 0; - - ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); - - /* PI implementation requires metadata equal t10 pi tuple size */ - if (ns->ms == sizeof(struct t10_pi_tuple)) - pi_type = id->dps & NVME_NS_DPS_PI_MASK; - - if (blk_get_integrity(disk) && - (ns->pi_type != pi_type || ns->ms != old_ms || - bs != queue_logical_block_size(disk->queue) || - (ns->ms && ns->ext))) - blk_integrity_unregister(disk); - - ns->pi_type = pi_type; -} - static void nvme_init_integrity(struct nvme_ns *ns) { struct blk_integrity integrity; @@ -1130,10 +1107,6 @@ static void nvme_init_integrity(struct nvme_ns *ns) blk_queue_max_integrity_segments(ns->queue, 1); } #else -static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, - u16 bs) -{ -} static void nvme_init_integrity(struct nvme_ns *ns) { } @@ -1202,15 +1175,22 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) ns->lba_shift = 9; bs = 1 << ns->lba_shift; ns->noiob = le16_to_cpu(id->noiob); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + /* the PI implementation requires metadata equal t10 pi tuple size */ + if (ns->ms == sizeof(struct t10_pi_tuple)) + ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; + else + ns->pi_type = 0; 
blk_mq_freeze_queue(disk->queue); + blk_integrity_unregister(disk); - if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) - nvme_prep_integrity(disk, id, bs); blk_queue_logical_block_size(ns->queue, bs); if (ns->noiob) nvme_set_chunk_size(ns); - if (ns->ms && !blk_get_integrity(disk) && !ns->ext) + if (ns->ms && !ns->ext && + (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(ns); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) set_capacity(disk, 0); -- cgit v1.2.3 From 39b7baa410fd00a0fe9da864400d95b3d5008de7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:53 +0300 Subject: nvme: don't pass struct nvme_ns to nvme_init_integrity To allow reusing this function for the multipath node. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index da5357c66a0f2..88d886b390f41 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1081,12 +1081,12 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } #ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_init_integrity(struct nvme_ns *ns) +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) { struct blk_integrity integrity; memset(&integrity, 0, sizeof(integrity)); - switch (ns->pi_type) { + switch (pi_type) { case NVME_NS_DPS_PI_TYPE3: integrity.profile = &t10_pi_type3_crc; integrity.tag_size = sizeof(u16) + sizeof(u32); @@ -1102,12 +1102,12 @@ static void nvme_init_integrity(struct nvme_ns *ns) integrity.profile = NULL; break; } - integrity.tuple_size = ns->ms; - blk_integrity_register(ns->disk, &integrity); - blk_queue_max_integrity_segments(ns->queue, 1); + integrity.tuple_size = ms; + blk_integrity_register(disk, &integrity); + blk_queue_max_integrity_segments(disk->queue, 1); } #else -static void nvme_init_integrity(struct nvme_ns *ns) +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ @@ -1191,7 +1191,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) nvme_set_chunk_size(ns); if (ns->ms && !ns->ext && (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - nvme_init_integrity(ns); + nvme_init_integrity(disk, ns->ms, ns->pi_type); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) set_capacity(disk, 0); else -- cgit v1.2.3 From 30e5e929c7bfaf87a65f926715773ef14339b79f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:54 +0300 Subject: nvme: don't pass struct nvme_ns to nvme_config_discard To allow reusing this function for the multipath node. 
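As a rough sketch of the intended reuse (only the per-namespace call exists in this series; the multipath symbols below are illustrative assumptions):

	/* Existing per-namespace caller after the refactor. */
	nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);

	/* A multipath head device could later pass its own queue. */
	nvme_config_discard(ctrl, head_stream_alignment, head_disk->queue);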
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 88d886b390f41..551ec5df6a97f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1118,29 +1118,26 @@ static void nvme_set_chunk_size(struct nvme_ns *ns) blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); } -static void nvme_config_discard(struct nvme_ns *ns) +static void nvme_config_discard(struct nvme_ctrl *ctrl, + unsigned stream_alignment, struct request_queue *queue) { - struct nvme_ctrl *ctrl = ns->ctrl; - u32 logical_block_size = queue_logical_block_size(ns->queue); + u32 size = queue_logical_block_size(queue); + + if (stream_alignment) + size *= stream_alignment; BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - if (ctrl->nr_streams && ns->sws && ns->sgs) { - unsigned int sz = logical_block_size * ns->sws * ns->sgs; + queue->limits.discard_alignment = size; + queue->limits.discard_granularity = size; - ns->queue->limits.discard_alignment = sz; - ns->queue->limits.discard_granularity = sz; - } else { - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - } - blk_queue_max_discard_sectors(ns->queue, UINT_MAX); - blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES); - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); + blk_queue_max_discard_sectors(queue, UINT_MAX); + blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) - blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX); + blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, @@ -1164,6 +1161,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; + unsigned stream_alignment = 0; u16 bs; /* @@ -1183,6 +1181,9 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) else ns->pi_type = 0; + if (ctrl->nr_streams && ns->sws && ns->sgs) + stream_alignment = ns->sws * ns->sgs; + blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); @@ -1198,7 +1199,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); if (ctrl->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); + nvme_config_discard(ctrl, stream_alignment, disk->queue); blk_mq_unfreeze_queue(disk->queue); } -- cgit v1.2.3 From 6e78f21ae4488cdab4318f781de05c8ffc4de951 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:55 +0300 Subject: nvme: set the chunk size before freezing the queue We don't need a frozen queue to update the chunk_size, which just is a hint, and moving it a little earlier will allow for some better code reuse with the multipath code. 
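The resulting ordering can be summarized as follows; this is a simplified sketch of the revalidation flow, not the literal driver code:

	/* Hints that do not change how data is interpreted: no freeze needed. */
	if (ns->noiob)
		nvme_set_chunk_size(ns);

	/* Settings that affect data layout: apply with the queue frozen. */
	blk_mq_freeze_queue(disk->queue);
	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	/* ... integrity profile, capacity, discard granularity ... */
	blk_mq_unfreeze_queue(disk->queue);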
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 551ec5df6a97f..a4be843334baf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1184,12 +1184,13 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ctrl->nr_streams && ns->sws && ns->sgs) stream_alignment = ns->sws * ns->sgs; + if (ns->noiob) + nvme_set_chunk_size(ns); + blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); blk_queue_logical_block_size(ns->queue, bs); - if (ns->noiob) - nvme_set_chunk_size(ns); if (ns->ms && !ns->ext && (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(disk, ns->ms, ns->pi_type); -- cgit v1.2.3 From 24b0b58c5b4a9f6f4bbd1c1efd5afb9a565da3ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:56 +0300 Subject: nvme: split __nvme_revalidate_disk Split out the code that applies the calculate value to a given disk/queue into new helper that can be reused by the multipath code. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 49 +++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a4be843334baf..28d58d353a420 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1157,12 +1157,34 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, } } +static void nvme_update_disk_info(struct gendisk *disk, + struct nvme_ns *ns, struct nvme_id_ns *id) +{ + sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9); + unsigned stream_alignment = 0; + + if (ns->ctrl->nr_streams && ns->sws && ns->sgs) + stream_alignment = ns->sws * ns->sgs; + + blk_mq_freeze_queue(disk->queue); + blk_integrity_unregister(disk); + + blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift); + if (ns->ms && !ns->ext && + (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) + nvme_init_integrity(disk, ns->ms, ns->pi_type); + if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + capacity = 0; + set_capacity(disk, capacity); + + if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns->ctrl, stream_alignment, disk->queue); + blk_mq_unfreeze_queue(disk->queue); +} + static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; - struct nvme_ctrl *ctrl = ns->ctrl; - unsigned stream_alignment = 0; - u16 bs; /* * If identify namespace failed, use default 512 byte block size so @@ -1171,7 +1193,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; - bs = 1 << ns->lba_shift; ns->noiob = le16_to_cpu(id->noiob); ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); @@ -1181,27 +1202,9 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) else ns->pi_type = 0; - if (ctrl->nr_streams && ns->sws && ns->sgs) - stream_alignment = ns->sws * ns->sgs; - if (ns->noiob) nvme_set_chunk_size(ns); - - blk_mq_freeze_queue(disk->queue); - blk_integrity_unregister(disk); - - 
blk_queue_logical_block_size(ns->queue, bs); - if (ns->ms && !ns->ext && - (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - nvme_init_integrity(disk, ns->ms, ns->pi_type); - if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) - set_capacity(disk, 0); - else - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (ctrl->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ctrl, stream_alignment, disk->queue); - blk_mq_unfreeze_queue(disk->queue); + nvme_update_disk_info(disk, ns, id); } static int nvme_revalidate_disk(struct gendisk *disk) -- cgit v1.2.3 From 715ea9e09dc81f18cad5ed64800f49b1c4c444df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Nov 2017 17:27:34 +0100 Subject: nvme: fix and clarify the check for missing metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the check in nvme_setup_rw for missing metadata so that it is together with the other metadata handling, does not contain impossible to reach conditions and warns if we get an impossible requests for a (non-PI) metadata-enabled namespace when CONFIG_BLK_DEV_INTEGRITY is not set. Also add a little helper that checks if a given metadata configuration contains protection information Signed-off-by: Christoph Hellwig Reported-by: Javier González Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 28d58d353a420..3aabaa103e106 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -137,6 +137,11 @@ int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); +static inline bool nvme_ns_has_pi(struct nvme_ns *ns) +{ + return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); +} + static blk_status_t nvme_error_status(struct request *req) { switch (nvme_req(req)->status & 0x7ff) { @@ -472,16 +477,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, u16 control = 0; u32 dsmgmt = 0; - /* - * If formated with metadata, require the block layer provide a buffer - * unless this namespace is formated such that the metadata can be - * stripped/generated by the controller with PRACT=1. - */ - if (ns && ns->ms && - (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) && - !blk_integrity_rq(req) && !blk_rq_is_passthrough(req)) - return BLK_STS_NOTSUPP; - if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) @@ -500,6 +495,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); if (ns->ms) { + /* + * If formated with metadata, the block layer always provides a + * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else + * we enable the PRACT bit for protection information or set the + * namespace capacity to zero to prevent any I/O. 
+ */ + if (!blk_integrity_rq(req)) { + if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) + return BLK_STS_NOTSUPP; + control |= NVME_RW_PRINFO_PRACT; + } + switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: control |= NVME_RW_PRINFO_PRCHK_GUARD; @@ -512,8 +519,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, nvme_block_nr(ns, blk_rq_pos(req))); break; } - if (!blk_integrity_rq(req)) - control |= NVME_RW_PRINFO_PRACT; } cmnd->rw.control = cpu_to_le16(control); @@ -1173,7 +1178,7 @@ static void nvme_update_disk_info(struct gendisk *disk, if (ns->ms && !ns->ext && (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(disk, ns->ms, ns->pi_type); - if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) capacity = 0; set_capacity(disk, capacity); -- cgit v1.2.3 From c627c487ec727350c6aaa1cf217b666099e83ee2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 10:28:31 -0700 Subject: nvme: factor get log into a helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit And fix the warning on a successful firmware log. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3aabaa103e106..90b6375a9da5b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1747,6 +1747,18 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); } +static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, + size_t size) +{ + struct nvme_command c = { }; + + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(NVME_NSID_ALL); + c.common.cdw10[0] = nvme_get_log_dw10(log_page, size); + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -2579,18 +2591,13 @@ static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) { - struct nvme_command c = { }; struct nvme_fw_slot_info_log *log; log = kmalloc(sizeof(*log), GFP_KERNEL); if (!log) return; - c.common.opcode = nvme_admin_get_log_page; - c.common.nsid = cpu_to_le32(NVME_NSID_ALL); - c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log)); - - if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log))) + if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); -- cgit v1.2.3 From 84fef62d135b6e47b52f4e9280b5dbc5bb0050ba Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 10:28:32 -0700 Subject: nvme: check admin passthru command effects The NVMe standard provides a command effects log page so the host may be aware of special requirements it may need to do for a particular command. For example, the command may need to run with IO quiesced to prevent timeouts or undefined behavior, or it may change the logical block formats that determine how the host needs to construct future commands. 
This patch saves the nvme command effects log page if the controller supports it, and performs appropriate actions before and after an admin passthrough command is completed. If the controller does not support the command effects log page, the driver will define the effects for known opcodes. The nvme format and santize are the only commands in this patch with known effects. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 18 ++++++++ 3 files changed, 126 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 90b6375a9da5b..65fd2fc1ae3c2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -72,6 +72,9 @@ static DEFINE_IDA(nvme_instance_ida); static dev_t nvme_chr_devt; static struct class *nvme_class; +static void nvme_ns_remove(struct nvme_ns *ns); +static int nvme_revalidate_disk(struct gendisk *disk); + static __le32 nvme_get_log_dw10(u8 lid, size_t size) { return cpu_to_le32((((size / 4) - 1) << 16) | lid); @@ -992,12 +995,87 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) metadata, meta_len, io.slba, NULL, 0); } +static u32 nvme_known_admin_effects(u8 opcode) +{ + switch (opcode) { + case nvme_admin_format_nvm: + return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK; + case nvme_admin_sanitize_nvm: + return NVME_CMD_EFFECTS_CSE_MASK; + default: + break; + } + return 0; +} + +static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode) +{ + u32 effects = 0; + + if (ns) { + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + if (effects & ~NVME_CMD_EFFECTS_CSUPP) + dev_warn(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); + return 0; + } + + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + else + effects = nvme_known_admin_effects(opcode); + + /* + * For simplicity, IO to all namespaces is quiesced even if the command + * effects say only one namespace is affected. + */ + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + } + return effects; +} + +static void nvme_update_formats(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->disk && nvme_revalidate_disk(ns->disk)) + nvme_ns_remove(ns); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +{ + /* + * Revalidate LBA changes prior to unfreezing. This is necessary to + * prevent memory corruption if a logical block size was changed by + * this command. 
+ */ + if (effects & NVME_CMD_EFFECTS_LBCC) + nvme_update_formats(ctrl); + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) + nvme_unfreeze(ctrl); + if (effects & NVME_CMD_EFFECTS_CCC) + nvme_init_identify(ctrl); + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) + nvme_queue_scan(ctrl); +} + static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; struct nvme_command c; unsigned timeout = 0; + u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1023,10 +1101,13 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); + effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, (void __user *)(uintptr_t)cmd.metadata, cmd.metadata, 0, &cmd.result, timeout); + nvme_passthru_end(ctrl, effects); + if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) return -EFAULT; @@ -1759,6 +1840,25 @@ static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } +static int nvme_get_effects_log(struct nvme_ctrl *ctrl) +{ + int ret; + + if (!ctrl->effects) + ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + + if (!ctrl->effects) + return 0; + + ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, + sizeof(*ctrl->effects)); + if (ret) { + kfree(ctrl->effects); + ctrl->effects = NULL; + } + return ret; +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -1794,6 +1894,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { + ret = nvme_get_effects_log(ctrl); + if (ret < 0) + return ret; + } + nvme_init_subnqn(ctrl, id); if (!ctrl->identified) { @@ -2713,6 +2819,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); + kfree(ctrl->effects); ctrl->ops->free_ctrl(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 35d9cee515f1b..f7c21cea94853 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -174,6 +174,7 @@ struct nvme_ctrl { bool subsystem; unsigned long quirks; struct nvme_id_power_state psd[32]; + struct nvme_effects_log *effects; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9310ce77d8e1f..fd1d4508a6129 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -267,6 +267,7 @@ enum { NVME_CTRL_OACS_SEC_SUPP = 1 << 0, NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, + NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, }; struct nvme_lbaf { @@ -395,6 +396,21 @@ struct nvme_fw_slot_info_log { __u8 rsvd64[448]; }; +enum { + NVME_CMD_EFFECTS_CSUPP = 1 << 0, + NVME_CMD_EFFECTS_LBCC = 1 << 1, + NVME_CMD_EFFECTS_NCC = 1 << 2, + NVME_CMD_EFFECTS_NIC = 1 << 3, + NVME_CMD_EFFECTS_CCC = 1 << 4, + NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, +}; + +struct nvme_effects_log { + __le32 acs[256]; + __le32 iocs[256]; + __u8 resv[2048]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -681,6 +697,7 @@ enum nvme_admin_opcode { nvme_admin_format_nvm = 0x80, nvme_admin_security_send 
= 0x81, nvme_admin_security_recv = 0x82, + nvme_admin_sanitize_nvm = 0x84, }; enum { @@ -712,6 +729,7 @@ enum { NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_CMD_EFFECTS = 0x05, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), -- cgit v1.2.3 From c5760f300e25f7d2cea9a9002b1e058a5c18b73f Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Nov 2017 08:13:16 -0700 Subject: nvme-fc: fix localport resume using stale values The localport resume was not updating the lldd ops structure. If the lldd is unloaded and reloaded, the ops pointers will differ. Additionally, as there are device references taken by the localport, ensure that resume only resumes if the device matches as well. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 113c30be7276b..c49971d1a0515 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -268,7 +268,9 @@ nvme_fc_lport_get(struct nvme_fc_lport *lport) static struct nvme_fc_lport * -nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) +nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo, + struct nvme_fc_port_template *ops, + struct device *dev) { struct nvme_fc_lport *lport; unsigned long flags; @@ -280,6 +282,11 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) lport->localport.port_name != pinfo->port_name) continue; + if (lport->dev != dev) { + lport = ERR_PTR(-EXDEV); + goto out_done; + } + if (lport->localport.port_state != FC_OBJSTATE_DELETED) { lport = ERR_PTR(-EEXIST); goto out_done; @@ -296,6 +303,7 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) /* resume the lport */ + lport->ops = ops; lport->localport.port_role = pinfo->port_role; lport->localport.port_id = pinfo->port_id; lport->localport.port_state = FC_OBJSTATE_ONLINE; @@ -356,7 +364,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, * expired, we can simply re-enable the localport. Remoteports * and controller reconnections should resume naturally. */ - newrec = nvme_fc_attach_to_unreg_lport(pinfo); + newrec = nvme_fc_attach_to_unreg_lport(pinfo, template, dev); /* found an lport, but something about its state is bad */ if (IS_ERR(newrec)) { -- cgit v1.2.3 From 158bfb8888c3f2beab747ab4f7a8197639f03e54 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Nov 2017 08:13:17 -0700 Subject: nvme-fc: decouple ns references from lldd references In the lldd api, a lldd may unregister a remoteport (loss of connectivity or driver unload) or localport (driver unload). The lldd must wait for the remoteport_delete or localport_delete before completing its actions post the unregister. The xxx_deletes currently occur only when the xxxport structure is fully freed after all references are removed. Thus the lldd may be held hostage until an app or in-kernel entity that has a namespace open finally closes so the namespace can be removed, the controller removed, thus the transport objects, thus the lldd. This patch decouples the transport and os-facing objects from the lldd and the remoteport and localport. There is a point in all deletions where the transport will no longer interact with the lldd on behalf of a controller. That point centers around the association established with the target/subsystem. 
It will access the lldd whenever it attempts to create an association and while the association is active. New associations may only be created if the remoteport is live (thus the localport is live). It will not access the lldd after deleting the association. Therefore, the patch tracks the count of active controllers - those with associations being created or that are active - on a remoteport. It also tracks the number of remoteports that have active controllers, on a a localport. When a remoteport is unregistered, as soon as there are no active controllers, the lldd's remoteport_delete may be called and the lldd may continue. Similarly, when a localport is unregistered, as soon as there are no remoteports with active controllers, the localport_delete callback may be made. This significantly speeds up unregistration with the lldd. The transport objects continue in suspended status with reconnect timers running, and upon expiration, normal ref-counting will occur and the objects will be freed. The transport object may still be held hostage by the application/kernel module, but that is acceptable. With this change, the lldd may be fully unloaded and reloaded, and if registrations occur prior to the timeouts, the nvme controller and namespaces will resume normally as if a link bounce. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index c49971d1a0515..da9296a1eb269 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -126,6 +126,7 @@ struct nvme_fc_lport { struct device *dev; /* physical device for dma */ struct nvme_fc_port_template *ops; struct kref ref; + atomic_t act_rport_cnt; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ struct nvme_fc_rport { @@ -138,6 +139,7 @@ struct nvme_fc_rport { struct nvme_fc_lport *lport; spinlock_t lock; struct kref ref; + atomic_t act_ctrl_cnt; unsigned long dev_loss_end; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ @@ -153,6 +155,7 @@ struct nvme_fc_ctrl { struct nvme_fc_rport *rport; u32 cnum; + bool assoc_active; u64 association_id; struct list_head ctrl_list; /* rport->ctrl_list */ @@ -243,9 +246,6 @@ nvme_fc_free_lport(struct kref *ref) list_del(&lport->port_list); spin_unlock_irqrestore(&nvme_fc_lock, flags); - /* let the LLDD know we've finished tearing it down */ - lport->ops->localport_delete(&lport->localport); - ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); ida_destroy(&lport->endp_cnt); @@ -400,6 +400,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, INIT_LIST_HEAD(&newrec->port_list); INIT_LIST_HEAD(&newrec->endp_list); kref_init(&newrec->ref); + atomic_set(&newrec->act_rport_cnt, 0); newrec->ops = template; newrec->dev = dev; ida_init(&newrec->endp_cnt); @@ -462,6 +463,9 @@ nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr) spin_unlock_irqrestore(&nvme_fc_lock, flags); + if (atomic_read(&lport->act_rport_cnt) == 0) + lport->ops->localport_delete(&lport->localport); + nvme_fc_lport_put(lport); return 0; @@ -515,9 +519,6 @@ nvme_fc_free_rport(struct kref *ref) list_del(&rport->endp_list); spin_unlock_irqrestore(&nvme_fc_lock, flags); - /* let the LLDD know we've finished tearing it down */ - lport->ops->remoteport_delete(&rport->remoteport); - ida_simple_remove(&lport->endp_cnt, 
rport->remoteport.port_num); kfree(rport); @@ -704,6 +705,7 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, INIT_LIST_HEAD(&newrec->ctrl_list); INIT_LIST_HEAD(&newrec->ls_req_list); kref_init(&newrec->ref); + atomic_set(&newrec->act_ctrl_cnt, 0); spin_lock_init(&newrec->lock); newrec->remoteport.localport = &lport->localport; newrec->dev = lport->dev; @@ -859,6 +861,9 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) nvme_fc_abort_lsops(rport); + if (atomic_read(&rport->act_ctrl_cnt) == 0) + rport->lport->ops->remoteport_delete(portptr); + /* * release the reference, which will allow, if all controllers * go away, which should only occur after dev_loss_tmo occurs, @@ -2639,6 +2644,61 @@ out_free_io_queues: return ret; } +static void +nvme_fc_rport_active_on_lport(struct nvme_fc_rport *rport) +{ + struct nvme_fc_lport *lport = rport->lport; + + atomic_inc(&lport->act_rport_cnt); +} + +static void +nvme_fc_rport_inactive_on_lport(struct nvme_fc_rport *rport) +{ + struct nvme_fc_lport *lport = rport->lport; + u32 cnt; + + cnt = atomic_dec_return(&lport->act_rport_cnt); + if (cnt == 0 && lport->localport.port_state == FC_OBJSTATE_DELETED) + lport->ops->localport_delete(&lport->localport); +} + +static int +nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_rport *rport = ctrl->rport; + u32 cnt; + + if (ctrl->assoc_active) + return 1; + + ctrl->assoc_active = true; + cnt = atomic_inc_return(&rport->act_ctrl_cnt); + if (cnt == 1) + nvme_fc_rport_active_on_lport(rport); + + return 0; +} + +static int +nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_rport *rport = ctrl->rport; + struct nvme_fc_lport *lport = rport->lport; + u32 cnt; + + /* ctrl->assoc_active=false will be set independently */ + + cnt = atomic_dec_return(&rport->act_ctrl_cnt); + if (cnt == 0) { + if (rport->remoteport.port_state == FC_OBJSTATE_DELETED) + lport->ops->remoteport_delete(&rport->remoteport); + nvme_fc_rport_inactive_on_lport(rport); + } + + return 0; +} + /* * This routine restarts the controller on the host side, and * on the link side, recreates the controller association. 
@@ -2655,6 +2715,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) return -ENODEV; + if (nvme_fc_ctlr_active_on_rport(ctrl)) + return -ENOTUNIQ; + /* * Create the admin queue */ @@ -2762,6 +2825,8 @@ out_delete_hw_queue: __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); out_free_queue: nvme_fc_free_queue(&ctrl->queues[0]); + ctrl->assoc_active = false; + nvme_fc_ctlr_inactive_on_rport(ctrl); return ret; } @@ -2777,6 +2842,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) { unsigned long flags; + if (!ctrl->assoc_active) + return; + ctrl->assoc_active = false; + spin_lock_irqsave(&ctrl->lock, flags); ctrl->flags |= FCCTRL_TERMIO; ctrl->iocnt = 0; @@ -2849,6 +2918,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); nvme_fc_free_queue(&ctrl->queues[0]); + + nvme_fc_ctlr_inactive_on_rport(ctrl); } static void @@ -3048,6 +3119,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->rport = rport; ctrl->dev = lport->dev; ctrl->cnum = idx; + ctrl->assoc_active = false; get_device(ctrl->dev); kref_init(&ctrl->ref); -- cgit v1.2.3 From 6ddcf0a30adc5080504ca66f474101e7ad247dd7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Nov 2017 09:33:30 -0700 Subject: lpfc: tie in to new dev_loss_tmo interface in nvme transport This patch calls the new nvme transport routine for dev_loss_tmo whenever the SCSI fc transport calls the lldd to make a dynamic change to a remote ports dev_loss_tmo. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/lpfc/lpfc_attr.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index c17677f494afe..3e02bc3a7c3f7 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -3246,6 +3246,11 @@ lpfc_update_rport_devloss_tmo(struct lpfc_vport *vport) continue; if (ndlp->rport) ndlp->rport->dev_loss_tmo = vport->cfg_devloss_tmo; +#if (IS_ENABLED(CONFIG_NVME_FC)) + if (ndlp->nrport) + nvme_fc_set_remoteport_devloss(ndlp->nrport->remoteport, + vport->cfg_devloss_tmo); +#endif } spin_unlock_irq(shost->host_lock); } -- cgit v1.2.3 From 43b92fd27aaef0f529c9321cfebbaec1d7b8f503 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 5 Nov 2017 08:43:01 +0000 Subject: nvmet-rdma: update queue list during ib_device removal A NULL deref happens when nvmet_rdma_remove_one() is called more than once (e.g. while connected via 2 ports). The first call frees the queues related to the first ib_device but doesn't remove them from the queue list. While calling nvmet_rdma_remove_one() for the second ib_device it goes over the full queue list again and we get the NULL deref. 
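The idiom relied on here is the standard one for removing entries while walking a list: iterate with the _safe variant and unlink each matching entry before tearing it down, so a later walk of the global list never touches freed memory. A generic sketch mirroring the fix below:

	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, queue_list) {
		if (queue->dev->device != ib_device)
			continue;
		list_del_init(&queue->queue_list);	/* unlink before disconnect */
		__nvmet_rdma_queue_disconnect(queue);
	}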
Fixes: f1d4ef7d ("nvmet-rdma: register ib_client to not deadlock in device removal") Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/rdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 76d2bb793afe5..3333d417b248b 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1512,15 +1512,17 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = { static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) { - struct nvmet_rdma_queue *queue; + struct nvmet_rdma_queue *queue, *tmp; /* Device is being removed, delete all queues using this device */ mutex_lock(&nvmet_rdma_queue_mutex); - list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, + queue_list) { if (queue->dev->device != ib_device) continue; pr_info("Removing queue %d\n", queue->idx); + list_del_init(&queue->queue_list); __nvmet_rdma_queue_disconnect(queue); } mutex_unlock(&nvmet_rdma_queue_mutex); -- cgit v1.2.3 From 1f61def985d896da5935ae85a69c79fc26c7952f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 6 Nov 2017 16:18:51 +0200 Subject: nvme-rdma: fix nvme_rdma_create_queue_ib error flow QP object is created using rdma_cm api, therefore the destruction should use the same api for symmetry. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 03644ecf68d29..32e21ab1ae520 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -493,7 +493,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) return 0; out_destroy_qp: - ib_destroy_qp(queue->qp); + rdma_destroy_qp(queue->cm_id); out_destroy_ib_cq: ib_free_cq(queue->ib_cq); out_put_dev: -- cgit v1.2.3 From 18c53e40487f56369c3ba9331ec3597d9b48d97c Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Tue, 7 Nov 2017 21:10:22 +0900 Subject: nvmet: fix comment typos in admin-cmd.c small typos fixed in admin-cmd.c Signed-off-by: Minwoo Im Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c4a0bf36e7521..9a7d6917e190c 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -300,7 +300,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) } /* - * nuse = ncap = nsze isn't aways true, but we have no way to find + * nuse = ncap = nsze isn't always true, but we have no way to find * that out from the underlying device. */ id->ncap = id->nuse = id->nsze = @@ -424,7 +424,7 @@ out: } /* - * A "mimimum viable" abort implementation: the command is mandatory in the + * A "minimum viable" abort implementation: the command is mandatory in the * spec, but we are not required to do any useful work. 
We couldn't really * do a useful abort, so don't bother even with waiting for the command * to be exectuted and return immediately telling the command to abort -- cgit v1.2.3 From e10237cc76ef9a4066a84aa2cc710bfd708cc341 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 7 Nov 2017 11:09:50 -0800 Subject: kthread: zero the kthread data structure kthread() could bail out early before we initialize blkcg_css (if the kthread is killed very early. Please see xchg() statement in kthread()), which confuses free_kthread_struct. Instead of moving the blkcg_css initialization early, we simply zero the whole 'self' data structure, which doesn't sound much overhead. Reported-by: syzbot Fixes: 05e3db95ebfc ("kthread: add a mechanism to store cgroup info") Cc: Andrew Morton Cc: Ingo Molnar Cc: Dmitry Vyukov Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/kthread.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index f87cd8b4eb2a1..8dbe2454cb1de 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -204,7 +204,7 @@ static int kthread(void *_create) struct kthread *self; int ret; - self = kmalloc(sizeof(*self), GFP_KERNEL); + self = kzalloc(sizeof(*self), GFP_KERNEL); set_kthread_struct(self); /* If user was SIGKILLed, I release the structure. */ @@ -220,13 +220,9 @@ static int kthread(void *_create) do_exit(-ENOMEM); } - self->flags = 0; self->data = data; init_completion(&self->exited); init_completion(&self->parked); -#ifdef CONFIG_BLK_CGROUP - self->blkcg_css = NULL; -#endif current->vfork_done = &self->exited; /* OK, tell user we're spawned, wait for stop or wakeup */ -- cgit v1.2.3 From 05b79413946d8b2b58999ea1ae844b6fc3c54f61 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Nov 2017 10:38:29 -0700 Subject: Revert "blk-mq: don't handle TAG_SHARED in restart" This reverts commit 358a3a6bccb74da9d63a26b2dd5f09f1e9970e0b. We have cases that aren't covered 100% in the drivers, so for now we have to retain the shared tag restart loops. Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 01a43fed6b8ce..6f4bdb8209f7d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,17 +68,25 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return; + return false; - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_dec(&q->shared_hctx_restart); + } else + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); - return; + return true; } + + return false; } /* @@ -362,6 +370,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return false; } +/** + * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list + * @pos: loop cursor. + * @skip: the list element that will not be examined. Iteration starts at + * @skip->next. + * @head: head of the list to examine. This list must have at least one + * element, namely @skip. 
+ * @member: name of the list_head structure within typeof(*pos). + */ +#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ + for ((pos) = (skip); \ + (pos = (pos)->member.next != (head) ? list_entry_rcu( \ + (pos)->member.next, typeof(*pos), member) : \ + list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ + (pos) != (skip); ) + +/* + * Called after a driver tag has been freed to check whether a hctx needs to + * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware + * queues in a round-robin fashion if the tag set of @hctx is shared with other + * hardware queues. + */ +void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) +{ + struct blk_mq_tags *const tags = hctx->tags; + struct blk_mq_tag_set *const set = hctx->queue->tag_set; + struct request_queue *const queue = hctx->queue, *q; + struct blk_mq_hw_ctx *hctx2; + unsigned int i, j; + + if (set->flags & BLK_MQ_F_TAG_SHARED) { + /* + * If this is 0, then we know that no hardware queues + * have RESTART marked. We're done. + */ + if (!atomic_read(&queue->shared_hctx_restart)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu_rr(q, queue, &set->tag_list, + tag_set_list) { + queue_for_each_hw_ctx(q, hctx2, i) + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + goto done; + } + j = hctx->queue_num + 1; + for (i = 0; i < queue->nr_hw_queues; i++, j++) { + if (j == queue->nr_hw_queues) + j = 0; + hctx2 = queue->queue_hw_ctx[j]; + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + break; + } +done: + rcu_read_unlock(); + } else { + blk_mq_sched_restart_hctx(hctx); + } +} + void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { -- cgit v1.2.3 From 83f5f7ed72f316eda4c4c3833beebfe6578926f4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Nov 2017 11:15:37 -0700 Subject: block: kill bio_kmap/kunmap_irq() There are no users of it anymore. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 4 ++-- include/linux/bio.h | 11 ----------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 9490f2845f069..01c0a03407cc1 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -216,7 +216,7 @@ may need to abort DMA operations and revert to PIO for the transfer, in which case a virtual mapping of the page is required. For SCSI it is also done in some scenarios where the low level driver cannot be trusted to handle a single sg entry correctly. The driver is expected to perform the -kmaps as needed on such occasions using the __bio_kmap_atomic and bio_kmap_irq +kmaps as needed on such occasions using the bio_kmap_irq and friends routines as appropriate. A driver could also use the blk_queue_bounce() routine on its own to bounce highmem i/o to low memory for specific requests if so desired. @@ -1137,7 +1137,7 @@ use dma_map_sg for scatter gather) to be able to ship it to the driver. For PIO drivers (or drivers that need to revert to PIO transfer once in a while (IDE for example)), where the CPU is doing the actual data transfer a virtual mapping is needed. If the driver supports highmem I/O, -(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic and bio_kmap_irq to +(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic or similar to temporarily map a bio into the virtual address space. 
diff --git a/include/linux/bio.h b/include/linux/bio.h index 9c75f58f6a501..1d7e63d7505f6 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -573,17 +573,6 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) } #endif -static inline char *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter, - unsigned long *flags) -{ - return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags); -} -#define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) - -#define bio_kmap_irq(bio, flags) \ - __bio_kmap_irq((bio), (bio)->bi_iter, (flags)) -#define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * -- cgit v1.2.3 From d004a5e7d4dd6335ce6e2044af42f5e0fbebb51d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Nov 2017 19:13:48 +0100 Subject: block: remove __bio_kmap_atomic This helper doesn't buy us much over calling kmap_atomic directly. In fact in the only caller it does a bit of useless work as the caller already has the bvec at hand, and said caller would even buggy for a multi-segment bio due to the use of this helper. So just remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 11 +++++------ arch/xtensa/platforms/iss/simdisk.c | 4 ++-- block/blk-settings.c | 2 +- include/linux/bio.h | 12 ------------ 4 files changed, 8 insertions(+), 21 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 01c0a03407cc1..86927029a52db 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -216,10 +216,9 @@ may need to abort DMA operations and revert to PIO for the transfer, in which case a virtual mapping of the page is required. For SCSI it is also done in some scenarios where the low level driver cannot be trusted to handle a single sg entry correctly. The driver is expected to perform the -kmaps as needed on such occasions using the bio_kmap_irq and friends -routines as appropriate. A driver could also use the blk_queue_bounce() -routine on its own to bounce highmem i/o to low memory for specific requests -if so desired. +kmaps as needed on such occasions as appropriate. A driver could also use +the blk_queue_bounce() routine on its own to bounce highmem i/o to low +memory for specific requests if so desired. iii. The i/o scheduler algorithm itself can be replaced/set as appropriate @@ -1137,8 +1136,8 @@ use dma_map_sg for scatter gather) to be able to ship it to the driver. For PIO drivers (or drivers that need to revert to PIO transfer once in a while (IDE for example)), where the CPU is doing the actual data transfer a virtual mapping is needed. If the driver supports highmem I/O, -(Sec 1.1, (ii) ) it needs to use __bio_kmap_atomic or similar to -temporarily map a bio into the virtual address space. +(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map +a bio into the virtual address space. 8. 
Prior/Related/Impacted patches diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index c45b90bb93393..eacf1e433518b 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -110,13 +110,13 @@ static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio) sector_t sector = bio->bi_iter.bi_sector; bio_for_each_segment(bvec, bio, iter) { - char *buffer = __bio_kmap_atomic(bio, iter); + char *buffer = kmap_atomic(bvec.bv_page) + bvec.bv_offset; unsigned len = bvec.bv_len >> SECTOR_SHIFT; simdisk_transfer(dev, sector, len, buffer, bio_data_dir(bio) == WRITE); sector += len; - __bio_kunmap_atomic(buffer); + kunmap_atomic(buffer) } bio_endio(bio); diff --git a/block/blk-settings.c b/block/blk-settings.c index 8559e9563c525..48ebe6be07b75 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(blk_set_stacking_limits); * Caveat: * The driver that does this *must* be able to deal appropriately * with buffers in "highmemory". This can be accomplished by either calling - * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling + * kmap_atomic() to get a temporary kernel mapping, or by calling * blk_queue_bounce() to create a buffer in normal memory. **/ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) diff --git a/include/linux/bio.h b/include/linux/bio.h index 1d7e63d7505f6..d4eec19a6d3c3 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -128,18 +128,6 @@ static inline void *bio_data(struct bio *bio) */ #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) -/* - * queues that have highmem support enabled may still need to revert to - * PIO transfers occasionally and thus map high pages temporarily. For - * permanent PIO fall back, user is probably better off disabling highmem - * I/O completely on that queue (see ide-dma for example) - */ -#define __bio_kmap_atomic(bio, iter) \ - (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) + \ - bio_iter_iovec((bio), (iter)).bv_offset) - -#define __bio_kunmap_atomic(addr) kunmap_atomic(addr) - /* * merge helpers etc */ -- cgit v1.2.3 From f00c4d80ffdac9e3a64947ebd57489f3232d5a74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 5 Nov 2017 10:36:31 +0300 Subject: block: pass full fmode_t to blk_verify_command Use the obvious calling convention. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bsg.c | 18 ++++++++---------- block/scsi_ioctl.c | 8 ++++---- drivers/scsi/sg.c | 2 +- include/linux/blkdev.h | 2 +- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/block/bsg.c b/block/bsg.c index ee1335c68de7f..452f94f1c5d42 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -137,7 +137,7 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, struct sg_io_v4 *hdr, struct bsg_device *bd, - fmode_t has_write_perm) + fmode_t mode) { struct scsi_request *req = scsi_req(rq); @@ -152,7 +152,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(req->cmd, has_write_perm)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -206,7 +206,7 @@ bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op) * map sg_io_v4 to a request. 
*/ static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) +bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) { struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; @@ -237,7 +237,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) if (IS_ERR(rq)) return rq; - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode); if (ret) goto out; @@ -587,8 +587,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) } static int __bsg_write(struct bsg_device *bd, const char __user *buf, - size_t count, ssize_t *bytes_written, - fmode_t has_write_perm) + size_t count, ssize_t *bytes_written, fmode_t mode) { struct bsg_command *bc; struct request *rq; @@ -619,7 +618,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); + rq = bsg_map_hdr(bd, &bc->hdr, mode); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -655,8 +654,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) bsg_set_block(bd, file); bytes_written = 0; - ret = __bsg_write(bd, buf, count, &bytes_written, - file->f_mode & FMODE_WRITE); + ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode); *ppos = bytes_written; @@ -915,7 +913,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); + rq = bsg_map_hdr(bd, &hdr, file->f_mode); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 7440de44dd857..edcfff9745277 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -207,7 +207,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); } -int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) +int blk_verify_command(unsigned char *cmd, fmode_t mode) { struct blk_cmd_filter *filter = &blk_default_cmd_filter; @@ -220,7 +220,7 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) return 0; /* Write-safe commands require a writable open */ - if (test_bit(cmd[0], filter->write_ok) && has_write_perm) + if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE)) return 0; return -EPERM; @@ -234,7 +234,7 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(req->cmd, mode & FMODE_WRITE)) + if (blk_verify_command(req->cmd, mode)) return -EPERM; /* @@ -469,7 +469,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(req->cmd, mode & FMODE_WRITE); + err = blk_verify_command(req->cmd, mode); if (err) goto error; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0419c2298eabd..92fd870e13156 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -217,7 +217,7 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd) if (sfp->parentdp->device->type == TYPE_SCANNER) return 0; - return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE); + return blk_verify_command(cmd, filp->f_mode); } static int diff --git 
a/include/linux/blkdev.h b/include/linux/blkdev.h index 225617dd0a3f4..1437ef4d80379 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1360,7 +1360,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, gfp_mask, 0); } -extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); +extern int blk_verify_command(unsigned char *cmd, fmode_t mode); enum blk_default_limits { BLK_MAX_SEGMENTS = 128, -- cgit v1.2.3 From 0c6af1ccd5fd9ac640aef01c8de0043837451a04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 8 Nov 2017 09:11:22 +0800 Subject: blk-mq: put driver tag if dispatch budget can't be got We have to put the driver tag if dispatch budget can't be got, otherwise it might cause IO deadlock, especially in case that size of tags is very small. Fixes: de1482974080(blk-mq: introduce .get_budget and .put_budget in blk_mq_ops) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c501cbd0de93f..3d759bb8a5bb5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1080,8 +1080,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } } - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) + if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { + blk_mq_put_driver_tag(rq); break; + } list_del_init(&rq->queuelist); -- cgit v1.2.3 From a47619b5c6ead6c2e10b8114a5e9291a1679a7c3 Mon Sep 17 00:00:00 2001 From: Javier González Date: Wed, 8 Nov 2017 10:59:03 +0100 Subject: nvme: compare NQN string with right size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copy subnqns using NVMF_NQN_SIZE as it is < 256 Signed-off-by: Javier González Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 65fd2fc1ae3c2..8a60cc786220b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1810,7 +1810,7 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { - strcpy(ctrl->subnqn, id->subnqn); + strncpy(ctrl->subnqn, id->subnqn, NVMF_NQN_SIZE); return; } -- cgit v1.2.3 From ab083b11f6f43af071f9b507e48d212bfbb2beb3 Mon Sep 17 00:00:00 2001 From: Javier González Date: Wed, 8 Nov 2017 10:59:04 +0100 Subject: nvme: fix eui_show() print format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix print formatting, but keep the original output to prevent user breakage as suggested by Joe Perches. 
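[Editorial aside, not part of the patch: the "%*ph" printk extension used here dumps a small buffer as space-separated hex bytes (see Documentation/printk-formats.txt); the stray 'd' suffix in the old format string is not one of the documented %ph modifiers, and as the message above notes the visible output does not change. A purely illustrative, kernel-style open-coded equivalent of "%8ph" for an 8-byte EUI-64 -- the function name is hypothetical:]

#include <linux/kernel.h>
#include <linux/types.h>

/* Roughly what sprintf(buf, "%8ph\n", eui) produces: "xx xx ... xx\n". */
static ssize_t example_format_eui(char *buf, const u8 eui[8])
{
	ssize_t len = 0;
	int i;

	for (i = 0; i < 8; i++)
		len += sprintf(buf + len, "%02x%c", eui[i],
			       i == 7 ? '\n' : ' ');
	return len;
}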
Signed-off-by: Javier González Reviewed-by: Sagi Grimberg Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8a60cc786220b..6c79b73fe2c39 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2190,7 +2190,7 @@ static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8phd\n", ns->eui); + return sprintf(buf, "%8ph\n", ns->eui); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); -- cgit v1.2.3 From aba7afc5671c23beade64d10caf86e24a9105dab Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 8 Nov 2017 10:23:45 -0800 Subject: blk-mq: Avoid that request queue removal can trigger list corruption Avoid that removal of a request queue sporadically triggers the following warning: list_del corruption. next->prev should be ffff8807d649b970, but was 6b6b6b6b6b6b6b6b WARNING: CPU: 3 PID: 342 at lib/list_debug.c:56 __list_del_entry_valid+0x92/0xa0 Call Trace: process_one_work+0x11b/0x660 worker_thread+0x3d/0x3b0 kthread+0x129/0x140 ret_from_fork+0x27/0x40 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-core.c b/block/blk-core.c index b8d1aa2d1008f..5e81dcf4690a6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -339,6 +339,7 @@ void blk_sync_queue(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; + cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } else { -- cgit v1.2.3 From dafc040ba6d813df225a25f8b4bc3eae2207a6be Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 8 Nov 2017 12:00:29 +0200 Subject: nvmet: remove redundant memset if failed to get_smart_log failed We already allocated the buffer with kzalloc. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 9a7d6917e190c..dd087be395b9e 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -144,10 +144,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) } smart_log = buf; status = nvmet_get_smart_log(req, smart_log); - if (status) { - memset(buf, '\0', data_len); + if (status) goto err; - } break; case NVME_LOG_FW_SLOT: /* -- cgit v1.2.3 From 4185f25acbc2a191341c7d8cf51e449698eceb06 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 8 Nov 2017 12:00:30 +0200 Subject: nvmet: remove redundant local variable the status is either success or some status id and we don't need a local variable for it. 
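[Editorial aside: the shape of this cleanup, abstracted away from nvmet -- all names below are hypothetical. Once the out: label performs no cleanup, the status variable and the goto collapse into direct returns:]

static int example_lookup_and_use(int id)
{
	struct example_obj *o = example_lookup(id);	/* hypothetical, may return NULL */

	if (!o)
		return -ENOENT;		/* was: status = -ENOENT; goto out; */

	example_use(o);			/* hypothetical work on the object */
	example_put(o);
	return 0;			/* was: fall through to "out: return status;" */
}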
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index dd087be395b9e..90dcdc40ac71a 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -35,17 +35,14 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd) static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, struct nvme_smart_log *slog) { - u16 status; struct nvmet_ns *ns; u64 host_reads, host_writes, data_units_read, data_units_written; - status = NVME_SC_SUCCESS; ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); if (!ns) { - status = NVME_SC_INVALID_NS; pr_err("nvmet : Could not find namespace id : %d\n", le32_to_cpu(req->cmd->get_log_page.nsid)); - goto out; + return NVME_SC_INVALID_NS; } host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); @@ -58,20 +55,18 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, put_unaligned_le64(host_writes, &slog->host_writes[0]); put_unaligned_le64(data_units_written, &slog->data_units_written[0]); nvmet_put_namespace(ns); -out: - return status; + + return NVME_SC_SUCCESS; } static u16 nvmet_get_smart_log_all(struct nvmet_req *req, struct nvme_smart_log *slog) { - u16 status; u64 host_reads = 0, host_writes = 0; u64 data_units_read = 0, data_units_written = 0; struct nvmet_ns *ns; struct nvmet_ctrl *ctrl; - status = NVME_SC_SUCCESS; ctrl = req->sq->ctrl; rcu_read_lock(); @@ -91,7 +86,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, put_unaligned_le64(host_writes, &slog->host_writes[0]); put_unaligned_le64(data_units_written, &slog->data_units_written[0]); - return status; + return NVME_SC_SUCCESS; } static u16 nvmet_get_smart_log(struct nvmet_req *req, -- cgit v1.2.3 From 38dabe210fbab4e7e8a03670ab3ba42f247ea08f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:10 -0700 Subject: nvme: centralize AEN defines All the transports were unnecessarilly duplicating the AEN request accounting. This patch defines everything in one place. Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 33 ++++++++++++--------------------- drivers/nvme/host/nvme.h | 1 - drivers/nvme/host/pci.c | 16 +++------------- drivers/nvme/host/rdma.c | 14 +++----------- drivers/nvme/target/loop.c | 14 +++----------- include/linux/nvme.h | 8 ++++++++ 7 files changed, 30 insertions(+), 58 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6c79b73fe2c39..315087dfcd8de 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2779,7 +2779,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_queue_async_events(struct nvme_ctrl *ctrl) { - ctrl->event_limit = NVME_NR_AERS; + ctrl->event_limit = NVME_NR_AEN_COMMANDS; queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_queue_async_events); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index da9296a1eb269..f3bfad47a95fb 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -30,15 +30,6 @@ /* *************************** Data Structures/Defines ****************** */ -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. 
- */ -#define NVME_FC_NR_AEN_COMMANDS 1 -#define NVME_FC_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) -#define AEN_CMDID_BASE (NVME_FC_AQ_BLKMQ_DEPTH + 1) - enum nvme_fc_queue_flags { NVME_FC_Q_CONNECTED = (1 << 0), }; @@ -170,7 +161,7 @@ struct nvme_fc_ctrl { u32 iocnt; wait_queue_head_t ioabort_wait; - struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS]; + struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS]; struct nvme_ctrl ctrl; }; @@ -1546,7 +1537,7 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) unsigned long flags; int i, ret; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE) continue; @@ -1816,7 +1807,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) int i, ret; aen_op = ctrl->aen_ops; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, GFP_KERNEL); if (!private) @@ -1826,7 +1817,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) sqe = &cmdiu->sqe; ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0], aen_op, (struct request *)NULL, - (AEN_CMDID_BASE + i)); + (NVME_AQ_BLK_MQ_DEPTH + i)); if (ret) { kfree(private); return ret; @@ -1839,7 +1830,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) memset(sqe, 0, sizeof(*sqe)); sqe->common.opcode = nvme_admin_async_event; /* Note: core layer may overwrite the sqe.command_id value */ - sqe->common.command_id = AEN_CMDID_BASE + i; + sqe->common.command_id = NVME_AQ_BLK_MQ_DEPTH + i; } return 0; } @@ -1851,7 +1842,7 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl) int i; aen_op = ctrl->aen_ops; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { if (!aen_op->fcp_req.private) continue; @@ -2402,7 +2393,7 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) bool terminating = false; blk_status_t ret; - if (aer_idx > NVME_FC_NR_AEN_COMMANDS) + if (aer_idx > NVME_NR_AEN_COMMANDS) return; spin_lock_irqsave(&ctrl->lock, flags); @@ -2722,16 +2713,16 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the admin queue */ - nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH); + nvme_fc_init_queue(ctrl, 0, NVME_AQ_BLK_MQ_DEPTH); ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, - NVME_FC_AQ_BLKMQ_DEPTH); + NVME_AQ_BLK_MQ_DEPTH); if (ret) goto out_free_queue; ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0], - NVME_FC_AQ_BLKMQ_DEPTH, - (NVME_FC_AQ_BLKMQ_DEPTH / 4)); + NVME_AQ_BLK_MQ_DEPTH, + (NVME_AQ_BLK_MQ_DEPTH / 4)); if (ret) goto out_delete_hw_queue; @@ -3145,7 +3136,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) + diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f7c21cea94853..a6d750cfa6b2a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -313,7 +313,6 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send); 
-#define NVME_NR_AERS 1 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res); void nvme_queue_async_events(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 32c413ec818c9..c3dfd84feef72 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -35,12 +35,6 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) - #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) static int use_threaded_interrupts; @@ -956,7 +950,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, * for them but rather special case them here. */ if (unlikely(nvmeq->qid == 0 && - cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) { + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { nvme_complete_async_event(&nvmeq->dev->ctrl, cqe->status, &cqe->result); return; @@ -1057,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx; + c.common.command_id = NVME_AQ_BLK_MQ_DEPTH + aer_idx; spin_lock_irq(&nvmeq->q_lock); __nvme_submit_cmd(nvmeq, &c); @@ -1524,11 +1518,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; - /* - * Subtract one to leave an empty queue entry for 'Full Queue' - * condition. See NVM-Express 1.2 specification, section 4.1.2. - */ - dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; + dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 32e21ab1ae520..008791bbdfb32 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -41,14 +41,6 @@ #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_RDMA_NR_AEN_COMMANDS 1 -#define NVME_RDMA_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) - struct nvme_rdma_device { struct ib_device *dev; struct ib_pd *pd; @@ -690,7 +682,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set = &ctrl->admin_tag_set; memset(set, 0, sizeof(*set)); set->ops = &nvme_rdma_admin_mq_ops; - set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; set->reserved_tags = 2; /* connect + keep-alive */ set->numa_node = NUMA_NO_NODE; set->cmd_size = sizeof(struct nvme_rdma_request) + @@ -1318,7 +1310,7 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) memset(cmd, 0, sizeof(*cmd)); cmd->common.opcode = nvme_admin_async_event; - cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; cmd->common.flags |= NVME_CMD_SGL_METABUF; nvme_rdma_set_sg_null(cmd); @@ -1380,7 +1372,7 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) * for them but rather special case them here. 
*/ if (unlikely(nvme_rdma_queue_idx(queue) == 0 && - cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH)) + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index bc95c6ed531af..7258b796f209c 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -23,14 +23,6 @@ #define NVME_LOOP_MAX_SEGMENTS 256 -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_LOOP_NR_AEN_COMMANDS 1 -#define NVME_LOOP_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) - struct nvme_loop_iod { struct nvme_request nvme_req; struct nvme_command cmd; @@ -112,7 +104,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req) * for them but rather special case them here. */ if (unlikely(nvme_loop_queue_idx(queue) == 0 && - cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); } else { @@ -200,7 +192,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) memset(&iod->cmd, 0, sizeof(iod->cmd)); iod->cmd.common.opcode = nvme_admin_async_event; - iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH; + iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH; iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, @@ -356,7 +348,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fd1d4508a6129..89ffa7eed2fd9 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -90,6 +90,14 @@ enum { }; #define NVME_AQ_DEPTH 32 +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) + +/* + * Subtract one to leave an empty queue entry for 'Full Queue' condition. See + * NVM-Express 1.2 specification, section 4.1.2. + */ +#define NVME_AQ_MQ_TAG_DEPTH (NVME_AQ_BLK_MQ_DEPTH - 1) enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ -- cgit v1.2.3 From 08e1507544839b98fc3732aea935e70ed9c209ec Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:11 -0700 Subject: nvme-fc: remove unused "queue_size" field This was being saved in a structure, but never used anywhere. The queue size is obtained through other means, so there's no reason to duplicate this without a user for it. 
Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Reviewed-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f3bfad47a95fb..3aa5950291925 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -43,7 +43,6 @@ struct nvme_fc_queue { struct device *dev; struct blk_mq_hw_ctx *hctx; void *lldd_handle; - int queue_size; size_t cmnd_capsule_len; u32 qnum; u32 rqcnt; @@ -1886,7 +1885,7 @@ nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, } static void -nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size) +nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx) { struct nvme_fc_queue *queue; @@ -1902,8 +1901,6 @@ nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size) else queue->cmnd_capsule_len = sizeof(struct nvme_command); - queue->queue_size = queue_size; - /* * Considered whether we should allocate buffers for all SQEs * and CQEs and dma map them - mapping their respective entries @@ -2027,7 +2024,7 @@ nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl) int i; for (i = 1; i < ctrl->ctrl.queue_count; i++) - nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize); + nvme_fc_init_queue(ctrl, i); } static void @@ -2713,7 +2710,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the admin queue */ - nvme_fc_init_queue(ctrl, 0, NVME_AQ_BLK_MQ_DEPTH); + nvme_fc_init_queue(ctrl, 0); ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, NVME_AQ_BLK_MQ_DEPTH); -- cgit v1.2.3 From ad22c355b707a8d8d48e282aadc01c0b0604b2e9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:12 -0700 Subject: nvme: remove handling of multiple AEN requests The driver can handle tracking only one AEN request, so this patch removes handling for multiple ones. 
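[Editorial aside: how the defines centralized in the earlier "nvme: centralize AEN defines" patch fit together, restated as a standalone sketch. Values assume NVME_AQ_DEPTH == 32 as in include/linux/nvme.h; the helper name is hypothetical.]

#include <linux/types.h>

/*
 *   NVME_AQ_BLK_MQ_DEPTH = 32 - 1 = 31   admin slots left after reserving
 *                                        the single AEN command
 *   NVME_AQ_MQ_TAG_DEPTH = 31 - 1 = 30   blk-mq tag depth; one entry stays
 *                                        empty for the Full Queue condition
 *                                        (NVMe 1.2, section 4.1.2)
 *
 * blk-mq therefore hands out admin command IDs 0..29, the driver-managed
 * AEN uses command_id == NVME_AQ_BLK_MQ_DEPTH, and admin completions are
 * recognised as AEN completions with a check of this shape:
 */
static bool example_is_aen_completion(u16 qid, u16 command_id)
{
	return qid == 0 && command_id >= NVME_AQ_BLK_MQ_DEPTH;
}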
Reviewed-by: Christoph Hellwig Reviewed-by: James Smart Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 28 +++------------------------- drivers/nvme/host/fc.c | 9 +++------ drivers/nvme/host/nvme.h | 3 +-- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/rdma.c | 5 +---- drivers/nvme/target/loop.c | 2 +- 6 files changed, 11 insertions(+), 40 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 315087dfcd8de..dedbf12847b6f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2670,15 +2670,7 @@ static void nvme_async_event_work(struct work_struct *work) struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, async_event_work); - spin_lock_irq(&ctrl->lock); - while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) { - int aer_idx = --ctrl->event_limit; - - spin_unlock_irq(&ctrl->lock); - ctrl->ops->submit_async_event(ctrl, aer_idx); - spin_lock_irq(&ctrl->lock); - } - spin_unlock_irq(&ctrl->lock); + ctrl->ops->submit_async_event(ctrl); } static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) @@ -2745,22 +2737,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res) { u32 result = le32_to_cpu(res->u32); - bool done = true; - switch (le16_to_cpu(status) >> 1) { - case NVME_SC_SUCCESS: - done = false; - /*FALLTHRU*/ - case NVME_SC_ABORT_REQ: - ++ctrl->event_limit; - if (ctrl->state == NVME_CTRL_LIVE) - queue_work(nvme_wq, &ctrl->async_event_work); - break; - default: - break; - } - - if (done) + if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) return; switch (result & 0xff07) { @@ -2774,12 +2752,12 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, default: dev_warn(ctrl->device, "async event result %08x\n", result); } + queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_queue_async_events(struct nvme_ctrl *ctrl) { - ctrl->event_limit = NVME_NR_AEN_COMMANDS; queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_queue_async_events); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 3aa5950291925..6eb460b117d6f 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2382,7 +2382,7 @@ nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) } static void -nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +nvme_fc_submit_async_event(struct nvme_ctrl *arg) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg); struct nvme_fc_fcp_op *aen_op; @@ -2390,9 +2390,6 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) bool terminating = false; blk_status_t ret; - if (aer_idx > NVME_NR_AEN_COMMANDS) - return; - spin_lock_irqsave(&ctrl->lock, flags); if (ctrl->flags & FCCTRL_TERMIO) terminating = true; @@ -2401,13 +2398,13 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) if (terminating) return; - aen_op = &ctrl->aen_ops[aer_idx]; + aen_op = &ctrl->aen_ops[0]; ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0, NVMEFC_FCP_NODATA); if (ret) dev_err(ctrl->ctrl.device, - "failed async event work [%d]\n", aer_idx); + "failed async event work\n"); } static void diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a6d750cfa6b2a..b55c97ecea31a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -162,7 +162,6 @@ struct nvme_ctrl { u16 nssa; u16 nr_streams; atomic_t abort_limit; - u8 event_limit; u8 vwc; u32 vs; u32 sgls; @@ -237,7 +236,7 @@ 
struct nvme_ctrl_ops { int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); void (*free_ctrl)(struct nvme_ctrl *ctrl); - void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); + void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); int (*reinit_request)(void *data, struct request *rq); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c3dfd84feef72..429d56f1a19e8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1043,7 +1043,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return __nvme_poll(nvmeq, tag); } -static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) +static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); struct nvme_queue *nvmeq = dev->queues[0]; @@ -1051,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = NVME_AQ_BLK_MQ_DEPTH + aer_idx; + c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; spin_lock_irq(&nvmeq->q_lock); __nvme_submit_cmd(nvmeq, &c); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 008791bbdfb32..c8d854474a5b6 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1293,7 +1293,7 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) return queue->ctrl->tag_set.tags[queue_idx - 1]; } -static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); struct nvme_rdma_queue *queue = &ctrl->queues[0]; @@ -1303,9 +1303,6 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) struct ib_sge sge; int ret; - if (WARN_ON_ONCE(aer_idx != 0)) - return; - ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); memset(cmd, 0, sizeof(*cmd)); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 7258b796f209c..f40e70eb4a380 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -184,7 +184,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +static void nvme_loop_submit_async_event(struct nvme_ctrl *arg) { struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); struct nvme_loop_queue *queue = &ctrl->queues[0]; -- cgit v1.2.3 From d99ca609a1b55f87a5e62a11ed70e4d091d815f0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:13 -0700 Subject: nvme: unexport starting async event work Async event work is for core use only and should not be called directly from drivers. 
Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 8 +------- drivers/nvme/host/nvme.h | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dedbf12847b6f..c135d0aeebd78 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2756,12 +2756,6 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, } EXPORT_SYMBOL_GPL(nvme_complete_async_event); -void nvme_queue_async_events(struct nvme_ctrl *ctrl) -{ - queue_work(nvme_wq, &ctrl->async_event_work); -} -EXPORT_SYMBOL_GPL(nvme_queue_async_events); - void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_stop_keep_alive(ctrl); @@ -2778,7 +2772,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_queue_async_events(ctrl); + queue_work(nvme_wq, &ctrl->async_event_work); nvme_start_queues(ctrl); } } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b55c97ecea31a..151062573ece1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -314,7 +314,6 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res); -void nvme_queue_async_events(struct nvme_ctrl *ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); -- cgit v1.2.3 From e3d7874dcf175cca2dca7795d6453f637ad8ba9b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:14 -0700 Subject: nvme: send uevent for some asynchronous events This will give udev a chance to observe and handle asynchronous event notifications and clear the log to unmask future events of the same type. 
The driver will create a change uevent of the asyncronuos event result before submitting the next AEN request to the device if a completed AEN event is of type error, smart, command set or vendor specific, Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 28 ++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 4 ++++ 3 files changed, 33 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c135d0aeebd78..683d890d73fa3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2665,11 +2665,28 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); +static void nvme_aen_uevent(struct nvme_ctrl *ctrl) +{ + char *envp[2] = { NULL, NULL }; + u32 aen_result = ctrl->aen_result; + + ctrl->aen_result = 0; + if (!aen_result) + return; + + envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result); + if (!envp[0]) + return; + kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); + kfree(envp[0]); +} + static void nvme_async_event_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, async_event_work); + nvme_aen_uevent(ctrl); ctrl->ops->submit_async_event(ctrl); } @@ -2741,6 +2758,17 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) return; + switch (result & 0x7) { + case NVME_AER_ERROR: + case NVME_AER_SMART: + case NVME_AER_CSS: + case NVME_AER_VS: + ctrl->aen_result = result; + break; + default: + break; + } + switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: dev_info(ctrl->device, "rescanning\n"); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 151062573ece1..7b9cc7d616b75 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -168,6 +168,7 @@ struct nvme_ctrl { u16 kas; u8 npss; u8 apsta; + u32 aen_result; unsigned int shutdown_timeout; unsigned int kato; bool subsystem; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 89ffa7eed2fd9..aea87f0d917b3 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -428,6 +428,10 @@ enum { }; enum { + NVME_AER_ERROR = 0, + NVME_AER_SMART = 1, + NVME_AER_CSS = 6, + NVME_AER_VS = 7, NVME_AER_NOTICE_NS_CHANGED = 0x0002, NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, }; -- cgit v1.2.3 From 03e0f3a65e4da497c3b7b213c68943cbc73a2e34 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Nov 2017 19:32:07 +0800 Subject: nvme-pci: avoid dereference of symbol from unloaded module The 'remove_work' may be scheduled to run after nvme_remove() returns since we can't simply cancel it in nvme_remove() for avoiding deadlock. Once nvme_remove() returns, this module(nvme) can be unloaded. On the other hand, nvme_put_ctrl() calls ctr->ops->free_ctrl which may point to nvme_pci_free_ctrl() in unloaded module. This patch avoids this issue by queuing 'remove_work' via 'nvme_wq', and flush this worqueue in nvme_exit() as suggested by Sagi. 
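[Editorial aside: a generic sketch of the pattern this fix relies on, not the nvme code itself -- work that can outlive the function which scheduled it goes on a workqueue owned by the module, and module exit flushes that workqueue before any module text can go away. All names here are hypothetical stand-ins (example_wq plays the role of nvme_wq).]

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static void example_fn(struct work_struct *w)
{
	/* may still be pending after the code that queued it has returned */
}
static DECLARE_WORK(example_work, example_fn);
/* elsewhere: queue_work(example_wq, &example_work); */

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example_wq", 0, 0);
	return example_wq ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	/* guarantees example_fn is neither running nor pending on return */
	flush_workqueue(example_wq);
	destroy_workqueue(example_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");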
Suggested-by: Sagi Grimberg Signed-off-by: Ming Lei Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 429d56f1a19e8..762b8402e04c5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2280,7 +2280,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); - if (!schedule_work(&dev->remove_work)) + if (!queue_work(nvme_wq, &dev->remove_work)) nvme_put_ctrl(&dev->ctrl); } @@ -2703,6 +2703,7 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); + flush_workqueue(nvme_wq); _nvme_check_size(); } -- cgit v1.2.3 From 5e62d5c993e6889cd314d5b5de6b670152109a0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 14:29:58 +0100 Subject: nvmet: better data length validation Currently the NVMe target stores the expexted data length in req->data_len and uses that for data transfer decisions, but that does not take the actual transfer length in the SGLs into account. So this adds a new transfer_len field, into which the transport drivers store the actual transfer length. We then check the two match before actually executing the command. The FC transport driver already had such a field, which is removed in favour of the common one. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 10 ++++++++++ drivers/nvme/target/fc.c | 32 ++++++++++++-------------------- drivers/nvme/target/loop.c | 3 ++- drivers/nvme/target/nvmet.h | 4 ++++ drivers/nvme/target/rdma.c | 10 ++++++---- 5 files changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index da088293eafc1..22a2a2bb40f9e 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -501,6 +501,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->ops = ops; req->sg = NULL; req->sg_cnt = 0; + req->transfer_len = 0; req->rsp->status = 0; /* no support for fused commands yet */ @@ -550,6 +551,15 @@ void nvmet_req_uninit(struct nvmet_req *req) } EXPORT_SYMBOL_GPL(nvmet_req_uninit); +void nvmet_req_execute(struct nvmet_req *req) +{ + if (unlikely(req->data_len != req->transfer_len)) + nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); + else + req->execute(req); +} +EXPORT_SYMBOL_GPL(nvmet_req_execute); + static inline bool nvmet_cc_en(u32 cc) { return (cc >> NVME_CC_EN_SHIFT) & 0x1; diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 5f2570f5dfa90..739b8feadc7de 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -76,7 +76,6 @@ struct nvmet_fc_fcp_iod { dma_addr_t rspdma; struct scatterlist *data_sg; int data_sg_cnt; - u32 total_length; u32 offset; enum nvmet_fcp_datadir io_dir; bool active; @@ -1700,7 +1699,7 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) u32 page_len, length; int i = 0; - length = fod->total_length; + length = fod->req.transfer_len; nent = DIV_ROUND_UP(length, PAGE_SIZE); sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); if (!sg) @@ -1789,7 +1788,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, u32 rsn, rspcnt, xfr_length; if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP) - xfr_length = 
fod->total_length; + xfr_length = fod->req.transfer_len; else xfr_length = fod->offset; @@ -1815,7 +1814,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, rspcnt = atomic_inc_return(&fod->queue->zrspcnt); if (!(rspcnt % fod->queue->ersp_ratio) || sqe->opcode == nvme_fabrics_command || - xfr_length != fod->total_length || + xfr_length != fod->req.transfer_len || (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] || (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head))) @@ -1892,7 +1891,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, - (fod->total_length - fod->offset)); + (fod->req.transfer_len - fod->offset)); fcpreq->transfer_length = tlen; fcpreq->transferred_length = 0; fcpreq->fcp_error = 0; @@ -1906,7 +1905,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, * combined xfr with response. */ if ((op == NVMET_FCOP_READDATA) && - ((fod->offset + fcpreq->transfer_length) == fod->total_length) && + ((fod->offset + fcpreq->transfer_length) == fod->req.transfer_len) && (tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) { fcpreq->op = NVMET_FCOP_READDATA_RSP; nvmet_fc_prep_fcp_rsp(tgtport, fod); @@ -1986,7 +1985,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } fod->offset += fcpreq->transferred_length; - if (fod->offset != fod->total_length) { + if (fod->offset != fod->req.transfer_len) { spin_lock_irqsave(&fod->flock, flags); fod->writedataactive = true; spin_unlock_irqrestore(&fod->flock, flags); @@ -1998,9 +1997,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } /* data transfer complete, resume with nvmet layer */ - - fod->req.execute(&fod->req); - + nvmet_req_execute(&fod->req); break; case NVMET_FCOP_READDATA: @@ -2023,7 +2020,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } fod->offset += fcpreq->transferred_length; - if (fod->offset != fod->total_length) { + if (fod->offset != fod->req.transfer_len) { /* transfer the next chunk */ nvmet_fc_transfer_fcp_data(tgtport, fod, NVMET_FCOP_READDATA); @@ -2160,7 +2157,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done; - fod->total_length = be32_to_cpu(cmdiu->data_len); + fod->req.transfer_len = be32_to_cpu(cmdiu->data_len); if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) { fod->io_dir = NVMET_FCP_WRITE; if (!nvme_is_write(&cmdiu->sqe)) @@ -2171,7 +2168,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, goto transport_error; } else { fod->io_dir = NVMET_FCP_NODATA; - if (fod->total_length) + if (fod->req.transfer_len) goto transport_error; } @@ -2179,9 +2176,6 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->req.rsp = &fod->rspiubuf.cqe; fod->req.port = fod->queue->port; - /* ensure nvmet handlers will set cmd handler callback */ - fod->req.execute = NULL; - /* clear any response payload */ memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); @@ -2201,7 +2195,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, /* keep a running counter of tail position */ atomic_inc(&fod->queue->sqtail); - if (fod->total_length) { + if (fod->req.transfer_len) { ret = nvmet_fc_alloc_tgt_pgs(fod); if (ret) { nvmet_req_complete(&fod->req, ret); @@ -2224,9 +2218,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, * can invoke the nvmet_layer now. 
If read data, cmd completion will * push the data */ - - fod->req.execute(&fod->req); - + nvmet_req_execute(&fod->req); return; transport_error: diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f40e70eb4a380..96d390416789b 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -127,7 +127,7 @@ static void nvme_loop_execute_work(struct work_struct *work) struct nvme_loop_iod *iod = container_of(work, struct nvme_loop_iod, work); - iod->req.execute(&iod->req); + nvmet_req_execute(&iod->req); } static enum blk_eh_timer_return @@ -176,6 +176,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, iod->req.sg = iod->sg_table.sgl; iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + iod->req.transfer_len = blk_rq_bytes(req); } blk_mq_start_request(req); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e342f02845c19..194ebffc688c3 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -223,7 +223,10 @@ struct nvmet_req { struct bio inline_bio; struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; int sg_cnt; + /* data length as parsed from the command: */ size_t data_len; + /* data length as parsed from the SGL descriptor: */ + size_t transfer_len; struct nvmet_port *port; @@ -266,6 +269,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); void nvmet_req_uninit(struct nvmet_req *req); +void nvmet_req_execute(struct nvmet_req *req); void nvmet_req_complete(struct nvmet_req *req, u16 status); void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3333d417b248b..49912909c2986 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -148,14 +148,14 @@ static inline u32 get_unaligned_le24(const u8 *p) static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) { return nvme_is_write(rsp->req.cmd) && - rsp->req.data_len && + rsp->req.transfer_len && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) { return !nvme_is_write(rsp->req.cmd) && - rsp->req.data_len && + rsp->req.transfer_len && !rsp->req.rsp->status && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } @@ -577,7 +577,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) return; } - rsp->req.execute(&rsp->req); + nvmet_req_execute(&rsp->req); } static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, @@ -609,6 +609,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) nvmet_rdma_use_inline_sg(rsp, len, off); rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; + rsp->req.transfer_len += len; return 0; } @@ -636,6 +637,7 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, nvmet_data_dir(&rsp->req)); if (ret < 0) return NVME_SC_INTERNAL; + rsp->req.transfer_len += len; rsp->n_rdma += ret; if (invalidate) { @@ -693,7 +695,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) queue->cm_id->port_num, &rsp->read_cqe, NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); } else { - rsp->req.execute(&rsp->req); + nvmet_req_execute(&rsp->req); } return true; -- cgit v1.2.3 From e454d122e22826589cfa08788a802ab09d4fae24 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 14:29:27 +0100 Subject: nvmet: kill 
nvmet_inline_bio_init Much easier to just opencode this helper. Also use ARRAY_SIZE instead of passing the inline bvec array size manually. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index db632818777dd..0a4372a016f21 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -33,18 +33,11 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req) req->ns->blksize_shift; } -static void nvmet_inline_bio_init(struct nvmet_req *req) -{ - struct bio *bio = &req->inline_bio; - - bio_init(bio, req->inline_bvec, NVMET_MAX_INLINE_BIOVEC); -} - static void nvmet_execute_rw(struct nvmet_req *req) { int sg_cnt = req->sg_cnt; + struct bio *bio = &req->inline_bio; struct scatterlist *sg; - struct bio *bio; sector_t sector; blk_qc_t cookie; int op, op_flags = 0, i; @@ -66,8 +59,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) sector = le64_to_cpu(req->cmd->rw.slba); sector <<= (req->ns->blksize_shift - 9); - nvmet_inline_bio_init(req); - bio = &req->inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; bio->bi_private = req; @@ -99,11 +91,9 @@ static void nvmet_execute_rw(struct nvmet_req *req) static void nvmet_execute_flush(struct nvmet_req *req) { - struct bio *bio; - - nvmet_inline_bio_init(req); - bio = &req->inline_bio; + struct bio *bio = &req->inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); bio_set_dev(bio, req->ns->bdev); bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; -- cgit v1.2.3 From eb619fdb2d4cb8b3d3419e9113921e87e7daf557 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Nov 2017 08:32:43 -0700 Subject: blk-mq: fix issue with shared tag queue re-running This patch attempts to make the case of hctx re-running on driver tag failure more robust. Without this patch, it's pretty easy to trigger a stall condition with shared tags. An example is using null_blk like this: modprobe null_blk queue_mode=2 nr_devices=4 shared_tags=1 submit_queues=1 hw_queue_depth=1 which sets up 4 devices, sharing the same tag set with a depth of 1. Running a fio job ala: [global] bs=4k rw=randread norandommap direct=1 ioengine=libaio iodepth=4 [nullb0] filename=/dev/nullb0 [nullb1] filename=/dev/nullb1 [nullb2] filename=/dev/nullb2 [nullb3] filename=/dev/nullb3 will inevitably end with one or more threads being stuck waiting for a scheduler tag. That IO is then stuck forever, until someone else triggers a run of the queue. Ensure that we always re-run the hardware queue, if the driver tag we were waiting for got freed before we added our leftover request entries back on the dispatch list. 
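[Editorial aside: the core of the shared-tag fix below is the classic ordering "register on the wait queue before retrying the allocation", so a tag freed in the window is either seen by the retry or wakes the registered waiter. A condensed, hypothetical sketch of that ordering with the blk-mq specifics and locking elided (the real code holds hctx->lock and the sbitmap wait lock):]

#include <linux/wait.h>

struct example_tag_waiter {		/* stands in for the hardware context */
	wait_queue_head_t *wq;		/* the tag set's wait queue */
	wait_queue_entry_t wait;	/* this context's wait entry */
};

static bool example_try_tag(struct example_tag_waiter *tw);	/* assumed allocator */

static bool example_get_tag_or_wait(struct example_tag_waiter *tw)
{
	if (example_try_tag(tw))			/* fast path */
		return true;

	add_wait_queue(tw->wq, &tw->wait);		/* publish the waiter first */

	if (example_try_tag(tw)) {			/* a tag may have been freed meanwhile */
		remove_wait_queue(tw->wq, &tw->wait);	/* hand future wakeups to someone else */
		return true;
	}
	return false;	/* stay queued; the wakeup callback re-runs dispatch */
}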
Reviewed-by: Bart Van Assche Tested-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 85 ++++++++++++++++++++++++++++---------------------- include/linux/blk-mq.h | 5 ++- 3 files changed, 50 insertions(+), 41 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7f4a1ba532afc..bb7f084152033 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -179,7 +179,6 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), - HCTX_STATE_NAME(TAG_WAITING), HCTX_STATE_NAME(START_ON_RUN), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 3d759bb8a5bb5..fed8165973a38 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -998,49 +998,64 @@ done: return rq->tag != -1; } -static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, - void *key) +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, + int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); - list_del(&wait->entry); - clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); + list_del_init(&wait->entry); blk_mq_run_hw_queue(hctx, true); return 1; } -static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx, + struct request *rq) { + struct blk_mq_hw_ctx *this_hctx = *hctx; + wait_queue_entry_t *wait = &this_hctx->dispatch_wait; struct sbq_wait_state *ws; + if (!list_empty_careful(&wait->entry)) + return false; + + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; + } + + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + /* - * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. - * The thread which wins the race to grab this bit adds the hardware - * queue to the wait queue. + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. */ - if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || - test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_get_driver_tag(rq, hctx, false)) { + spin_unlock(&this_hctx->lock); return false; - - init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); + } /* - * As soon as this returns, it's no longer safe to fiddle with - * hctx->dispatch_wait, since a completion can wake up the wait queue - * and unlock the bit. + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. */ - add_wait_queue(&ws->wait, &hctx->dispatch_wait); + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); return true; } bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, - bool got_budget) + bool got_budget) { struct blk_mq_hw_ctx *hctx; struct request *rq, *nxt; + bool no_tag = false; int errors, queued; if (list_empty(list)) @@ -1060,22 +1075,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!blk_mq_get_driver_tag(rq, &hctx, false)) { /* * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. 
+ * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. */ - if (!blk_mq_dispatch_wait_add(hctx)) { - if (got_budget) - blk_mq_put_dispatch_budget(hctx); - break; - } - - /* - * It's possible that a tag was freed in the window - * between the allocation failure and adding the - * hardware queue to the wait queue. - */ - if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!blk_mq_dispatch_wait_add(&hctx, rq)) { if (got_budget) blk_mq_put_dispatch_budget(hctx); + no_tag = true; break; } } @@ -1140,10 +1148,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * - * If TAG_WAITING is set that means that an I/O scheduler has - * been configured and another thread is waiting for a driver - * tag. To guarantee fairness, do not rerun this hardware queue - * but let the other thread grab the driver tag. + * If 'no_tag' is set, that means that we failed getting + * a driver tag with an I/O scheduler attached. If our dispatch + * waitqueue is no longer active, ensure that we run the queue + * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests @@ -1155,8 +1163,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. */ - if (!blk_mq_sched_needs_restart(hctx) && - !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) + if (!blk_mq_sched_needs_restart(hctx) || + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); } @@ -2020,6 +2028,9 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); + INIT_LIST_HEAD(&hctx->dispatch_wait.entry); + if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 674641527da79..4ae987c2352cc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -35,7 +35,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; - wait_queue_entry_t dispatch_wait; + wait_queue_entry_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; @@ -181,8 +181,7 @@ enum { BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, - BLK_MQ_S_TAG_WAITING = 3, - BLK_MQ_S_START_ON_RUN = 4, + BLK_MQ_S_START_ON_RUN = 3, BLK_MQ_MAX_DEPTH = 10240, -- cgit v1.2.3 From 055f6e18e08f5b7fd98171fce857a0bad87a919d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Nov 2017 10:49:53 -0800 Subject: block: Make q_usage_counter also track legacy requests This patch makes it possible to pause request allocation for the legacy block layer by calling blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). 
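[Editorial aside: what the q_usage_counter change buys a caller, restated as an illustrative sketch -- once legacy requests also take the counter, freezing works the same way for request_fn queues as for blk-mq queues. The function name is hypothetical.]

#include <linux/blk-mq.h>

static void example_update_queue_state(struct request_queue *q)
{
	blk_mq_freeze_queue(q);		/* waits for q_usage_counter to drain */
	/* no new requests, legacy or mq, can be allocated here */
	blk_mq_unfreeze_queue(q);
}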
Signed-off-by: Ming Lei [ bvanassche: Combined two patches into one, edited a comment and made sure REQ_NOWAIT is handled properly in blk_old_get_request() ] Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++++++++ block/blk-mq.c | 10 ++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5e81dcf4690a6..a4362849059af 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -612,6 +612,9 @@ void blk_set_queue_dying(struct request_queue *q) } spin_unlock_irq(q->queue_lock); } + + /* Make blk_queue_enter() reexamine the DYING flag. */ + wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_set_queue_dying); @@ -1398,16 +1401,22 @@ static struct request *blk_old_get_request(struct request_queue *q, unsigned int op, gfp_t gfp_mask) { struct request *rq; + int ret = 0; WARN_ON_ONCE(q->mq_ops); /* create ioc upfront */ create_io_context(gfp_mask, q->node); + ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) || + (op & REQ_NOWAIT)); + if (ret) + return ERR_PTR(ret); spin_lock_irq(q->queue_lock); rq = get_request(q, op, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); + blk_queue_exit(q); return rq; } @@ -1579,6 +1588,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) blk_free_request(rl, req); freed_request(rl, sync, rq_flags); blk_put_rl(rl); + blk_queue_exit(q); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1860,8 +1870,10 @@ get_rq: * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ + blk_queue_enter_live(q); req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { + blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; diff --git a/block/blk-mq.c b/block/blk-mq.c index fed8165973a38..7173d4bd64afd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -126,7 +126,8 @@ void blk_freeze_queue_start(struct request_queue *q) freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); - blk_mq_run_hw_queues(q, false); + if (q->mq_ops) + blk_mq_run_hw_queues(q, false); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -256,13 +257,6 @@ void blk_mq_wake_waiters(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); - - /* - * If we are called because the queue has now been marked as - * dying, we need to ensure that processes currently waiting on - * the queue are notified as well. - */ - wake_up_all(&q->mq_freeze_wq); } bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) -- cgit v1.2.3 From 6a15674d1e90917f1723a814e2e8c949000440f7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:54 -0800 Subject: block: Introduce blk_get_request_flags() A side effect of this patch is that the GFP mask that is passed to several allocation functions in the legacy block layer is changed from GFP_KERNEL into __GFP_DIRECT_RECLAIM. 
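[Editorial aside: the flag-to-gfp translation the legacy path uses after this patch, restated standalone for clarity; it mirrors the __get_request()/blk_old_get_request() hunks above, the helper name is hypothetical. Note that __GFP_DIRECT_RECLAIM is only the "may sleep to reclaim" bit; unlike GFP_KERNEL it does not include __GFP_IO/__GFP_FS, which is the side effect the commit message points out.]

#include <linux/blk-mq.h>
#include <linux/gfp.h>

static gfp_t example_legacy_alloc_gfp(unsigned int flags)
{
	return (flags & BLK_MQ_REQ_NOWAIT) ? GFP_ATOMIC : __GFP_DIRECT_RECLAIM;
}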
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 50 +++++++++++++++++++++++++++++++++++--------------- include/linux/blkdev.h | 3 +++ 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index a4362849059af..0f7093dfc0102 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1160,7 +1160,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * @rl: request list to allocate from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLQ_MQ_REQ_* flags * * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. @@ -1170,7 +1170,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, unsigned int flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1179,6 +1179,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, struct io_cq *icq = NULL; const bool is_sync = op_is_sync(op); int may_queue; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; req_flags_t rq_flags = RQF_ALLOCED; lockdep_assert_held(q->queue_lock); @@ -1339,7 +1341,7 @@ rq_starved: * @q: request_queue to allocate request from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) - * @gfp_mask: allocation mask + * @flags: BLK_MQ_REQ_* flags. * * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, * this function keeps retrying under memory pressure and fails iff @q is dead. @@ -1349,7 +1351,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio, unsigned int flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1361,7 +1363,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op, rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, bio, gfp_mask); + rq = __get_request(rl, op, bio, flags); if (!IS_ERR(rq)) return rq; @@ -1370,7 +1372,7 @@ retry: return ERR_PTR(-EAGAIN); } - if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { + if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1397,10 +1399,13 @@ retry: goto retry; } +/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, gfp_t gfp_mask) + unsigned int op, unsigned int flags) { struct request *rq; + gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? 
GFP_ATOMIC : + __GFP_DIRECT_RECLAIM; int ret = 0; WARN_ON_ONCE(q->mq_ops); @@ -1413,7 +1418,7 @@ static struct request *blk_old_get_request(struct request_queue *q, if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); - rq = get_request(q, op, NULL, gfp_mask); + rq = get_request(q, op, NULL, flags); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); blk_queue_exit(q); @@ -1427,25 +1432,40 @@ static struct request *blk_old_get_request(struct request_queue *q, return rq; } -struct request *blk_get_request(struct request_queue *q, unsigned int op, - gfp_t gfp_mask) +/** + * blk_get_request_flags - allocate a request + * @q: request queue to allocate a request for + * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. + * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. + */ +struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, + unsigned int flags) { struct request *req; + WARN_ON_ONCE(op & REQ_NOWAIT); + WARN_ON_ONCE(flags & ~BLK_MQ_REQ_NOWAIT); + if (q->mq_ops) { - req = blk_mq_alloc_request(q, op, - (gfp_mask & __GFP_DIRECT_RECLAIM) ? - 0 : BLK_MQ_REQ_NOWAIT); + req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); } else { - req = blk_old_get_request(q, op, gfp_mask); + req = blk_old_get_request(q, op, flags); if (!IS_ERR(req) && q->initialize_rq_fn) q->initialize_rq_fn(req); } return req; } +EXPORT_SYMBOL(blk_get_request_flags); + +struct request *blk_get_request(struct request_queue *q, unsigned int op, + gfp_t gfp_mask) +{ + return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ? + 0 : BLK_MQ_REQ_NOWAIT); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1871,7 +1891,7 @@ get_rq: * Returns with the queue unlocked. */ blk_queue_enter_live(q); - req = get_request(q, bio->bi_opf, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, 0); if (IS_ERR(req)) { blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1437ef4d80379..1af5ddd0f6318 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -927,6 +927,9 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); +extern struct request *blk_get_request_flags(struct request_queue *, + unsigned int op, + unsigned int flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); -- cgit v1.2.3 From 1b6d65a0bfb5df2a6029c1430e99fcc5d96bb59a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:55 -0800 Subject: block: Introduce BLK_MQ_REQ_PREEMPT Set RQF_PREEMPT if BLK_MQ_REQ_PREEMPT is passed to blk_get_request_flags(). 
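For illustration only (hypothetical caller, not part of the patch; the ide and scsi conversion later in this series does exactly this):

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

static int issue_pm_request(struct request_queue *q)
{
	struct request *rq;

	rq = blk_get_request_flags(q, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	/* RQF_PREEMPT has already been set by the core; no manual
	 * "rq->rq_flags |= RQF_PREEMPT" is needed after allocation. */
	blk_put_request(rq);
	return 0;
}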
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- block/blk-mq.c | 2 ++ include/linux/blk-mq.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 0f7093dfc0102..17eed16a6e040 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1263,6 +1263,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, blk_rq_set_rl(rq, rl); rq->cmd_flags = op; rq->rq_flags = rq_flags; + if (flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; /* init elvpriv */ if (rq_flags & RQF_ELVPRIV) { @@ -1444,7 +1446,7 @@ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~BLK_MQ_REQ_NOWAIT); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); if (q->mq_ops) { req = blk_mq_alloc_request(q, op, flags); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7173d4bd64afd..e21876778cecd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -291,6 +291,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->q = data->q; rq->mq_ctx = data->ctx; rq->cmd_flags = op; + if (data->flags & BLK_MQ_REQ_PREEMPT) + rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4ae987c2352cc..82b56609736ab 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -212,6 +212,7 @@ enum { BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ + BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, -- cgit v1.2.3 From 039c635f4e666b647df2100038de276a83fb3fca Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:56 -0800 Subject: ide, scsi: Tell the block layer at request allocation time about preempt requests Convert blk_get_request(q, op, __GFP_RECLAIM) into blk_get_request_flags(q, op, BLK_MQ_PREEMPT). This patch does not change any functionality. Signed-off-by: Bart Van Assche Tested-by: Martin Steigerwald Acked-by: David S. Miller [ for IDE ] Acked-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Tested-by: Oleksandr Natalenko Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/ide/ide-pm.c | 4 ++-- drivers/scsi/scsi_lib.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 544f02d673ca2..f56d742908df6 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -89,9 +89,9 @@ int generic_ide_resume(struct device *dev) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN, + BLK_MQ_REQ_PREEMPT); ide_req(rq)->type = ATA_PRIV_PM_RESUME; - rq->rq_flags |= RQF_PREEMPT; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 286ea983c9e35..eb129dfc2ebef 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -252,9 +252,9 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, struct scsi_request *rq; int ret = DRIVER_ERROR << 24; - req = blk_get_request(sdev->request_queue, + req = blk_get_request_flags(sdev->request_queue, data_direction == DMA_TO_DEVICE ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM); + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT); if (IS_ERR(req)) return ret; rq = scsi_req(req); @@ -268,7 +268,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, rq->retries = retries; req->timeout = timeout; req->cmd_flags |= flags; - req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT; + req->rq_flags |= rq_flags | RQF_QUIET; /* * head injection *required* here otherwise quiesce won't work -- cgit v1.2.3 From c9254f2ddb19387ea9714a57ea48463c20333b92 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:57 -0800 Subject: block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag This flag will be used in the next patch to let the block layer core know whether or not a SCSI request queue has been quiesced. A quiesced SCSI queue namely only processes RQF_PREEMPT requests. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 30 ++++++++++++++++++++++++++++++ block/blk-mq-debugfs.c | 1 + include/linux/blkdev.h | 6 ++++++ 3 files changed, 37 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 17eed16a6e040..edc2768991165 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -348,6 +348,36 @@ void blk_sync_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_sync_queue); +/** + * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY + * @q: request queue pointer + * + * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not + * set and 1 if the flag was already set. 
+ */ +int blk_set_preempt_only(struct request_queue *q) +{ + unsigned long flags; + int res; + + spin_lock_irqsave(q->queue_lock, flags); + res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + return res; +} +EXPORT_SYMBOL_GPL(blk_set_preempt_only); + +void blk_clear_preempt_only(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(blk_clear_preempt_only); + /** * __blk_run_queue_uncond - run a queue whether or not it has been stopped * @q: The queue to run diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index bb7f084152033..e4f2bb936e663 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -74,6 +74,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), + QUEUE_FLAG_NAME(PREEMPT_ONLY), }; #undef QUEUE_FLAG_NAME diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1af5ddd0f6318..2147e2381a22c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -632,6 +632,7 @@ struct request_queue { #define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ +#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ @@ -732,6 +733,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) +#define blk_queue_preempt_only(q) \ + test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) + +extern int blk_set_preempt_only(struct request_queue *q); +extern void blk_clear_preempt_only(struct request_queue *q); static inline bool blk_account_rq(struct request *rq) { -- cgit v1.2.3 From 3a0a529971ec4e2d933e9c7798db101dfb6b1aec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:58 -0800 Subject: block, scsi: Make SCSI quiesce and resume work reliably The contexts from which a SCSI device can be quiesced or resumed are: * Writing into /sys/class/scsi_device/*/device/state. * SCSI parallel (SPI) domain validation. * The SCSI device power management methods. See also scsi_bus_pm_ops. It is essential during suspend and resume that neither the filesystem state nor the filesystem metadata in RAM changes. This is why while the hibernation image is being written or restored that SCSI devices are quiesced. The SCSI core quiesces devices through scsi_device_quiesce() and scsi_device_resume(). In the SDEV_QUIESCE state execution of non-preempt requests is deferred. This is realized by returning BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI devices. Avoid that a full queue prevents power management requests to be submitted by deferring allocation of non-preempt requests for devices in the quiesced state. This patch has been tested by running the following commands and by verifying that after each resume the fio job was still running: for ((i=0; i<10; i++)); do ( cd /sys/block/md0/md && while true; do [ "$(<sync_action)" = "idle" ] && echo check > sync_action sleep 1 done ) & pids=($!)
for d in /sys/class/block/sd*[a-z]; do bdev=${d#/sys/class/block/} hcil=$(readlink "$d/device") hcil=${hcil#../../../} echo 4 > "$d/queue/nr_requests" echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth" fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 \ --rw=randread --ioengine=libaio --numjobs=4 --iodepth=16 \ --iodepth_batch=1 --thread --loops=$((2**31)) & pids+=($!) done sleep 1 echo "$(date) Hibernating ..." >>hibernate-test-log.txt systemctl hibernate sleep 10 kill "${pids[@]}" echo idle > /sys/block/md0/md/sync_action wait echo "$(date) Done." >>hibernate-test-log.txt done Reported-by: Oleksandr Natalenko References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Martin K. Petersen Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++++++++++++++++++-------- block/blk-mq.c | 4 ++-- drivers/scsi/scsi_lib.c | 42 ++++++++++++++++++++++++++++++------------ fs/block_dev.c | 4 ++-- include/linux/blkdev.h | 2 +- include/scsi/scsi_device.h | 1 + 6 files changed, 70 insertions(+), 25 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index edc2768991165..29b08428ae459 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -374,6 +374,7 @@ void blk_clear_preempt_only(struct request_queue *q) spin_lock_irqsave(q->queue_lock, flags); queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + wake_up_all(&q->mq_freeze_wq); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_clear_preempt_only); @@ -795,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, bool nowait) +/** + * blk_queue_enter() - try to increase q->q_usage_counter + * @q: request queue pointer + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT + */ +int blk_queue_enter(struct request_queue *q, unsigned int flags) { + const bool preempt = flags & BLK_MQ_REQ_PREEMPT; + while (true) { + bool success = false; int ret; - if (percpu_ref_tryget_live(&q->q_usage_counter)) + rcu_read_lock_sched(); + if (percpu_ref_tryget_live(&q->q_usage_counter)) { + /* + * The code that sets the PREEMPT_ONLY flag is + * responsible for ensuring that that flag is globally + * visible before the queue is unfrozen. 
+ */ + if (preempt || !blk_queue_preempt_only(q)) { + success = true; + } else { + percpu_ref_put(&q->q_usage_counter); + } + } + rcu_read_unlock_sched(); + + if (success) return 0; - if (nowait) + if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* @@ -816,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait) smp_rmb(); ret = wait_event_interruptible(q->mq_freeze_wq, - !atomic_read(&q->mq_freeze_depth) || + (atomic_read(&q->mq_freeze_depth) == 0 && + (preempt || !blk_queue_preempt_only(q))) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; @@ -1445,8 +1470,7 @@ static struct request *blk_old_get_request(struct request_queue *q, /* create ioc upfront */ create_io_context(gfp_mask, q->node); - ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) || - (op & REQ_NOWAIT)); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); @@ -2267,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; + unsigned int flags = bio->bi_opf & REQ_NOWAIT ? + BLK_MQ_REQ_NOWAIT : 0; - if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { + if (likely(blk_queue_enter(q, flags) == 0)) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ @@ -2327,7 +2353,7 @@ blk_qc_t direct_make_request(struct bio *bio) if (!generic_make_request_checks(bio)) return BLK_QC_T_NONE; - if (unlikely(blk_queue_enter(q, nowait))) { + if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) bio->bi_status = BLK_STS_AGAIN; else diff --git a/block/blk-mq.c b/block/blk-mq.c index e21876778cecd..211bc8a3e2cc4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -389,7 +389,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, struct request *rq; int ret; - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); @@ -428,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); - ret = blk_queue_enter(q, true); + ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index eb129dfc2ebef..f907e2f8c1ddb 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2947,21 +2947,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev) int scsi_device_quiesce(struct scsi_device *sdev) { + struct request_queue *q = sdev->request_queue; int err; + /* + * It is allowed to call scsi_device_quiesce() multiple times from + * the same context but concurrent scsi_device_quiesce() calls are + * not allowed. + */ + WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); + + blk_set_preempt_only(q); + + blk_mq_freeze_queue(q); + /* + * Ensure that the effect of blk_set_preempt_only() will be visible + * for percpu_ref_tryget() callers that occur after the queue + * unfreeze even if the queue was already frozen before this function + * was called. See also https://lwn.net/Articles/573497/. 
+ */ + synchronize_rcu(); + blk_mq_unfreeze_queue(q); + mutex_lock(&sdev->state_mutex); err = scsi_device_set_state(sdev, SDEV_QUIESCE); + if (err == 0) + sdev->quiesced_by = current; + else + blk_clear_preempt_only(q); mutex_unlock(&sdev->state_mutex); - if (err) - return err; - - scsi_run_queue(sdev->request_queue); - while (atomic_read(&sdev->device_busy)) { - msleep_interruptible(200); - scsi_run_queue(sdev->request_queue); - } - return 0; + return err; } EXPORT_SYMBOL(scsi_device_quiesce); @@ -2981,9 +2997,11 @@ void scsi_device_resume(struct scsi_device *sdev) * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); - if (sdev->sdev_state == SDEV_QUIESCE && - scsi_device_set_state(sdev, SDEV_RUNNING) == 0) - scsi_run_queue(sdev->request_queue); + WARN_ON_ONCE(!sdev->quiesced_by); + sdev->quiesced_by = NULL; + blk_clear_preempt_only(sdev->request_queue); + if (sdev->sdev_state == SDEV_QUIESCE) + scsi_device_set_state(sdev, SDEV_RUNNING); mutex_unlock(&sdev->state_mutex); } EXPORT_SYMBOL(scsi_device_resume); diff --git a/fs/block_dev.c b/fs/block_dev.c index 4afa4d5ff969e..04973f4844224 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -662,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); @@ -698,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2147e2381a22c..402c9d536ae10 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, bool nowait); +extern int blk_queue_enter(struct request_queue *q, unsigned int flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 82e93ee94708c..6f0f1e242e236 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -219,6 +219,7 @@ struct scsi_device { unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; + struct task_struct *quiesced_by; unsigned long sdev_data[0]; } __attribute__((aligned(sizeof(unsigned long)))); -- cgit v1.2.3 From 9a95e4ef709533efac4aafcb8bddf73f96db50ed Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:59 -0800 Subject: block, nvme: Introduce blk_mq_req_flags_t Several block layer and NVMe core functions accept a combination of BLK_MQ_REQ_* flags through the 'flags' argument but there is no verification at compile time whether the right type of block layer flags is passed. Make it possible for sparse to verify this. This patch does not change any functionality. 
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Oleksandr Natalenko Cc: linux-nvme@lists.infradead.org Cc: Christoph Hellwig Cc: Johannes Thumshirn Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++------ block/blk-mq.c | 4 ++-- block/blk-mq.h | 2 +- drivers/nvme/host/core.c | 5 +++-- drivers/nvme/host/nvme.h | 5 +++-- include/linux/blk-mq.h | 17 +++++++++++------ include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 4 ++-- 8 files changed, 30 insertions(+), 21 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 29b08428ae459..7c54c195e79e8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -801,7 +801,7 @@ EXPORT_SYMBOL(blk_alloc_queue); * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT */ -int blk_queue_enter(struct request_queue *q, unsigned int flags) +int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool preempt = flags & BLK_MQ_REQ_PREEMPT; @@ -1225,7 +1225,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, unsigned int flags) + struct bio *bio, blk_mq_req_flags_t flags) { struct request_queue *q = rl->q; struct request *rq; @@ -1408,7 +1408,7 @@ rq_starved: * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, unsigned int flags) + struct bio *bio, blk_mq_req_flags_t flags) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); @@ -1458,7 +1458,7 @@ retry: /* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, unsigned int flags) + unsigned int op, blk_mq_req_flags_t flags) { struct request *rq; gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : @@ -1495,7 +1495,7 @@ static struct request *blk_old_get_request(struct request_queue *q, * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. */ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct request *req; @@ -2291,7 +2291,7 @@ blk_qc_t generic_make_request(struct bio *bio) current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; - unsigned int flags = bio->bi_opf & REQ_NOWAIT ? + blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ? 
BLK_MQ_REQ_NOWAIT : 0; if (likely(blk_queue_enter(q, flags) == 0)) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 211bc8a3e2cc4..bfe24a5b62a36 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -383,7 +383,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags) + blk_mq_req_flags_t flags) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; @@ -409,7 +409,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx) + unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; diff --git a/block/blk-mq.h b/block/blk-mq.h index 2502f40ccdc07..99a19c5523e2b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -111,7 +111,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - unsigned int flags; + blk_mq_req_flags_t flags; unsigned int shallow_depth; /* input & output parameter */ diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 683d890d73fa3..878d5c09d15b2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -305,7 +305,7 @@ static void nvme_put_ns(struct nvme_ns *ns) } struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid) + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) { unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; struct request *req; @@ -573,7 +573,8 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags) + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags) { struct request *req; int ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7b9cc7d616b75..23b8504ace7fa 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -327,14 +327,15 @@ int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid); + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags); + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 82b56609736ab..b326208277ee4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -209,16 +209,21 @@ void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { - 
BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ - BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ - BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate internal/sched tag */ - BLK_MQ_REQ_PREEMPT = (1 << 3), /* set RQF_PREEMPT */ + /* return when out of requests */ + BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), + /* allocate from reserved pool */ + BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), + /* allocate internal/sched tag */ + BLK_MQ_REQ_INTERNAL = (__force blk_mq_req_flags_t)(1 << 2), + /* set RQF_PREEMPT */ + BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, unsigned int flags, unsigned int hctx_idx); + unsigned int op, blk_mq_req_flags_t flags, + unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); enum { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b04085255fb3..13ccfc9b210ac 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -162,6 +162,8 @@ struct bio { */ #define BIO_RESET_BITS BVEC_POOL_OFFSET +typedef __u32 __bitwise blk_mq_req_flags_t; + /* * Operations and flags common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 402c9d536ae10..e80ea1d31343d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -935,7 +935,7 @@ extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request_flags(struct request_queue *, unsigned int op, - unsigned int flags); + blk_mq_req_flags_t flags); extern struct request *blk_get_request(struct request_queue *, unsigned int op, gfp_t gfp_mask); extern void blk_requeue_request(struct request_queue *, struct request *); @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, unsigned int flags); +extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_start_queue_async(struct request_queue *q); -- cgit v1.2.3 From ab9e00cc72fa4c6eb6370757b9e94c4645e91d5d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:48:55 +0100 Subject: nvme: track subsystems This adds a new nvme_subsystem structure so that we can track multiple controllers that belong to a single subsystem. For now we only use it to store the NQN, and to check that we don't have duplicate NQNs unless the involved subsystems support multiple controllers. Includes code originally from Hannes Reinecke to expose the subsystems in sysfs. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 210 ++++++++++++++++++++++++++++++++++++++------ drivers/nvme/host/fabrics.c | 4 +- drivers/nvme/host/nvme.h | 27 ++++-- 3 files changed, 205 insertions(+), 36 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 878d5c09d15b2..78dc6f624d52f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -68,9 +68,14 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); struct workqueue_struct *nvme_wq; EXPORT_SYMBOL_GPL(nvme_wq); +static DEFINE_IDA(nvme_subsystems_ida); +static LIST_HEAD(nvme_subsystems); +static DEFINE_MUTEX(nvme_subsystems_lock); + static DEFINE_IDA(nvme_instance_ida); static dev_t nvme_chr_devt; static struct class *nvme_class; +static struct class *nvme_subsys_class; static void nvme_ns_remove(struct nvme_ns *ns); static int nvme_revalidate_disk(struct gendisk *disk); @@ -1804,14 +1809,15 @@ static bool quirk_matches(const struct nvme_id_ctrl *id, string_matches(id->fr, q->fr, sizeof(id->fr)); } -static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, + struct nvme_id_ctrl *id) { size_t nqnlen; int off; nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { - strncpy(ctrl->subnqn, id->subnqn, NVMF_NQN_SIZE); + strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); return; } @@ -1819,14 +1825,140 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ - off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE, + off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, "nqn.2014.08.org.nvmexpress:%4x%4x", le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); - memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn)); + memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); off += sizeof(id->sn); - memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn)); + memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); off += sizeof(id->mn); - memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); + memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); +} + +static void __nvme_release_subsystem(struct nvme_subsystem *subsys) +{ + ida_simple_remove(&nvme_subsystems_ida, subsys->instance); + kfree(subsys); +} + +static void nvme_release_subsystem(struct device *dev) +{ + __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev)); +} + +static void nvme_destroy_subsystem(struct kref *ref) +{ + struct nvme_subsystem *subsys = + container_of(ref, struct nvme_subsystem, ref); + + mutex_lock(&nvme_subsystems_lock); + list_del(&subsys->entry); + mutex_unlock(&nvme_subsystems_lock); + + device_del(&subsys->dev); + put_device(&subsys->dev); +} + +static void nvme_put_subsystem(struct nvme_subsystem *subsys) +{ + kref_put(&subsys->ref, nvme_destroy_subsystem); +} + +static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) +{ + struct nvme_subsystem *subsys; + + lockdep_assert_held(&nvme_subsystems_lock); + + list_for_each_entry(subsys, &nvme_subsystems, entry) { + if (strcmp(subsys->subnqn, subsysnqn)) + continue; + if (!kref_get_unless_zero(&subsys->ref)) + continue; + return subsys; + } + + return NULL; +} + +static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + struct nvme_subsystem *subsys, *found; + int ret; + + 
subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return -ENOMEM; + ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL); + if (ret < 0) { + kfree(subsys); + return ret; + } + subsys->instance = ret; + mutex_init(&subsys->lock); + kref_init(&subsys->ref); + INIT_LIST_HEAD(&subsys->ctrls); + nvme_init_subnqn(subsys, ctrl, id); + memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); + memcpy(subsys->model, id->mn, sizeof(subsys->model)); + memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); + subsys->vendor_id = le16_to_cpu(id->vid); + subsys->cmic = id->cmic; + + subsys->dev.class = nvme_subsys_class; + subsys->dev.release = nvme_release_subsystem; + dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); + device_initialize(&subsys->dev); + + mutex_lock(&nvme_subsystems_lock); + found = __nvme_find_get_subsystem(subsys->subnqn); + if (found) { + /* + * Verify that the subsystem actually supports multiple + * controllers, else bail out. + */ + if (!(id->cmic & (1 << 1))) { + dev_err(ctrl->device, + "ignoring ctrl due to duplicate subnqn (%s).\n", + found->subnqn); + nvme_put_subsystem(found); + ret = -EINVAL; + goto out_unlock; + } + + __nvme_release_subsystem(subsys); + subsys = found; + } else { + ret = device_add(&subsys->dev); + if (ret) { + dev_err(ctrl->device, + "failed to register subsystem device.\n"); + goto out_unlock; + } + list_add_tail(&subsys->entry, &nvme_subsystems); + } + + ctrl->subsys = subsys; + mutex_unlock(&nvme_subsystems_lock); + + if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, + dev_name(ctrl->device))) { + dev_err(ctrl->device, + "failed to create sysfs link from subsystem.\n"); + /* the transport driver will eventually put the subsystem */ + return -EINVAL; + } + + mutex_lock(&subsys->lock); + list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + mutex_unlock(&subsys->lock); + + return 0; + +out_unlock: + mutex_unlock(&nvme_subsystems_lock); + put_device(&subsys->dev); + return ret; } static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, @@ -1901,9 +2033,13 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return ret; } - nvme_init_subnqn(ctrl, id); - if (!ctrl->identified) { + int i; + + ret = nvme_init_subsystem(ctrl, id); + if (ret) + goto out_free; + /* * Check for quirks. Quirk can depend on firmware version, * so, in principle, the set of quirks present can change @@ -1912,9 +2048,6 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) * the device, but we'd have to make sure that the driver * behaves intelligently if the quirks change. 
*/ - - int i; - for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { if (quirk_matches(id, &core_quirks[i])) ctrl->quirks |= core_quirks[i].quirks; @@ -1927,14 +2060,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } ctrl->oacs = le16_to_cpu(id->oacs); - ctrl->vid = le16_to_cpu(id->vid); ctrl->oncs = le16_to_cpup(&id->oncs); atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; ctrl->cntlid = le16_to_cpup(&id->cntlid); - memcpy(ctrl->serial, id->sn, sizeof(id->sn)); - memcpy(ctrl->model, id->mn, sizeof(id->mn)); - memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); if (id->mdts) max_hw_sectors = 1 << (id->mdts + page_shift - 9); else @@ -2137,9 +2266,9 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ctrl *ctrl = ns->ctrl; - int serial_len = sizeof(ctrl->serial); - int model_len = sizeof(ctrl->model); + struct nvme_subsystem *subsys = ns->ctrl->subsys; + int serial_len = sizeof(subsys->serial); + int model_len = sizeof(subsys->model); if (!uuid_is_null(&ns->uuid)) return sprintf(buf, "uuid.%pU\n", &ns->uuid); @@ -2150,15 +2279,16 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) return sprintf(buf, "eui.%8phN\n", ns->eui); - while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' || - ctrl->serial[serial_len - 1] == '\0')) + while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || + subsys->serial[serial_len - 1] == '\0')) serial_len--; - while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' || - ctrl->model[model_len - 1] == '\0')) + while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || + subsys->model[model_len - 1] == '\0')) model_len--; - return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid, - serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id); + return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, + serial_len, subsys->serial, model_len, subsys->model, + ns->ns_id); } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); @@ -2244,10 +2374,15 @@ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ + return sprintf(buf, "%.*s\n", \ + (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); +nvme_show_str_function(model); +nvme_show_str_function(serial); +nvme_show_str_function(firmware_rev); + #define nvme_show_int_function(field) \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -2257,9 +2392,6 @@ static ssize_t field##_show(struct device *dev, \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); -nvme_show_str_function(model); -nvme_show_str_function(serial); -nvme_show_str_function(firmware_rev); nvme_show_int_function(cntlid); static ssize_t nvme_sysfs_delete(struct device *dev, @@ -2313,7 +2445,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn); + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn); } static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); @@ -2817,12 +2949,23 @@ static void nvme_free_ctrl(struct device *dev) { struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); + 
struct nvme_subsystem *subsys = ctrl->subsys; ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); kfree(ctrl->effects); + if (subsys) { + mutex_lock(&subsys->lock); + list_del(&ctrl->subsys_entry); + mutex_unlock(&subsys->lock); + sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); + } + ctrl->ops->free_ctrl(ctrl); + + if (subsys) + nvme_put_subsystem(subsys); } /* @@ -3022,8 +3165,15 @@ int __init nvme_core_init(void) goto unregister_chrdev; } + nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); + if (IS_ERR(nvme_subsys_class)) { + result = PTR_ERR(nvme_subsys_class); + goto destroy_class; + } return 0; +destroy_class: + class_destroy(nvme_class); unregister_chrdev: unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_wq: @@ -3033,6 +3183,8 @@ destroy_wq: void nvme_core_exit(void) { + ida_destroy(&nvme_subsystems_ida); + class_destroy(nvme_subsys_class); class_destroy(nvme_class); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_workqueue(nvme_wq); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index a4967de5bb255..76b4fe6816a03 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -882,10 +882,10 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_unlock; } - if (strcmp(ctrl->subnqn, opts->subsysnqn)) { + if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { dev_warn(ctrl->device, "controller returned incorrect NQN: \"%s\".\n", - ctrl->subnqn); + ctrl->subsys->subnqn); up_read(&nvmf_transports_rwsem); nvme_delete_ctrl_sync(ctrl); return ERR_PTR(-EINVAL); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 23b8504ace7fa..6ae50979c3aa0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -140,13 +140,12 @@ struct nvme_ctrl { struct work_struct reset_work; struct work_struct delete_work; + struct nvme_subsystem *subsys; + struct list_head subsys_entry; + struct opal_dev *opal_dev; char name[12]; - char serial[20]; - char model[40]; - char firmware_rev[8]; - char subnqn[NVMF_NQN_SIZE]; u16 cntlid; u32 ctrl_config; @@ -157,7 +156,6 @@ struct nvme_ctrl { u32 page_size; u32 max_hw_sectors; u16 oncs; - u16 vid; u16 oacs; u16 nssa; u16 nr_streams; @@ -200,6 +198,25 @@ struct nvme_ctrl { struct nvmf_ctrl_options *opts; }; +struct nvme_subsystem { + int instance; + struct device dev; + /* + * Because we unregister the device on the last put we need + * a separate refcount. + */ + struct kref ref; + struct list_head entry; + struct mutex lock; + struct list_head ctrls; + char subnqn[NVMF_NQN_SIZE]; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u8 cmic; + u16 vendor_id; +}; + struct nvme_ns { struct list_head list; -- cgit v1.2.3 From 002fab040468f5b279f8d8d49d487e73b0c3f98a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:50:16 +0100 Subject: nvme: introduce a nvme_ns_ids structure This allows us to manage the various uniqueue namespace identifiers together instead needing various variables and arguments. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 69 +++++++++++++++++++++++++++--------------------- drivers/nvme/host/nvme.h | 14 +++++++--- 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 78dc6f624d52f..04c0949c754ac 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -797,7 +797,7 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) } static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, - u8 *eui64, u8 *nguid, uuid_t *uuid) + struct nvme_ns_ids *ids) { struct nvme_command c = { }; int status; @@ -833,7 +833,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, goto free_data; } len = NVME_NIDT_EUI64_LEN; - memcpy(eui64, data + pos + sizeof(*cur), len); + memcpy(ids->eui64, data + pos + sizeof(*cur), len); break; case NVME_NIDT_NGUID: if (cur->nidl != NVME_NIDT_NGUID_LEN) { @@ -843,7 +843,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, goto free_data; } len = NVME_NIDT_NGUID_LEN; - memcpy(nguid, data + pos + sizeof(*cur), len); + memcpy(ids->nguid, data + pos + sizeof(*cur), len); break; case NVME_NIDT_UUID: if (cur->nidl != NVME_NIDT_UUID_LEN) { @@ -853,7 +853,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, goto free_data; } len = NVME_NIDT_UUID_LEN; - uuid_copy(uuid, data + pos + sizeof(*cur)); + uuid_copy(&ids->uuid, data + pos + sizeof(*cur)); break; default: /* Skip unnkown types */ @@ -1233,22 +1233,31 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, } static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, - struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid) + struct nvme_id_ns *id, struct nvme_ns_ids *ids) { + memset(ids, 0, sizeof(*ids)); + if (ctrl->vs >= NVME_VS(1, 1, 0)) - memcpy(eui64, id->eui64, sizeof(id->eui64)); + memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) - memcpy(nguid, id->nguid, sizeof(id->nguid)); + memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); if (ctrl->vs >= NVME_VS(1, 3, 0)) { /* Don't treat error as fatal we potentially * already have a NGUID or EUI-64 */ - if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid)) + if (nvme_identify_ns_descs(ctrl, nsid, ids)) dev_warn(ctrl->device, "%s: Identify Descriptors failed\n", __func__); } } +static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) +{ + return uuid_equal(&a->uuid, &b->uuid) && + memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; +} + static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { @@ -1304,8 +1313,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ns *id; - u8 eui64[8] = { 0 }, nguid[16] = { 0 }; - uuid_t uuid = uuid_null; + struct nvme_ns_ids ids; int ret = 0; if (test_bit(NVME_NS_DEAD, &ns->flags)) { @@ -1322,10 +1330,8 @@ static int nvme_revalidate_disk(struct gendisk *disk) goto out; } - nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid); - if (!uuid_equal(&ns->uuid, &uuid) || - memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) || - memcmp(&ns->eui, &eui64, sizeof(ns->eui))) { + nvme_report_ns_ids(ctrl, ns->ns_id, id, &ids); + if (!nvme_ns_ids_equal(&ns->ids, &ids)) { dev_err(ctrl->device, "identifiers changed for nsid 
%d\n", ns->ns_id); ret = -ENODEV; @@ -2266,18 +2272,19 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_ids *ids = &ns->ids; struct nvme_subsystem *subsys = ns->ctrl->subsys; int serial_len = sizeof(subsys->serial); int model_len = sizeof(subsys->model); - if (!uuid_is_null(&ns->uuid)) - return sprintf(buf, "uuid.%pU\n", &ns->uuid); + if (!uuid_is_null(&ids->uuid)) + return sprintf(buf, "uuid.%pU\n", &ids->uuid); - if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) - return sprintf(buf, "eui.%16phN\n", ns->nguid); + if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return sprintf(buf, "eui.%16phN\n", ids->nguid); - if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) - return sprintf(buf, "eui.%8phN\n", ns->eui); + if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + return sprintf(buf, "eui.%8phN\n", ids->eui64); while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || subsys->serial[serial_len - 1] == '\0')) @@ -2296,7 +2303,7 @@ static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->nguid); + return sprintf(buf, "%pU\n", ns->ids.nguid); } static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); @@ -2304,16 +2311,17 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_ids *ids = &ns->ids; /* For backward compatibility expose the NGUID to userspace if * we have no UUID set */ - if (uuid_is_null(&ns->uuid)) { + if (uuid_is_null(&ids->uuid)) { printk_ratelimited(KERN_WARNING "No UUID available providing old NGUID\n"); - return sprintf(buf, "%pU\n", ns->nguid); + return sprintf(buf, "%pU\n", ids->nguid); } - return sprintf(buf, "%pU\n", &ns->uuid); + return sprintf(buf, "%pU\n", &ids->uuid); } static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); @@ -2321,7 +2329,7 @@ static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8ph\n", ns->eui); + return sprintf(buf, "%8ph\n", ns->ids.eui64); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); @@ -2347,18 +2355,19 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, { struct device *dev = container_of(kobj, struct device, kobj); struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_ids *ids = &ns->ids; if (a == &dev_attr_uuid.attr) { - if (uuid_is_null(&ns->uuid) || - !memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + if (uuid_is_null(&ids->uuid) || + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) return 0; } if (a == &dev_attr_nguid.attr) { - if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) return 0; } if (a == &dev_attr_eui.attr) { - if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) + if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) return 0; } return a->mode; @@ -2591,7 +2600,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (id->ncap == 0) goto out_free_id; - nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); + nvme_report_ns_ids(ctrl, ns->ns_id, id, &ns->ids); if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk_name, node)) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6ae50979c3aa0..2f1af915f93a5 100644 --- 
a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -217,6 +217,15 @@ struct nvme_subsystem { u16 vendor_id; }; +/* + * Container structure for uniqueue namespace identifiers. + */ +struct nvme_ns_ids { + u8 eui64[8]; + u8 nguid[16]; + uuid_t uuid; +}; + struct nvme_ns { struct list_head list; @@ -227,11 +236,8 @@ struct nvme_ns { struct kref kref; int instance; - u8 eui[8]; - u8 nguid[16]; - uuid_t uuid; - unsigned ns_id; + struct nvme_ns_ids ids; int lba_shift; u16 ms; u16 sgs; -- cgit v1.2.3 From ed754e5deeb17f4e675c84e4b6c640cc7344e498 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:50:43 +0100 Subject: nvme: track shared namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new struct nvme_ns_head that holds information about an actual namespace, unlike struct nvme_ns, which only holds the per-controller namespace information. For private namespaces there is a 1:1 relation of the two, but for shared namespaces this lets us discover all the paths to it. For now only the identifiers are moved to the new structure, but most of the information in struct nvme_ns should eventually move over. To allow lockless path lookup the list of nvme_ns structures per nvme_ns_head is protected by SRCU, which requires freeing the nvme_ns structure through call_srcu. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 220 +++++++++++++++++++++++++++++++++++-------- drivers/nvme/host/lightnvm.c | 14 +-- drivers/nvme/host/nvme.h | 26 ++++- 3 files changed, 210 insertions(+), 50 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 04c0949c754ac..13676f6cd4f67 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -291,6 +291,22 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); +static void nvme_free_ns_head(struct kref *ref) +{ + struct nvme_ns_head *head = + container_of(ref, struct nvme_ns_head, ref); + + ida_simple_remove(&head->subsys->ns_ida, head->instance); + list_del_init(&head->entry); + cleanup_srcu_struct(&head->srcu); + kfree(head); +} + +static void nvme_put_ns_head(struct nvme_ns_head *head) +{ + kref_put(&head->ref, nvme_free_ns_head); +} + static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); @@ -299,7 +315,7 @@ static void nvme_free_ns(struct kref *kref) nvme_nvm_unregister(ns); put_disk(ns->disk); - ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); + nvme_put_ns_head(ns->head); nvme_put_ctrl(ns->ctrl); kfree(ns); } @@ -435,7 +451,7 @@ static inline void nvme_setup_flush(struct nvme_ns *ns, { memset(cmnd, 0, sizeof(*cmnd)); cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.nsid = cpu_to_le32(ns->ns_id); + cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); } static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, @@ -466,7 +482,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); cmnd->dsm.nr = cpu_to_le32(segments - 1); cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); @@ -495,7 +511,7 @@ static inline blk_status_t 
nvme_setup_rw(struct nvme_ns *ns, memset(cmnd, 0, sizeof(*cmnd)); cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); - cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); @@ -987,7 +1003,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) memset(&c, 0, sizeof(c)); c.rw.opcode = io.opcode; c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.nsid = cpu_to_le32(ns->head->ns_id); c.rw.slba = cpu_to_le64(io.slba); c.rw.length = cpu_to_le16(io.nblocks); c.rw.control = cpu_to_le16(io.control); @@ -1130,7 +1146,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); - return ns->ns_id; + return ns->head->ns_id; case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: @@ -1251,6 +1267,13 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, } } +static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) +{ + return !uuid_is_null(&ids->uuid) || + memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || + memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); +} + static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && @@ -1321,7 +1344,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) return -ENODEV; } - id = nvme_identify_ns(ctrl, ns->ns_id); + id = nvme_identify_ns(ctrl, ns->head->ns_id); if (!id) return -ENODEV; @@ -1330,10 +1353,10 @@ static int nvme_revalidate_disk(struct gendisk *disk) goto out; } - nvme_report_ns_ids(ctrl, ns->ns_id, id, &ids); - if (!nvme_ns_ids_equal(&ns->ids, &ids)) { + nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { dev_err(ctrl->device, - "identifiers changed for nsid %d\n", ns->ns_id); + "identifiers changed for nsid %d\n", ns->head->ns_id); ret = -ENODEV; } @@ -1374,7 +1397,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10, memset(&c, 0, sizeof(c)); c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.nsid = cpu_to_le32(ns->head->ns_id); c.common.cdw10[0] = cpu_to_le32(cdw10); return nvme_submit_sync_cmd(ns->queue, &c, data, 16); @@ -1861,6 +1884,7 @@ static void nvme_destroy_subsystem(struct kref *ref) list_del(&subsys->entry); mutex_unlock(&nvme_subsystems_lock); + ida_destroy(&subsys->ns_ida); device_del(&subsys->dev); put_device(&subsys->dev); } @@ -1904,6 +1928,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) mutex_init(&subsys->lock); kref_init(&subsys->ref); INIT_LIST_HEAD(&subsys->ctrls); + INIT_LIST_HEAD(&subsys->nsheads); nvme_init_subnqn(subsys, ctrl, id); memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); memcpy(subsys->model, id->mn, sizeof(subsys->model)); @@ -1941,6 +1966,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) "failed to register subsystem device.\n"); goto out_unlock; } + ida_init(&subsys->ns_ida); list_add_tail(&subsys->entry, &nvme_subsystems); } @@ -2272,7 +2298,7 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->ids; + struct nvme_ns_ids *ids = &ns->head->ids; struct nvme_subsystem *subsys = 
ns->ctrl->subsys; int serial_len = sizeof(subsys->serial); int model_len = sizeof(subsys->model); @@ -2295,7 +2321,7 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, serial_len, subsys->serial, model_len, subsys->model, - ns->ns_id); + ns->head->ns_id); } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); @@ -2303,7 +2329,7 @@ static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->ids.nguid); + return sprintf(buf, "%pU\n", ns->head->ids.nguid); } static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); @@ -2311,7 +2337,7 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->ids; + struct nvme_ns_ids *ids = &ns->head->ids; /* For backward compatibility expose the NGUID to userspace if * we have no UUID set @@ -2329,7 +2355,7 @@ static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8ph\n", ns->ids.eui64); + return sprintf(buf, "%8ph\n", ns->head->ids.eui64); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); @@ -2337,7 +2363,7 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%d\n", ns->ns_id); + return sprintf(buf, "%d\n", ns->head->ns_id); } static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); @@ -2355,7 +2381,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, { struct device *dev = container_of(kobj, struct device, kobj); struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->ids; + struct nvme_ns_ids *ids = &ns->head->ids; if (a == &dev_attr_uuid.attr) { if (uuid_is_null(&ids->uuid) || @@ -2507,12 +2533,124 @@ static const struct attribute_group *nvme_dev_attr_groups[] = { NULL, }; +static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys, + unsigned nsid) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(h, &subsys->nsheads, entry) { + if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) + return h; + } + + return NULL; +} + +static int __nvme_check_ids(struct nvme_subsystem *subsys, + struct nvme_ns_head *new) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(h, &subsys->nsheads, entry) { + if (nvme_ns_ids_valid(&new->ids) && + nvme_ns_ids_equal(&new->ids, &h->ids)) + return -EINVAL; + } + + return 0; +} + +static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_id_ns *id) +{ + struct nvme_ns_head *head; + int ret = -ENOMEM; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto out; + ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto out_free_head; + head->instance = ret; + INIT_LIST_HEAD(&head->list); + init_srcu_struct(&head->srcu); + head->subsys = ctrl->subsys; + head->ns_id = nsid; + kref_init(&head->ref); + + nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + + ret = __nvme_check_ids(ctrl->subsys, head); + if (ret) { + dev_err(ctrl->device, + "duplicate IDs for nsid %d\n", nsid); + goto out_cleanup_srcu; + } + + list_add_tail(&head->entry, &ctrl->subsys->nsheads); + return head; +out_cleanup_srcu: 
+ cleanup_srcu_struct(&head->srcu); + ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); +out_free_head: + kfree(head); +out: + return ERR_PTR(ret); +} + +static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, + struct nvme_id_ns *id, bool *new) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + bool is_shared = id->nmic & (1 << 0); + struct nvme_ns_head *head = NULL; + int ret = 0; + + mutex_lock(&ctrl->subsys->lock); + if (is_shared) + head = __nvme_find_ns_head(ctrl->subsys, nsid); + if (!head) { + head = nvme_alloc_ns_head(ctrl, nsid, id); + if (IS_ERR(head)) { + ret = PTR_ERR(head); + goto out_unlock; + } + + *new = true; + } else { + struct nvme_ns_ids ids; + + nvme_report_ns_ids(ctrl, nsid, id, &ids); + if (!nvme_ns_ids_equal(&head->ids, &ids)) { + dev_err(ctrl->device, + "IDs don't match for shared namespace %d\n", + nsid); + ret = -EINVAL; + goto out_unlock; + } + + *new = false; + } + + list_add_tail(&ns->siblings, &head->list); + ns->head = head; + +out_unlock: + mutex_unlock(&ctrl->subsys->lock); + return ret; +} + static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) { struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); - return nsa->ns_id - nsb->ns_id; + return nsa->head->ns_id - nsb->head->ns_id; } static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) @@ -2521,13 +2659,13 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->ns_id == nsid) { + if (ns->head->ns_id == nsid) { if (!kref_get_unless_zero(&ns->kref)) continue; ret = ns; break; } - if (ns->ns_id > nsid) + if (ns->head->ns_id > nsid) break; } mutex_unlock(&ctrl->namespaces_mutex); @@ -2542,7 +2680,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) if (!ctrl->nr_streams) return 0; - ret = nvme_get_stream_params(ctrl, &s, ns->ns_id); + ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); if (ret) return ret; @@ -2567,32 +2705,26 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; int node = dev_to_node(ctrl->dev); + bool new = true; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) return; - ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL); - if (ns->instance < 0) - goto out_free_ns; - ns->queue = blk_mq_init_queue(ctrl->tagset); if (IS_ERR(ns->queue)) - goto out_release_instance; + goto out_free_ns; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); ns->queue->queuedata = ns; ns->ctrl = ctrl; kref_init(&ns->kref); - ns->ns_id = nsid; ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); nvme_setup_streams_ns(ctrl, ns); - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); - id = nvme_identify_ns(ctrl, nsid); if (!id) goto out_free_queue; @@ -2600,18 +2732,21 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (id->ncap == 0) goto out_free_id; - nvme_report_ns_ids(ctrl, ns->ns_id, id, &ns->ids); + if (nvme_init_ns_head(ns, nsid, id, &new)) + goto out_free_id; + + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); - 
goto out_free_id; + goto out_unlink_ns; } } disk = alloc_disk_node(0, node); if (!disk) - goto out_free_id; + goto out_unlink_ns; disk->fops = &nvme_fops; disk->private_data = ns; @@ -2639,18 +2774,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) pr_warn("%s: failed to register lightnvm sysfs group for identification\n", ns->disk->disk_name); return; + out_unlink_ns: + mutex_lock(&ctrl->subsys->lock); + list_del_rcu(&ns->siblings); + mutex_unlock(&ctrl->subsys->lock); out_free_id: kfree(id); out_free_queue: blk_cleanup_queue(ns->queue); - out_release_instance: - ida_simple_remove(&ctrl->ns_ida, ns->instance); out_free_ns: kfree(ns); } static void nvme_ns_remove(struct nvme_ns *ns) { + struct nvme_ns_head *head = ns->head; + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; @@ -2665,10 +2804,16 @@ static void nvme_ns_remove(struct nvme_ns *ns) blk_cleanup_queue(ns->queue); } + mutex_lock(&ns->ctrl->subsys->lock); + if (head) + list_del_rcu(&ns->siblings); + mutex_unlock(&ns->ctrl->subsys->lock); + mutex_lock(&ns->ctrl->namespaces_mutex); list_del_init(&ns->list); mutex_unlock(&ns->ctrl->namespaces_mutex); + synchronize_srcu(&head->srcu); nvme_put_ns(ns); } @@ -2691,7 +2836,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, struct nvme_ns *ns, *next; list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { - if (ns->ns_id > nsid) + if (ns->head->ns_id > nsid) nvme_ns_remove(ns); } } @@ -2961,7 +3106,6 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_subsystem *subsys = ctrl->subsys; ida_simple_remove(&nvme_instance_ida, ctrl->instance); - ida_destroy(&ctrl->ns_ida); kfree(ctrl->effects); if (subsys) { @@ -3022,8 +3166,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, if (ret) goto out_free_name; - ida_init(&ctrl->ns_ida); - /* * Initialize latency tolerance controls. The sysfs files won't * be visible to userspace unless the device actually supports APST. 
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 8fc949c5b49b7..ba3d7f3349e5d 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -305,7 +305,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) int ret; c.identity.opcode = nvme_nvm_admin_identity; - c.identity.nsid = cpu_to_le32(ns->ns_id); + c.identity.nsid = cpu_to_le32(ns->head->ns_id); c.identity.chnl_off = 0; nvme_nvm_id = kmalloc(sizeof(struct nvme_nvm_id), GFP_KERNEL); @@ -344,7 +344,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, int ret = 0; c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; - c.l2p.nsid = cpu_to_le32(ns->ns_id); + c.l2p.nsid = cpu_to_le32(ns->head->ns_id); entries = kmalloc(len, GFP_KERNEL); if (!entries) return -ENOMEM; @@ -402,7 +402,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, int ret = 0; c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; - c.get_bb.nsid = cpu_to_le32(ns->ns_id); + c.get_bb.nsid = cpu_to_le32(ns->head->ns_id); c.get_bb.spba = cpu_to_le64(ppa.ppa); bb_tbl = kzalloc(tblsz, GFP_KERNEL); @@ -452,7 +452,7 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, int ret = 0; c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; - c.set_bb.nsid = cpu_to_le32(ns->ns_id); + c.set_bb.nsid = cpu_to_le32(ns->head->ns_id); c.set_bb.spba = cpu_to_le64(ppas->ppa); c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); c.set_bb.value = type; @@ -469,7 +469,7 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, struct nvme_nvm_command *c) { c->ph_rw.opcode = rqd->opcode; - c->ph_rw.nsid = cpu_to_le32(ns->ns_id); + c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id); c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); c->ph_rw.control = cpu_to_le16(rqd->flags); @@ -731,7 +731,7 @@ static int nvme_nvm_submit_vio(struct nvme_ns *ns, memset(&c, 0, sizeof(c)); c.ph_rw.opcode = vio.opcode; - c.ph_rw.nsid = cpu_to_le32(ns->ns_id); + c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id); c.ph_rw.control = cpu_to_le16(vio.control); c.ph_rw.length = cpu_to_le16(vio.nppas); @@ -768,7 +768,7 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, memset(&c, 0, sizeof(c)); c.common.opcode = vcmd.opcode; - c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.nsid = cpu_to_le32(ns->head->ns_id); c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); /* cdw11-12 */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2f1af915f93a5..6e5004b009753 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -136,7 +136,6 @@ struct nvme_ctrl { struct device ctrl_device; struct device *device; /* char device */ struct cdev cdev; - struct ida ns_ida; struct work_struct reset_work; struct work_struct delete_work; @@ -209,12 +208,14 @@ struct nvme_subsystem { struct list_head entry; struct mutex lock; struct list_head ctrls; + struct list_head nsheads; char subnqn[NVMF_NQN_SIZE]; char serial[20]; char model[40]; char firmware_rev[8]; u8 cmic; u16 vendor_id; + struct ida ns_ida; }; /* @@ -226,18 +227,35 @@ struct nvme_ns_ids { uuid_t uuid; }; +/* + * Anchor structure for namespaces. There is one for each namespace in a + * NVMe subsystem that any of our controllers can see, and the namespace + * structure for each controller is chained of it. 
For private namespaces + there is a 1:1 relation to our namespace structures, that is ->list + only ever has a single entry for private namespaces. + */ +struct nvme_ns_head { + struct list_head list; + struct srcu_struct srcu; + struct nvme_subsystem *subsys; + unsigned ns_id; + struct nvme_ns_ids ids; + struct list_head entry; + struct kref ref; + int instance; +}; + struct nvme_ns { struct list_head list; struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; + struct list_head siblings; struct nvm_dev *ndev; struct kref kref; - int instance; + struct nvme_ns_head *head; - unsigned ns_id; - struct nvme_ns_ids ids; int lba_shift; u16 ms; u16 sgs; -- cgit v1.2.3 From 32acab3181c7053c775ca128c3a5c6ce50197d7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 12:59:30 +0100 Subject: nvme: implement multipath access to nvme subsystems This patch adds native multipath support to the nvme driver. For each namespace we create only a single block device node, which can be used to access that namespace through any of the controllers that refer to it. The gendisk for each controller's path to the namespace still exists inside the kernel, but is hidden from userspace. The character device nodes are still available on a per-controller basis. A new link from the sysfs directory for the subsystem allows finding all controllers for a given subsystem. Currently we will always send I/O to the first available path; this will be changed once the NVMe Asynchronous Namespace Access (ANA) TP is ratified and implemented, at which point we will look at the ANA state for each namespace. Another possibility that was prototyped is to use the path that is closest to the submitting NUMA node, which will be mostly interesting for PCI, but might also be useful for RDMA or FC transports in the future. There is no plan to implement round robin or I/O service time path selectors, as those are not scalable with the performance rates provided by NVMe. The multipath device will go away once all paths to it disappear; any delay to keep it alive needs to be implemented at the controller level. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/Kconfig | 9 ++ drivers/nvme/host/Makefile | 1 + drivers/nvme/host/core.c | 147 +++++++++++++++--- drivers/nvme/host/multipath.c | 255 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 57 ++++++++++ 5 files changed, 455 insertions(+), 14 deletions(-) create mode 100644 drivers/nvme/host/multipath.c diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 46d6cb1e03bd0..b979cf3bce65f 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -13,6 +13,15 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. +config NVME_MULTIPATH + bool "NVMe multipath support" + depends on NVME_CORE + ---help--- + This option enables support for multipath access to NVMe + subsystems. If this option is enabled only a single + /dev/nvmeXnY device will show up for each NVMe namespaces, + even if it is accessible through multiple controllers. 
+ config NVME_FABRICS tristate diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index cc0aacb4c8b47..b856f2f549cda 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o nvme-core-y := core.o +nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-y += pci.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 13676f6cd4f67..59f80a613fd85 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -185,17 +185,22 @@ static inline bool nvme_req_needs_retry(struct request *req) return false; if (nvme_req(req)->retries >= nvme_max_retries) return false; - if (blk_queue_dying(req->q)) - return false; return true; } void nvme_complete_rq(struct request *req) { if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { - nvme_req(req)->retries++; - blk_mq_requeue_request(req, true); - return; + if (nvme_req_needs_failover(req)) { + nvme_failover_req(req); + return; + } + + if (!blk_queue_dying(req->q)) { + nvme_req(req)->retries++; + blk_mq_requeue_request(req, true); + return; + } } blk_mq_end_request(req, nvme_error_status(req)); @@ -286,7 +291,8 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, ctrl->state = new_state; spin_unlock_irqrestore(&ctrl->lock, flags); - + if (changed && ctrl->state == NVME_CTRL_LIVE) + nvme_kick_requeue_lists(ctrl); return changed; } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); @@ -296,6 +302,7 @@ static void nvme_free_ns_head(struct kref *ref) struct nvme_ns_head *head = container_of(ref, struct nvme_ns_head, ref); + nvme_mpath_remove_disk(head); ida_simple_remove(&head->subsys->ns_ida, head->instance); list_del_init(&head->entry); cleanup_srcu_struct(&head->srcu); @@ -1138,11 +1145,33 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return status; } -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +/* + * Issue ioctl requests on the first available path. Note that unlike normal + * block layer requests we will not retry failed request on another controller. 
+ */ +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, + struct nvme_ns_head **head, int *srcu_idx) { - struct nvme_ns *ns = bdev->bd_disk->private_data; +#ifdef CONFIG_NVME_MULTIPATH + if (disk->fops == &nvme_ns_head_ops) { + *head = disk->private_data; + *srcu_idx = srcu_read_lock(&(*head)->srcu); + return nvme_find_path(*head); + } +#endif + *head = NULL; + *srcu_idx = -1; + return disk->private_data; +} + +static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) +{ + if (head) + srcu_read_unlock(&head->srcu, idx); +} +static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg) +{ switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); @@ -1165,10 +1194,31 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, } } +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + ret = -EWOULDBLOCK; + else + ret = nvme_ns_ioctl(ns, cmd, arg); + nvme_put_ns_from_disk(head, srcu_idx); + return ret; +} + static int nvme_open(struct block_device *bdev, fmode_t mode) { struct nvme_ns *ns = bdev->bd_disk->private_data; +#ifdef CONFIG_NVME_MULTIPATH + /* should never be called due to GENHD_FL_HIDDEN */ + if (WARN_ON_ONCE(ns->head->disk)) + return -ENXIO; +#endif if (!kref_get_unless_zero(&ns->kref)) return -ENXIO; return 0; @@ -1329,6 +1379,10 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->noiob) nvme_set_chunk_size(ns); nvme_update_disk_info(disk, ns, id); +#ifdef CONFIG_NVME_MULTIPATH + if (ns->head->disk) + nvme_update_disk_info(ns->head->disk, ns, id); +#endif } static int nvme_revalidate_disk(struct gendisk *disk) @@ -1388,8 +1442,10 @@ static char nvme_pr_type(enum pr_type type) static int nvme_pr_command(struct block_device *bdev, u32 cdw10, u64 key, u64 sa_key, u8 op) { - struct nvme_ns *ns = bdev->bd_disk->private_data; + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; struct nvme_command c; + int srcu_idx, ret; u8 data[16] = { 0, }; put_unaligned_le64(key, &data[0]); @@ -1397,10 +1453,16 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10, memset(&c, 0, sizeof(c)); c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->head->ns_id); + c.common.nsid = cpu_to_le32(head->ns_id); c.common.cdw10[0] = cpu_to_le32(cdw10); - return nvme_submit_sync_cmd(ns->queue, &c, data, 16); + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + ret = -EWOULDBLOCK; + else + ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); + nvme_put_ns_from_disk(head, srcu_idx); + return ret; } static int nvme_pr_register(struct block_device *bdev, u64 old, @@ -1490,6 +1552,32 @@ static const struct block_device_operations nvme_fops = { .pr_ops = &nvme_pr_ops, }; +#ifdef CONFIG_NVME_MULTIPATH +static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +{ + struct nvme_ns_head *head = bdev->bd_disk->private_data; + + if (!kref_get_unless_zero(&head->ref)) + return -ENXIO; + return 0; +} + +static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns_head(disk->private_data); +} + +const struct block_device_operations nvme_ns_head_ops = { + .owner = THIS_MODULE, + .open = nvme_ns_head_open, + .release = nvme_ns_head_release, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_ioctl, + .getgeo = nvme_getgeo, + .pr_ops = 
&nvme_pr_ops, +}; +#endif /* CONFIG_NVME_MULTIPATH */ + static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) { unsigned long timeout = @@ -2592,6 +2680,10 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_cleanup_srcu; } + ret = nvme_mpath_alloc_disk(ctrl, head); + if (ret) + goto out_cleanup_srcu; + list_add_tail(&head->entry, &ctrl->subsys->nsheads); return head; out_cleanup_srcu: @@ -2704,7 +2796,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = dev_to_node(ctrl->dev); + int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; bool new = true; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); @@ -2735,7 +2827,30 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (nvme_init_ns_head(ns, nsid, id, &new)) goto out_free_id; +#ifdef CONFIG_NVME_MULTIPATH + /* + * If multipathing is enabled we need to always use the subsystem + * instance number for numbering our devices to avoid conflicts + * between subsystems that have multiple controllers and thus use + * the multipath-aware subsystem node and those that have a single + * controller and use the controller node directly. + */ + if (ns->head->disk) { + sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, + ctrl->cntlid, ns->head->instance); + flags = GENHD_FL_HIDDEN; + } else { + sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, + ns->head->instance); + } +#else + /* + * But without the multipath code enabled, multiple controller per + * subsystems are visible as devices and thus we cannot use the + * subsystem instance. + */ sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); +#endif if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk_name, node)) { @@ -2751,7 +2866,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; - disk->flags = GENHD_FL_EXT_DEVT; + disk->flags = flags; memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); ns->disk = disk; @@ -2773,6 +2888,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ns->ndev && nvme_nvm_register_sysfs(ns)) pr_warn("%s: failed to register lightnvm sysfs group for identification\n", ns->disk->disk_name); + + if (new) + nvme_mpath_add_disk(ns->head); return; out_unlink_ns: mutex_lock(&ctrl->subsys->lock); @@ -2805,6 +2923,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) } mutex_lock(&ns->ctrl->subsys->lock); + nvme_mpath_clear_current_path(ns); if (head) list_del_rcu(&ns->siblings); mutex_unlock(&ns->ctrl->subsys->lock); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c new file mode 100644 index 0000000000000..062754ebebfd6 --- /dev/null +++ b/drivers/nvme/host/multipath.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2017 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include +#include "nvme.h" + +static bool multipath = true; +module_param(multipath, bool, 0644); +MODULE_PARM_DESC(multipath, + "turn on native support for multiple controllers per subsystem"); + +void nvme_failover_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + unsigned long flags; + + spin_lock_irqsave(&ns->head->requeue_lock, flags); + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + blk_mq_end_request(req, 0); + + nvme_reset_ctrl(ns->ctrl); + kblockd_schedule_work(&ns->head->requeue_work); +} + +bool nvme_req_needs_failover(struct request *req) +{ + if (!(req->cmd_flags & REQ_NVME_MPATH)) + return false; + + switch (nvme_req(req)->status & 0x7ff) { + /* + * Generic command status: + */ + case NVME_SC_INVALID_OPCODE: + case NVME_SC_INVALID_FIELD: + case NVME_SC_INVALID_NS: + case NVME_SC_LBA_RANGE: + case NVME_SC_CAP_EXCEEDED: + case NVME_SC_RESERVATION_CONFLICT: + return false; + + /* + * I/O command set specific error. Unfortunately these values are + * reused for fabrics commands, but those should never get here. + */ + case NVME_SC_BAD_ATTRIBUTES: + case NVME_SC_INVALID_PI: + case NVME_SC_READ_ONLY: + case NVME_SC_ONCS_NOT_SUPPORTED: + WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode == + nvme_fabrics_command); + return false; + + /* + * Media and Data Integrity Errors: + */ + case NVME_SC_WRITE_FAULT: + case NVME_SC_READ_ERROR: + case NVME_SC_GUARD_CHECK: + case NVME_SC_APPTAG_CHECK: + case NVME_SC_REFTAG_CHECK: + case NVME_SC_COMPARE_FAILED: + case NVME_SC_ACCESS_DENIED: + case NVME_SC_UNWRITTEN_BLOCK: + return false; + } + + /* Everything else could be a path failure, so should be retried */ + return true; +} + +void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->head->disk) + kblockd_schedule_work(&ns->head->requeue_work); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) +{ + struct nvme_ns *ns; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + if (ns->ctrl->state == NVME_CTRL_LIVE) { + rcu_assign_pointer(head->current_path, ns); + return ns; + } + } + + return NULL; +} + +inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) +{ + struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); + + if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) + ns = __nvme_find_path(head); + return ns; +} + +static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, + struct bio *bio) +{ + struct nvme_ns_head *head = q->queuedata; + struct device *dev = disk_to_dev(head->disk); + struct nvme_ns *ns; + blk_qc_t ret = BLK_QC_T_NONE; + int srcu_idx; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (likely(ns)) { + bio->bi_disk = ns->disk; + bio->bi_opf |= REQ_NVME_MPATH; + ret = direct_make_request(bio); + } else if (!list_empty_careful(&head->list)) { + dev_warn_ratelimited(dev, "no path available - requeing I/O\n"); + + spin_lock_irq(&head->requeue_lock); + bio_list_add(&head->requeue_list, bio); + spin_unlock_irq(&head->requeue_lock); + } else { + dev_warn_ratelimited(dev, "no path - failing I/O\n"); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } + + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + +static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) +{ + struct nvme_ns_head *head = q->queuedata; + 
struct nvme_ns *ns; + bool found = false; + int srcu_idx; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = srcu_dereference(head->current_path, &head->srcu); + if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) + found = ns->queue->poll_fn(q, qc); + srcu_read_unlock(&head->srcu, srcu_idx); + return found; +} + +static void nvme_requeue_work(struct work_struct *work) +{ + struct nvme_ns_head *head = + container_of(work, struct nvme_ns_head, requeue_work); + struct bio *bio, *next; + + spin_lock_irq(&head->requeue_lock); + next = bio_list_get(&head->requeue_list); + spin_unlock_irq(&head->requeue_lock); + + while ((bio = next) != NULL) { + next = bio->bi_next; + bio->bi_next = NULL; + + /* + * Reset disk to the mpath node and resubmit to select a new + * path. + */ + bio->bi_disk = head->disk; + generic_make_request(bio); + } +} + +int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) +{ + struct request_queue *q; + bool vwc = false; + + bio_list_init(&head->requeue_list); + spin_lock_init(&head->requeue_lock); + INIT_WORK(&head->requeue_work, nvme_requeue_work); + + /* + * Add a multipath node if the subsystems supports multiple controllers. + * We also do this for private namespaces as the namespace sharing data could + * change after a rescan. + */ + if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) + return 0; + + q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + if (!q) + goto out; + q->queuedata = head; + blk_queue_make_request(q, nvme_ns_head_make_request); + q->poll_fn = nvme_ns_head_poll; + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + /* set to a default value for 512 until disk is validated */ + blk_queue_logical_block_size(q, 512); + + /* we need to propagate up the VMC settings */ + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(q, vwc, vwc); + + head->disk = alloc_disk(0); + if (!head->disk) + goto out_cleanup_queue; + head->disk->fops = &nvme_ns_head_ops; + head->disk->private_data = head; + head->disk->queue = q; + head->disk->flags = GENHD_FL_EXT_DEVT; + sprintf(head->disk->disk_name, "nvme%dn%d", + ctrl->subsys->instance, head->instance); + return 0; + +out_cleanup_queue: + blk_cleanup_queue(q); +out: + return -ENOMEM; +} + +void nvme_mpath_add_disk(struct nvme_ns_head *head) +{ + if (!head->disk) + return; + device_add_disk(&head->subsys->dev, head->disk); +} + +void nvme_mpath_remove_disk(struct nvme_ns_head *head) +{ + if (!head->disk) + return; + del_gendisk(head->disk); + blk_set_queue_dying(head->disk->queue); + /* make sure all pending bios are cleaned up */ + kblockd_schedule_work(&head->requeue_work); + flush_work(&head->requeue_work); + blk_cleanup_queue(head->disk->queue); + put_disk(head->disk); +} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6e5004b009753..dd6ced664b459 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -95,6 +95,11 @@ struct nvme_request { u16 status; }; +/* + * Mark a bio as coming in through the mpath node. + */ +#define REQ_NVME_MPATH REQ_DRV + enum { NVME_REQ_CANCELLED = (1 << 0), }; @@ -235,6 +240,13 @@ struct nvme_ns_ids { * only ever has a single entry for private namespaces. 
*/ struct nvme_ns_head { +#ifdef CONFIG_NVME_MULTIPATH + struct gendisk *disk; + struct nvme_ns __rcu *current_path; + struct bio_list requeue_list; + spinlock_t requeue_lock; + struct work_struct requeue_work; +#endif struct list_head list; struct srcu_struct srcu; struct nvme_subsystem *subsys; @@ -384,6 +396,51 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); +extern const struct block_device_operations nvme_ns_head_ops; + +#ifdef CONFIG_NVME_MULTIPATH +void nvme_failover_req(struct request *req); +bool nvme_req_needs_failover(struct request *req); +void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); +int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); +void nvme_mpath_add_disk(struct nvme_ns_head *head); +void nvme_mpath_remove_disk(struct nvme_ns_head *head); + +static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + + if (head && ns == srcu_dereference(head->current_path, &head->srcu)) + rcu_assign_pointer(head->current_path, NULL); +} +struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); +#else +static inline void nvme_failover_req(struct request *req) +{ +} +static inline bool nvme_req_needs_failover(struct request *req) +{ + return false; +} +static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) +{ +} +static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, + struct nvme_ns_head *head) +{ + return 0; +} +static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) +{ +} +static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) +{ +} +static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ +} +#endif /* CONFIG_NVME_MULTIPATH */ + #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); -- cgit v1.2.3 From 5b85b826b8925b3f1910b6613eb82c4df240a5b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:51:03 +0100 Subject: nvme: also expose the namespace identification sysfs files for mpath nodes We do this by adding a helper that returns the ns_head for a device that can belong to either the per-controller or per-subsystem block device nodes, and otherwise reuse all the existing code. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 57 +++++++++++++++++++++++-------------------- drivers/nvme/host/multipath.c | 6 +++++ drivers/nvme/host/nvme.h | 1 + 3 files changed, 38 insertions(+), 26 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 59f80a613fd85..438d3fa126084 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2382,12 +2382,22 @@ static ssize_t nvme_sysfs_rescan(struct device *dev, } static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); +static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (disk->fops == &nvme_fops) + return nvme_get_ns_from_dev(dev)->head; + else + return disk->private_data; +} + static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->head->ids; - struct nvme_subsystem *subsys = ns->ctrl->subsys; + struct nvme_ns_head *head = dev_to_ns_head(dev); + struct nvme_ns_ids *ids = &head->ids; + struct nvme_subsystem *subsys = head->subsys; int serial_len = sizeof(subsys->serial); int model_len = sizeof(subsys->model); @@ -2409,23 +2419,21 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, serial_len, subsys->serial, model_len, subsys->model, - ns->head->ns_id); + head->ns_id); } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->head->ids.nguid); + return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); } static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->head->ids; + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; /* For backward compatibility expose the NGUID to userspace if * we have no UUID set @@ -2440,22 +2448,20 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); static ssize_t eui_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8ph\n", ns->head->ids.eui64); + return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%d\n", ns->head->ns_id); + return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); } static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); -static struct attribute *nvme_ns_attrs[] = { +static struct attribute *nvme_ns_id_attrs[] = { &dev_attr_wwid.attr, &dev_attr_uuid.attr, &dev_attr_nguid.attr, @@ -2464,12 +2470,11 @@ static struct attribute *nvme_ns_attrs[] = { NULL, }; -static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, +static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); - 
struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->head->ids; + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; if (a == &dev_attr_uuid.attr) { if (uuid_is_null(&ids->uuid) || @@ -2487,9 +2492,9 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, return a->mode; } -static const struct attribute_group nvme_ns_attr_group = { - .attrs = nvme_ns_attrs, - .is_visible = nvme_ns_attrs_are_visible, +const struct attribute_group nvme_ns_id_attr_group = { - .attrs = nvme_ns_id_attrs, + .attrs = nvme_ns_id_attrs, + .is_visible = nvme_ns_id_attrs_are_visible, }; #define nvme_show_str_function(field) \ @@ -2882,7 +2887,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) device_add_disk(ctrl->device, ns->disk); if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, - &nvme_ns_attr_group)) + &nvme_ns_id_attr_group)) pr_warn("%s: failed to create sysfs group for identification\n", ns->disk->disk_name); if (ns->ndev && nvme_nvm_register_sysfs(ns)) @@ -2915,7 +2920,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, - &nvme_ns_attr_group); + &nvme_ns_id_attr_group); if (ns->ndev) nvme_nvm_unregister_sysfs(ns); del_gendisk(ns->disk); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 062754ebebfd6..850275896e498 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -239,12 +239,18 @@ void nvme_mpath_add_disk(struct nvme_ns_head *head) if (!head->disk) return; device_add_disk(&head->subsys->dev, head->disk); + if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group)) + pr_warn("%s: failed to create sysfs group for identification\n", + head->disk->disk_name); } void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; + sysfs_remove_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group); del_gendisk(head->disk); blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index dd6ced664b459..895a4330dda00 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -396,6 +396,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); +extern const struct attribute_group nvme_ns_id_attr_group; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH -- cgit v1.2.3 From 17eac0996341cfd28b4093554e662fd82aa1c237 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 9 Nov 2017 17:57:06 +0100 Subject: block: create 'slaves' and 'holders' entries for hidden gendisks When creating nvme multipath devices we should populate the 'slaves' and 'holders' directories properly to aid userspace topology detection. Signed-off-by: Hannes Reinecke [hch: split from a larger patch] Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 835e907e6e015..3de1671631bfd 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,14 +585,14 @@ static void register_disk(struct device *parent, struct gendisk *disk) */ pm_runtime_set_memalloc_noio(ddev, true); + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + if (disk->flags & GENHD_FL_HIDDEN) { dev_set_uevent_suppress(ddev, 0); return; } - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - /* No minors to use for partitions */ if (!disk_part_scan_enabled(disk)) goto exit; @@ -728,11 +728,11 @@ void del_gendisk(struct gendisk *disk) WARN_ON(1); } - if (!(disk->flags & GENHD_FL_HIDDEN)) { + if (!(disk->flags & GENHD_FL_HIDDEN)) blk_unregister_region(disk_devt(disk), disk->minors); - kobject_put(disk->part0.holder_dir); - kobject_put(disk->slave_dir); - } + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; -- cgit v1.2.3 From e9a48034d7d1318ece7d4a235838a86c94db9d68 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 9 Nov 2017 17:57:06 +0100 Subject: nvme: create 'slaves' and 'holders' entries for hidden controllers When creating nvme multipath devices we should populate the 'slaves' and 'holders' directories properly to aid userspace topology detection. Signed-off-by: Hannes Reinecke [hch: split from a larger patch, compile fix for CONFIG_NVME_MULTIPATH=n] Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 ++ drivers/nvme/host/multipath.c | 30 ++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 8 ++++++++ 3 files changed, 40 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 438d3fa126084..c3182aa654fd6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2896,6 +2896,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (new) nvme_mpath_add_disk(ns->head); + nvme_mpath_add_disk_links(ns); return; out_unlink_ns: mutex_lock(&ctrl->subsys->lock); @@ -2919,6 +2920,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (ns->disk && ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); + nvme_mpath_remove_disk_links(ns); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_id_attr_group); if (ns->ndev) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 850275896e498..78d92151a9042 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -245,6 +245,25 @@ void nvme_mpath_add_disk(struct nvme_ns_head *head) head->disk->disk_name); } +void nvme_mpath_add_disk_links(struct nvme_ns *ns) +{ + struct kobject *slave_disk_kobj, *holder_disk_kobj; + + if (!ns->head->disk) + return; + + slave_disk_kobj = &disk_to_dev(ns->disk)->kobj; + if (sysfs_create_link(ns->head->disk->slave_dir, slave_disk_kobj, + kobject_name(slave_disk_kobj))) + return; + + holder_disk_kobj = &disk_to_dev(ns->head->disk)->kobj; + if (sysfs_create_link(ns->disk->part0.holder_dir, holder_disk_kobj, + kobject_name(holder_disk_kobj))) + sysfs_remove_link(ns->head->disk->slave_dir, + kobject_name(slave_disk_kobj)); +} + void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) @@ -259,3 +278,14 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) blk_cleanup_queue(head->disk->queue); put_disk(head->disk); } + +void nvme_mpath_remove_disk_links(struct nvme_ns *ns) +{ + if (!ns->head->disk) + return; + + sysfs_remove_link(ns->disk->part0.holder_dir, + kobject_name(&disk_to_dev(ns->head->disk)->kobj)); + sysfs_remove_link(ns->head->disk->slave_dir, + kobject_name(&disk_to_dev(ns->disk)->kobj)); +} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 895a4330dda00..c0873a68872fb 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -405,7 +405,9 @@ bool nvme_req_needs_failover(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns_head *head); +void nvme_mpath_add_disk_links(struct nvme_ns *ns); void nvme_mpath_remove_disk(struct nvme_ns_head *head); +void nvme_mpath_remove_disk_links(struct nvme_ns *ns); static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { @@ -437,6 +439,12 @@ static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) { } +static inline void nvme_mpath_add_disk_links(struct nvme_ns *ns) +{ +} +static inline void nvme_mpath_remove_disk_links(struct nvme_ns *ns) +{ +} static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { } -- cgit v1.2.3 From 1e496938b6aec67b7cfd06614164fee0fe9f13e3 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 10 Nov 2017 10:58:23 +0100 Subject: nvme: expose subsys attribute to sysfs We should be exposing the 
subsystem attributes like 'model' and 'subsysnqn' to sysfs to allow for easier identification of the subsystem. Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c3182aa654fd6..7b3bbc1a9ac49 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1999,6 +1999,53 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) return NULL; } +#define SUBSYS_ATTR_RO(_name, _mode, _show) \ + struct device_attribute subsys_attr_##_name = \ + __ATTR(_name, _mode, _show, NULL) + +static ssize_t nvme_subsys_show_nqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn); +} +static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); + +#define nvme_subsys_show_str_function(field) \ +static ssize_t subsys_##field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_subsystem *subsys = \ + container_of(dev, struct nvme_subsystem, dev); \ + return sprintf(buf, "%.*s\n", \ + (int)sizeof(subsys->field), subsys->field); \ +} \ +static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); + +nvme_subsys_show_str_function(model); +nvme_subsys_show_str_function(serial); +nvme_subsys_show_str_function(firmware_rev); + +static struct attribute *nvme_subsys_attrs[] = { + &subsys_attr_model.attr, + &subsys_attr_serial.attr, + &subsys_attr_firmware_rev.attr, + &subsys_attr_subsysnqn.attr, + NULL, +}; + +static struct attribute_group nvme_subsys_attrs_group = { + .attrs = nvme_subsys_attrs, +}; + +static const struct attribute_group *nvme_subsys_attrs_groups[] = { + &nvme_subsys_attrs_group, + NULL, +}; + static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { struct nvme_subsystem *subsys, *found; @@ -2026,6 +2073,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys->dev.class = nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; + subsys->dev.groups = nvme_subsys_attrs_groups; dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); device_initialize(&subsys->dev); -- cgit v1.2.3 From 0e78eccc552b6a9e9ce158aee72e95ae41e0e171 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Nov 2017 09:17:50 +0100 Subject: xtensa/simdisk: fix compile error Fixes: d004a5e7d4dd ("block: remove __bio_kmap_atomic") Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- arch/xtensa/platforms/iss/simdisk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index eacf1e433518b..1b6418407467d 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -116,7 +116,7 @@ static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio) simdisk_transfer(dev, sector, len, buffer, bio_data_dir(bio) == WRITE); sector += len; - kunmap_atomic(buffer) + kunmap_atomic(buffer); } bio_endio(bio); -- cgit v1.2.3 From 67f2519fe2903c4041c0e94394d14d372fe51399 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Tue, 24 Oct 2017 11:21:48 -0600 Subject: fs: guard_bio_eod() needs to consider partitions guard_bio_eod() needs to look at the partition capacity, not 
just the capacity of the whole device, when determining if truncation is necessary. [ 60.268688] attempt to access beyond end of device [ 60.268690] unknown-block(9,1): rw=0, want=67103509, limit=67103506 [ 60.268693] buffer_io_error: 2 callbacks suppressed [ 60.268696] Buffer I/O error on dev md1p7, logical block 4524305, async page read Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index") Cc: stable@vger.kernel.org # v4.13 Reviewed-by: Christoph Hellwig Signed-off-by: Greg Edwards Signed-off-by: Jens Axboe --- fs/buffer.c | 10 +++++++++- include/linux/genhd.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index bff571dc7bc3b..bcabb69e7462e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3015,8 +3015,16 @@ void guard_bio_eod(int op, struct bio *bio) sector_t maxsector; struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned truncated_bytes; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); - maxsector = get_capacity(bio->bi_disk); if (!maxsector) return; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 93aae3476f58b..ca10cc292187d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -243,6 +243,7 @@ static inline dev_t part_devt(struct hd_struct *part) return part_to_dev(part)->devt; } +extern struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); static inline void disk_put_part(struct hd_struct *part) -- cgit v1.2.3 From f0fba398fec65ff5a2e1bf8ae62718ec0450abaf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 6 Nov 2017 17:53:53 +0000 Subject: block: avoid null pointer dereference on null disk It is possible that the pointer disk can be null and hence we can get a null pointer deference when accessing disk->flags. Add a null pointer check to avoid the dereference. Detected by CoverityScan, CID#1461133 ("Explicit null dereferenced") Fixes: 8ddcd653257c ("block: introduce GENHD_FL_HIDDEN") Reviewed-by: Christoph Hellwig Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 3de1671631bfd..997e598f3b86e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -798,7 +798,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } - if (unlikely(disk->flags & GENHD_FL_HIDDEN)) { + if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { put_disk(disk); disk = NULL; } -- cgit v1.2.3 From 79f720a751cad613620d0237e3b44f89f4a69181 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Nov 2017 09:13:21 -0700 Subject: blk-mq: only run the hardware queue if IO is pending Currently we are inconsistent in when we decide to run the queue. Using blk_mq_run_hw_queues() we check if the hctx has pending IO before running it, but we don't do that from the individual queue run function, blk_mq_run_hw_queue(). This results in a lot of extra and pointless queue runs, potentially, on flush requests and (much worse) on tag starvation situations. This is observable just looking at top output, with lots of kworkers active. For the !async runs, it just adds to the CPU overhead of blk-mq. Move the has-pending check into the run function instead of having callers do it. 
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 7 +------ block/blk-mq.c | 18 +++++++++++------- block/blk-mq.h | 2 -- include/linux/blk-mq.h | 2 +- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 6f4bdb8209f7d..c117bd8fd1f61 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -81,12 +81,7 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) } else clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) { - blk_mq_run_hw_queue(hctx, true); - return true; - } - - return false; + return blk_mq_run_hw_queue(hctx, true); } /* diff --git a/block/blk-mq.c b/block/blk-mq.c index bfe24a5b62a36..a2a4271f5ab87 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -61,10 +61,10 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) /* * Check if any of the ctx's have pending work in this hardware queue */ -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return sbitmap_any_bit_set(&hctx->ctx_map) || - !list_empty_careful(&hctx->dispatch) || + return !list_empty_careful(&hctx->dispatch) || + sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } @@ -1253,9 +1253,14 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - __blk_mq_delay_run_hw_queue(hctx, async, 0); + if (blk_mq_hctx_has_pending(hctx)) { + __blk_mq_delay_run_hw_queue(hctx, async, 0); + return true; + } + + return false; } EXPORT_SYMBOL(blk_mq_run_hw_queue); @@ -1265,8 +1270,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (!blk_mq_hctx_has_pending(hctx) || - blk_mq_hctx_stopped(hctx)) + if (blk_mq_hctx_stopped(hctx)) continue; blk_mq_run_hw_queue(hctx, async); diff --git a/block/blk-mq.h b/block/blk-mq.h index 99a19c5523e2b..dcf379a892dda 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -26,14 +26,12 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b326208277ee4..eb1e2cdffb317 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -266,7 +266,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
--
cgit v1.2.3


From 15f7b41f70ddcca3b555bd0fdc7c8da7466b517e Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Fri, 10 Nov 2017 12:29:34 -0500
Subject: brd: remove unused brd_mutex

Remove the unused mutex brd_mutex. It has been unused since commit
ff26956875c2 ("brd: remove support for BLKFLSBUF").

Signed-off-by: Mikulas Patocka
Signed-off-by: Jens Axboe
---
 drivers/block/brd.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 2d7178f7754ed..c1cf87718c2e0 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -60,7 +60,6 @@ struct brd_device {
 /*
  * Look up and return a brd's page for a given sector.
 */
-static DEFINE_MUTEX(brd_mutex);
 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
 {
 	pgoff_t idx;
--
cgit v1.2.3


From f906a6a0f42684715b552ccff9282b22cfe2b5a2 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 9 Nov 2017 16:10:13 -0700
Subject: blk-mq: improve tag waiting setup for non-shared tags

If we run out of driver tags, we currently treat shared and non-shared
tags the same - both cases hook into the tag waitqueue. This is more
costly than it needs to be for unshared tags, since we have to grab both
the hctx lock and the waitqueue lock (and disable interrupts). For the
non-shared case, we can simply mark the queue as needing a restart.

Split blk_mq_dispatch_wait_add() to account for both cases, and rename it
to blk_mq_mark_tag_wait() to better reflect what it does now.

Without this patch, shared and non-shared performance is about the same
with 4 fio threads hammering on a single null_blk device (~410K at 75%
sys). With the patch, the shared case is the same, but the non-shared
tags case runs at 431K at 71% sys.

Reviewed-by: Omar Sandoval
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 81 +++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 55 insertions(+), 26 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a2a4271f5ab87..3295859f419dd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1006,44 +1006,68 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 	return 1;
 }
 
-static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx,
-				     struct request *rq)
+/*
+ * Mark us waiting for a tag. For shared tags, this involves hooking us into
+ * the tag wakeups. For non-shared tags, we can simply mark us nedeing a
+ * restart. For both caes, take care to check the condition again after
+ * marking us as waiting.
+ */
+static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
+				 struct request *rq)
 {
 	struct blk_mq_hw_ctx *this_hctx = *hctx;
-	wait_queue_entry_t *wait = &this_hctx->dispatch_wait;
+	bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
 	struct sbq_wait_state *ws;
+	wait_queue_entry_t *wait;
+	bool ret;
 
-	if (!list_empty_careful(&wait->entry))
-		return false;
+	if (!shared_tags) {
+		if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
+			set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
+	} else {
+		wait = &this_hctx->dispatch_wait;
+		if (!list_empty_careful(&wait->entry))
+			return false;
 
-	spin_lock(&this_hctx->lock);
-	if (!list_empty(&wait->entry)) {
-		spin_unlock(&this_hctx->lock);
-		return false;
-	}
+		spin_lock(&this_hctx->lock);
+		if (!list_empty(&wait->entry)) {
+			spin_unlock(&this_hctx->lock);
+			return false;
+		}
 
-	ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
-	add_wait_queue(&ws->wait, wait);
+		ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
+		add_wait_queue(&ws->wait, wait);
+	}
 
 	/*
 	 * It's possible that a tag was freed in the window between the
 	 * allocation failure and adding the hardware queue to the wait
 	 * queue.
 	 */
-	if (!blk_mq_get_driver_tag(rq, hctx, false)) {
+	ret = blk_mq_get_driver_tag(rq, hctx, false);
+
+	if (!shared_tags) {
+		/*
+		 * Don't clear RESTART here, someone else could have set it.
+		 * At most this will cost an extra queue run.
+		 */
+		return ret;
+	} else {
+		if (!ret) {
+			spin_unlock(&this_hctx->lock);
+			return false;
+		}
+
+		/*
+		 * We got a tag, remove ourselves from the wait queue to ensure
+		 * someone else gets the wakeup.
+		 */
+		spin_lock_irq(&ws->wait.lock);
+		list_del_init(&wait->entry);
+		spin_unlock_irq(&ws->wait.lock);
 		spin_unlock(&this_hctx->lock);
-		return false;
+		return true;
 	}
-
-	/*
-	 * We got a tag, remove ourselves from the wait queue to ensure
-	 * someone else gets the wakeup.
-	 */
-	spin_lock_irq(&ws->wait.lock);
-	list_del_init(&wait->entry);
-	spin_unlock_irq(&ws->wait.lock);
-	spin_unlock(&this_hctx->lock);
-	return true;
 }
 
 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
@@ -1076,10 +1100,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			 * before we add this entry back on the dispatch list,
 			 * we'll re-run it below.
 			 */
-			if (!blk_mq_dispatch_wait_add(&hctx, rq)) {
+			if (!blk_mq_mark_tag_wait(&hctx, rq)) {
 				if (got_budget)
 					blk_mq_put_dispatch_budget(hctx);
-				no_tag = true;
+				/*
+				 * For non-shared tags, the RESTART check
+				 * will suffice.
+				 */
+				if (hctx->flags & BLK_MQ_F_TAG_SHARED)
+					no_tag = true;
 				break;
 			}
 		}
--
cgit v1.2.3


From 8dc7a31fbce5e2dbbacd83d910da37105181b054 Mon Sep 17 00:00:00 2001
From: Hongxu Jia
Date: Fri, 10 Nov 2017 15:59:17 +0800
Subject: ide: ide-atapi: fix compile error with defining macro DEBUG

Compiling ide-atapi with the macro "DEBUG" defined fails:

...
|drivers/ide/ide-atapi.c:285:52: error: 'struct request' has no member named 'cmd'; did you mean 'csd'?
|  debug_log("%s: rq->cmd[0]: 0x%x\n", __func__, rq->cmd[0]);
...
Since we split scsi_request out of struct request, the debug_log()
statements missed the corresponding conversion.

Fixes: 82ed4db499b8 ("block: split scsi_request out of struct request")
Signed-off-by: Hongxu Jia
Signed-off-by: Jens Axboe
---
 drivers/ide/ide-atapi.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 14d1e7d9a1d6f..0e6bc631a1caf 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -282,7 +282,7 @@ int ide_cd_expiry(ide_drive_t *drive)
 	struct request *rq = drive->hwif->rq;
 	unsigned long wait = 0;
 
-	debug_log("%s: rq->cmd[0]: 0x%x\n", __func__, rq->cmd[0]);
+	debug_log("%s: scsi_req(rq)->cmd[0]: 0x%x\n", __func__, scsi_req(rq)->cmd[0]);
 
 	/*
 	 * Some commands are *slow* and normally take a long time to complete.
@@ -463,7 +463,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			return ide_do_reset(drive);
 		}
 
-		debug_log("[cmd %x]: check condition\n", rq->cmd[0]);
+		debug_log("[cmd %x]: check condition\n", scsi_req(rq)->cmd[0]);
 
 		/* Retry operation */
 		ide_retry_pc(drive);
@@ -531,7 +531,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		ide_pad_transfer(drive, write, bcount);
 
 	debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n",
-		  rq->cmd[0], done, bcount, scsi_req(rq)->resid_len);
+		  scsi_req(rq)->cmd[0], done, bcount, scsi_req(rq)->resid_len);
 
 	/* And set the interrupt handler again */
 	ide_set_handler(drive, ide_pc_intr, timeout);
--
cgit v1.2.3


From ff821d271415f17f1a704600420e92ebb9bfb32c Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 10 Nov 2017 22:05:12 -0700
Subject: blk-mq: fixup some comment typos and lengths

Various typos and/or spelling errors in comments. Fixes a few
> 80 char lines as well.

Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3295859f419dd..b600463791ec8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -707,7 +707,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 
 	/*
 	 * We abuse this flag that is otherwise used by the I/O scheduler to
-	 * request head insertation from the workqueue.
+	 * request head insertion from the workqueue.
 	 */
 	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
 
@@ -1137,7 +1137,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		if (ret == BLK_STS_RESOURCE) {
 			/*
 			 * If an I/O scheduler has been configured and we got a
-			 * driver tag for the next request already, free it again.
+			 * driver tag for the next request already, free it
+			 * again.
 			 */
 			if (!list_empty(list)) {
 				nxt = list_first_entry(list, struct request, queuelist);
@@ -2299,8 +2300,11 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 
 	mutex_lock(&set->tag_list_lock);
 
-	/* Check to see if we're transitioning to shared (from 1 to 2 queues). */
-	if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
+	/*
+	 * Check to see if we're transitioning to shared (from 1 to 2 queues).
+	 */
+	if (!list_empty(&set->tag_list) &&
+	    !(set->flags & BLK_MQ_F_TAG_SHARED)) {
 		set->flags |= BLK_MQ_F_TAG_SHARED;
 
 		/* update existing queue */
 		blk_mq_update_tag_set_depth(set, true);
@@ -2532,10 +2536,9 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 
 	/*
 	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
-	 * we should change hctx numa_node according to new topology (this
-	 * involves free and re-allocate memory, worthy doing?)
+	 * we should change hctx numa_node according to the new topology (this
+	 * involves freeing and re-allocating memory, worth doing?)
 	 */
-
 	blk_mq_map_swqueue(q);
 
 	blk_mq_sysfs_register(q);
--
cgit v1.2.3


From a04b5de5050ab8b891128eb2c47a0916fe8622e1 Mon Sep 17 00:00:00 2001
From: Martin Wilck
Date: Thu, 28 Sep 2017 21:33:23 +0200
Subject: nvme: fix visibility of "uuid" ns attribute

"uuid" must be invisible if both ns->uuid and ns->nguid are unset, not
if either one is.

Fixes: d934f9848a77 "nvme: provide UUID value to userspace"
Signed-off-by: Martin Wilck
[hch: rebased to the nvme-4.15 tree to help resolving a conflict]
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7b3bbc1a9ac49..993813ccdc0b6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2525,7 +2525,7 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
 	struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
 
 	if (a == &dev_attr_uuid.attr) {
-		if (uuid_is_null(&ids->uuid) ||
+		if (uuid_is_null(&ids->uuid) &&
 		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
 			return 0;
 	}
--
cgit v1.2.3
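
To restate the logic of the one-character fix above: the "uuid" attribute
should be hidden only when neither identifier is present. A minimal sketch
of the corrected check (the helper name here is illustrative only, not the
in-tree code):

	/* Hide the "uuid" attribute only if both the UUID and the NGUID are unset. */
	static bool nvme_ns_uuid_attr_hidden(struct nvme_ns_ids *ids)
	{
		return uuid_is_null(&ids->uuid) &&
		       !memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
	}

With the previous '||', a namespace that had only one of the two identifiers
set would already hide the attribute, which is exactly the case the fix
addresses.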