From 28b89b9e6f7b6c8fef7b3af39828722bca20cfee Mon Sep 17 00:00:00 2001
From: Joonwoo Park <joonwoop@codeaurora.org>
Date: Sun, 11 Sep 2016 21:14:58 -0700
Subject: cpuset: handle race between CPU hotplug and cpuset_hotplug_work

A discrepancy between cpu_online_mask and cpuset's effective_cpus
mask is inevitable during hotplug since cpuset defers updating of
effective_cpus mask using a workqueue, during which time nothing
prevents the system from more hotplug operations.  For that reason
guarantee_online_cpus() walks up the cpuset hierarchy until it finds
an intersection under the assumption that top cpuset's effective_cpus
mask intersects with cpu_online_mask even with such a race occurring.

However a sequence of CPU hotplugs can open a time window, during which
none of the effective CPUs in the top cpuset intersect with
cpu_online_mask.

For example when there are 4 possible CPUs 0-3 and only CPU0 is online:

  ========================  ===========================
   cpu_online_mask           top_cpuset.effective_cpus
  ========================  ===========================
   echo 1 > cpu2/online.
   CPU hotplug notifier woke up hotplug work but not yet scheduled.
      [0,2]                     [0]

   echo 0 > cpu0/online.
   The workqueue is still runnable.
      [2]                       [0]
  ========================  ===========================

  Now there is no intersection between cpu_online_mask and
  top_cpuset.effective_cpus.  Thus invoking sys_sched_setaffinity() at
  this moment can cause following:

   Unable to handle kernel NULL pointer dereference at virtual address 000000d0
   ------------[ cut here ]------------
   Kernel BUG at ffffffc0001389b0 [verbose debug info unavailable]
   Internal error: Oops - BUG: 96000005 [#1] PREEMPT SMP
   Modules linked in:
   CPU: 2 PID: 1420 Comm: taskset Tainted: G        W       4.4.8+ #98
   task: ffffffc06a5c4880 ti: ffffffc06e124000 task.ti: ffffffc06e124000
   PC is at guarantee_online_cpus+0x2c/0x58
   LR is at cpuset_cpus_allowed+0x4c/0x6c
   <snip>
   Process taskset (pid: 1420, stack limit = 0xffffffc06e124020)
   Call trace:
   [<ffffffc0001389b0>] guarantee_online_cpus+0x2c/0x58
   [<ffffffc00013b208>] cpuset_cpus_allowed+0x4c/0x6c
   [<ffffffc0000d61f0>] sched_setaffinity+0xc0/0x1ac
   [<ffffffc0000d6374>] SyS_sched_setaffinity+0x98/0xac
   [<ffffffc000085cb0>] el0_svc_naked+0x24/0x28

The top cpuset's effective_cpus are guaranteed to be identical to
cpu_online_mask eventually.  Hence fall back to cpu_online_mask when
there is no intersection between top cpuset's effective_cpus and
cpu_online_mask.

Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: cgroups@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: <stable@vger.kernel.org> # 3.17+
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cpuset.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'kernel')
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c27e53326befe..c08585ad996be 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = {
 /*
  * Return in pmask the portion of a cpusets's cpus_allowed that
  * are online.  If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.  The top
- * cpuset always has some cpus online.
+ * until we find one that does have some online cpus.
  *
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_mask.
@@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = {
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
-	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
+	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
 		cs = parent_cs(cs);
+		if (unlikely(!cs)) {
+			/*
+			 * The top cpuset doesn't have any online cpu as a
+			 * consequence of a race between cpuset_hotplug_work
+			 * and cpu hotplug notifier.  But we know the top
+			 * cpuset's effective_cpus is on its way to to be
+			 * identical to cpu_online_mask.
+			 */
+			cpumask_copy(pmask, cpu_online_mask);
+			return;
+		}
+	}
 	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
 }
 
-- 
cgit v1.2.3


From 8a15b81741879fa89601b03c0e50b0d780d65bc0 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Fri, 16 Sep 2016 13:02:37 +0000
Subject: cpuset: fix non static symbol warning

Fixes the following sparse warning:

kernel/cpuset.c:2088:6: warning:
 symbol 'cpuset_fork' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c08585ad996be..2b4c20ab5bbe1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2085,7 +2085,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
  * which could have been changed by cpuset just after it inherits the
  * state from the parent and before it sits on the cgroup's task list.
  */
-void cpuset_fork(struct task_struct *task)
+static void cpuset_fork(struct task_struct *task)
 {
 	if (task_css_is_root(task, cpuset_cgrp_id))
 		return;
-- 
cgit v1.2.3


From 9157056da8f8c4a6305f15619e269f164b63a6de Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 23 Sep 2016 16:55:49 -0400
Subject: cgroup: fix invalid controller enable rejections with cgroup
 namespace

On the v2 hierarchy, "cgroup.subtree_control" rejects controller
enables if the cgroup has processes in it.  The enforcement of this
logic assumes that the cgroup wouldn't have any css_sets associated
with it if there are no tasks in the cgroup, which is no longer true
since a79a908fd2b0 ("cgroup: introduce cgroup namespaces").

When a cgroup namespace is created, it pins the css_set of the
creating task to use it as the root css_set of the namespace.  This
extra reference stays as long as the namespace is around and makes
"cgroup.subtree_control" think that the namespace root cgroup is not
empty even when it is and thus reject controller enables.

Fix it by making cgroup_subtree_control() walk and test emptiness of
each css_set instead of testing whether the list_head is empty.

While at it, update the comment of cgroup_task_count() to indicate
that the returned value may be higher than the number of tasks, which
has always been true due to temporary references and doesn't break
anything.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Evgeny Vereshchagin <evvers@ya.ru>
Cc: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Cc: Aditya Kali <adityakali@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: stable@vger.kernel.org # v4.6+
Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces")
Link: https://github.com/systemd/systemd/pull/3589#issuecomment-249089541
---
 kernel/cgroup.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1c51b7f5221f..0d4ee1ea5c316 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3446,9 +3446,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 	 * Except for the root, subtree_control must be zero for a cgroup
 	 * with tasks so that child cgroups don't compete against tasks.
 	 */
-	if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
-		ret = -EBUSY;
-		goto out_unlock;
+	if (enable && cgroup_parent(cgrp)) {
+		struct cgrp_cset_link *link;
+
+		/*
+		 * Because namespaces pin csets too, @cgrp->cset_links
+		 * might not be empty even when @cgrp is empty.  Walk and
+		 * verify each cset.
+		 */
+		spin_lock_irq(&css_set_lock);
+
+		ret = 0;
+		list_for_each_entry(link, &cgrp->cset_links, cset_link) {
+			if (css_set_populated(link->cset)) {
+				ret = -EBUSY;
+				break;
+			}
+		}
+
+		spin_unlock_irq(&css_set_lock);
+
+		if (ret)
+			goto out_unlock;
 	}
 
 	/* save and update control masks and prepare csses */
@@ -3899,7 +3918,9 @@ void cgroup_file_notify(struct cgroup_file *cfile)
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
  *
- * Return the number of tasks in the cgroup.
+ * Return the number of tasks in the cgroup.  The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
  */
 static int cgroup_task_count(const struct cgroup *cgrp)
 {
-- 
cgit v1.2.3