From 28b89b9e6f7b6c8fef7b3af39828722bca20cfee Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Sun, 11 Sep 2016 21:14:58 -0700 Subject: cpuset: handle race between CPU hotplug and cpuset_hotplug_work A discrepancy between cpu_online_mask and cpuset's effective_cpus mask is inevitable during hotplug since cpuset defers updating of effective_cpus mask using a workqueue, during which time nothing prevents the system from more hotplug operations. For that reason guarantee_online_cpus() walks up the cpuset hierarchy until it finds an intersection under the assumption that top cpuset's effective_cpus mask intersects with cpu_online_mask even with such a race occurring. However a sequence of CPU hotplugs can open a time window, during which none of the effective CPUs in the top cpuset intersect with cpu_online_mask. For example when there are 4 possible CPUs 0-3 and only CPU0 is online: ======================== =========================== cpu_online_mask top_cpuset.effective_cpus ======================== =========================== echo 1 > cpu2/online. CPU hotplug notifier woke up hotplug work but not yet scheduled. [0,2] [0] echo 0 > cpu0/online. The workqueue is still runnable. [2] [0] ======================== =========================== Now there is no intersection between cpu_online_mask and top_cpuset.effective_cpus. Thus invoking sys_sched_setaffinity() at this moment can cause following: Unable to handle kernel NULL pointer dereference at virtual address 000000d0 ------------[ cut here ]------------ Kernel BUG at ffffffc0001389b0 [verbose debug info unavailable] Internal error: Oops - BUG: 96000005 [#1] PREEMPT SMP Modules linked in: CPU: 2 PID: 1420 Comm: taskset Tainted: G W 4.4.8+ #98 task: ffffffc06a5c4880 ti: ffffffc06e124000 task.ti: ffffffc06e124000 PC is at guarantee_online_cpus+0x2c/0x58 LR is at cpuset_cpus_allowed+0x4c/0x6c Process taskset (pid: 1420, stack limit = 0xffffffc06e124020) Call trace: [] guarantee_online_cpus+0x2c/0x58 [] cpuset_cpus_allowed+0x4c/0x6c [] sched_setaffinity+0xc0/0x1ac [] SyS_sched_setaffinity+0x98/0xac [] el0_svc_naked+0x24/0x28 The top cpuset's effective_cpus are guaranteed to be identical to cpu_online_mask eventually. Hence fall back to cpu_online_mask when there is no intersection between top cpuset's effective_cpus and cpu_online_mask. Signed-off-by: Joonwoo Park Acked-by: Li Zefan Cc: Tejun Heo Cc: cgroups@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: # 3.17+ Signed-off-by: Tejun Heo --- kernel/cpuset.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c27e53326befe..c08585ad996be 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. The top - * cpuset always has some cpus online. + * until we find one that does have some online cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. @@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = { */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { cs = parent_cs(cs); + if (unlikely(!cs)) { + /* + * The top cpuset doesn't have any online cpu as a + * consequence of a race between cpuset_hotplug_work + * and cpu hotplug notifier. But we know the top + * cpuset's effective_cpus is on its way to to be + * identical to cpu_online_mask. + */ + cpumask_copy(pmask, cpu_online_mask); + return; + } + } cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } -- cgit v1.2.3 From 8a15b81741879fa89601b03c0e50b0d780d65bc0 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 16 Sep 2016 13:02:37 +0000 Subject: cpuset: fix non static symbol warning Fixes the following sparse warning: kernel/cpuset.c:2088:6: warning: symbol 'cpuset_fork' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Tejun Heo --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c08585ad996be..2b4c20ab5bbe1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2085,7 +2085,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) * which could have been changed by cpuset just after it inherits the * state from the parent and before it sits on the cgroup's task list. */ -void cpuset_fork(struct task_struct *task) +static void cpuset_fork(struct task_struct *task) { if (task_css_is_root(task, cpuset_cgrp_id)) return; -- cgit v1.2.3 From 9157056da8f8c4a6305f15619e269f164b63a6de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 23 Sep 2016 16:55:49 -0400 Subject: cgroup: fix invalid controller enable rejections with cgroup namespace On the v2 hierarchy, "cgroup.subtree_control" rejects controller enables if the cgroup has processes in it. The enforcement of this logic assumes that the cgroup wouldn't have any css_sets associated with it if there are no tasks in the cgroup, which is no longer true since a79a908fd2b0 ("cgroup: introduce cgroup namespaces"). When a cgroup namespace is created, it pins the css_set of the creating task to use it as the root css_set of the namespace. This extra reference stays as long as the namespace is around and makes "cgroup.subtree_control" think that the namespace root cgroup is not empty even when it is and thus reject controller enables. Fix it by making cgroup_subtree_control() walk and test emptiness of each css_set instead of testing whether the list_head is empty. While at it, update the comment of cgroup_task_count() to indicate that the returned value may be higher than the number of tasks, which has always been true due to temporary references and doesn't break anything. Signed-off-by: Tejun Heo Reported-by: Evgeny Vereshchagin Cc: Serge E. Hallyn Cc: Aditya Kali Cc: Eric W. Biederman Cc: stable@vger.kernel.org # v4.6+ Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Link: https://github.com/systemd/systemd/pull/3589#issuecomment-249089541 --- kernel/cgroup.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221f..0d4ee1ea5c316 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3446,9 +3446,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { - ret = -EBUSY; - goto out_unlock; + if (enable && cgroup_parent(cgrp)) { + struct cgrp_cset_link *link; + + /* + * Because namespaces pin csets too, @cgrp->cset_links + * might not be empty even when @cgrp is empty. Walk and + * verify each cset. + */ + spin_lock_irq(&css_set_lock); + + ret = 0; + list_for_each_entry(link, &cgrp->cset_links, cset_link) { + if (css_set_populated(link->cset)) { + ret = -EBUSY; + break; + } + } + + spin_unlock_irq(&css_set_lock); + + if (ret) + goto out_unlock; } /* save and update control masks and prepare csses */ @@ -3899,7 +3918,9 @@ void cgroup_file_notify(struct cgroup_file *cfile) * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * - * Return the number of tasks in the cgroup. + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. */ static int cgroup_task_count(const struct cgroup *cgrp) { -- cgit v1.2.3