From 49f4d8b93ccf9454284b6f524b96c66d8d7fbccc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 2 Aug 2012 04:25:10 -0700 Subject: pidns: Capture the user namespace and filter ns_last_pid - Capture the the user namespace that creates the pid namespace - Use that user namespace to test if it is ok to write to /proc/sys/kernel/ns_last_pid. Zhao Hongjiang noticed I was missing a put_user_ns in when destroying a pid_ns. I have foloded his patch into this one so that bisects will work properly. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/pid.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index aebd4f5aaf41..2a624f1486e1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -78,6 +78,7 @@ struct pid_namespace init_pid_ns = { .last_pid = 0, .level = 0, .child_reaper = &init_task, + .user_ns = &init_user_ns, }; EXPORT_SYMBOL_GPL(init_pid_ns); -- cgit v1.2.3 From 17cf22c33e1f1b5e435469c84e43872579497653 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 14:51:53 -0800 Subject: pidns: Use task_active_pid_ns where appropriate The expressions tsk->nsproxy->pid_ns and task_active_pid_ns aka ns_of_pid(task_pid(tsk)) should have the same number of cache line misses with the practical difference that ns_of_pid(task_pid(tsk)) is released later in a processes life. Furthermore by using task_active_pid_ns it becomes trivial to write an unshare implementation for the the pid namespace. So I have used task_active_pid_ns everywhere I can. In fork since the pid has not yet been attached to the process I use ns_of_pid, to achieve the same effect. Signed-off-by: Eric W. Biederman --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- arch/um/drivers/mconsole_kern.c | 2 +- drivers/staging/android/binder.c | 3 ++- fs/hppfs/hppfs.c | 2 +- fs/proc/root.c | 2 +- kernel/cgroup.c | 2 +- kernel/events/core.c | 2 +- kernel/fork.c | 2 +- kernel/nsproxy.c | 2 +- kernel/pid.c | 8 ++++---- kernel/signal.c | 2 +- kernel/sysctl_binary.c | 2 +- 12 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel/pid.c') diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 965d381abd75..25db92a8e1cf 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -1094,7 +1094,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private) LOAD_INT(c), LOAD_FRAC(c), count_active_contexts(), atomic_read(&nr_spu_contexts), - current->nsproxy->pid_ns->last_pid); + task_active_pid_ns(current)->last_pid); return 0; } diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 79ccfe6c7078..7fc71c628267 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -123,7 +123,7 @@ void mconsole_log(struct mc_request *req) void mconsole_proc(struct mc_request *req) { - struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; + struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt; char *buf; int len; struct file *file; diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 5d4610babd8a..a97bbcd1c9ea 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "binder.h" @@ -2344,7 +2345,7 @@ retry: if (t->from) { struct task_struct *sender = t->from->proc->tsk; tr.sender_pid = task_tgid_nr_ns(sender, - current->nsproxy->pid_ns); + task_active_pid_ns(current)); } else { tr.sender_pid = 0; } diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 78f21f8dc2ec..43b315f2002b 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) struct vfsmount *proc_mnt; int err = -ENOENT; - proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); + proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); if (IS_ERR(proc_mnt)) goto out; diff --git a/fs/proc/root.c b/fs/proc/root.c index 13ef6247e7a3..fc1609321a78 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -106,7 +106,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = (struct pid_namespace *)data; options = NULL; } else { - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); options = data; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f24f724620dd..0dbfba2efa77 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3390,7 +3390,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = current->nsproxy->pid_ns; + struct pid_namespace *ns = task_active_pid_ns(current); /* * We can't drop the pidlist_mutex before taking the l->mutex in case diff --git a/kernel/events/core.c b/kernel/events/core.c index dbccf83c134d..738f3564e83b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->parent = parent_event; - event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..7798c247f4b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1442,7 +1442,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; + ns_of_pid(pid)->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ca27d2c5264d..acc92680381a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -84,7 +84,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; diff --git a/kernel/pid.c b/kernel/pid.c index 2a624f1486e1..3a5f238c1ca0 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { - return find_pid_ns(nr, current->nsproxy->pid_ns); + return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); @@ -429,7 +429,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) @@ -484,7 +484,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { - return pid_nr_ns(pid, current->nsproxy->pid_ns); + return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); @@ -495,7 +495,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; diff --git a/kernel/signal.c b/kernel/signal.c index 0af8868525d6..b2445d86f226 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1752,7 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 65bdcf198d4e..5a6384450501 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, goto out_putname; } - mnt = current->nsproxy->pid_ns->proc_mnt; + mnt = task_active_pid_ns(current)->proc_mnt; file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) -- cgit v1.2.3 From 0a01f2cc390e10633a54f72c608cc3fe19a50c3d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2012 10:33:47 -0700 Subject: pidns: Make the pidns proc mount/umount logic obvious. Track the number of pids in the proc hash table. When the number of pids goes to 0 schedule work to unmount the kernel mount of proc. Move the mount of proc into alloc_pid when we allocate the pid for init. Remove the surprising calls of pid_ns_release proc in fork and proc_flush_task. Those code paths really shouldn't know about proc namespace implementation details and people have demonstrated several times that finding and understanding those code paths is difficult and non-obvious. Because of the call path detach pid is alwasy called with the rtnl_lock held free_pid is not allowed to sleep, so the work to unmounting proc is moved to a work queue. This has the side benefit of not blocking the entire world waiting for the unnecessary rcu_barrier in deactivate_locked_super. In the process of making the code clear and obvious this fixes a bug reported by Gao feng where we would leak a mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns succeeded and copy_net_ns failed. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. Biederman" --- fs/proc/base.c | 4 ---- fs/proc/root.c | 5 ----- include/linux/pid_namespace.h | 2 ++ kernel/fork.c | 2 -- kernel/pid.c | 21 +++++++++++++++++---- kernel/pid_namespace.c | 14 +++++++------- 6 files changed, 26 insertions(+), 22 deletions(-) (limited to 'kernel/pid.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6177fc238fdb..7621dc51cff8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, diff --git a/fs/proc/root.c b/fs/proc/root.c index fc1609321a78..f2f251158d35 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -155,11 +155,6 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - err = pid_ns_prepare_proc(&init_pid_ns); - if (err) { - unregister_filesystem(&proc_fs_type); - return; - } proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c89c9cfcd247..4c96acdb2489 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -21,6 +21,7 @@ struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; + int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -32,6 +33,7 @@ struct pid_namespace { struct bsd_acct_struct *bacct; #endif struct user_namespace *user_ns; + struct work_struct proc_work; kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ diff --git a/kernel/fork.c b/kernel/fork.c index 7798c247f4b9..666dc8b06606 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1476,8 +1476,6 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - if (unlikely(clone_flags & CLONE_NEWPID)) - pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) diff --git a/kernel/pid.c b/kernel/pid.c index 3a5f238c1ca0..e957f8b09136 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -270,8 +271,12 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); + for (i = 0; i <= pid->level; i++) { + struct upid *upid = pid->numbers + i; + hlist_del_rcu(&upid->pid_chain); + if (--upid->ns->nr_hashed == 0) + schedule_work(&upid->ns->proc_work); + } spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; + pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } + if (unlikely(is_child_reaper(pid))) { + if (pid_ns_prepare_proc(ns)) + goto out_free; + } + get_pid_ns(ns); - pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) + for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + upid->ns->nr_hashed++; + } spin_unlock_irq(&pidmap_lock); out: @@ -570,6 +582,7 @@ void __init pidmap_init(void) /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); + init_pid_ns.nr_hashed = 1; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b2604950aa50..84591cfeefc1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -72,6 +72,12 @@ err_alloc: return NULL; } +static void proc_cleanup_work(struct work_struct *work) +{ + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); + pid_ns_release_proc(ns); +} + /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 @@ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - return ns; -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); - put_user_ns(user_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: -- cgit v1.2.3 From 5e1182deb81ae8c68494017c4a8a71811659c870 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Jul 2010 18:50:25 -0700 Subject: pidns: Don't allow new processes in a dead pid namespace. Set nr_hashed to -1 just before we schedule the work to cleanup proc. Test nr_hashed just before we hash a new pid and if nr_hashed is < 0 fail. This guaranteees that processes never enter a pid namespaces after we have cleaned up the state to support processes in a pid namespace. Currently sending SIGKILL to all of the process in a pid namespace as init exists gives us this guarantee but we need something a little stronger to support unsharing and joining a pid namespace. Acked-by: "Serge E. Hallyn" Signed-off-by: Eric W. Biederman --- kernel/pid.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/pid.c') diff --git a/kernel/pid.c b/kernel/pid.c index e957f8b09136..9c219117af36 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -274,8 +274,10 @@ void free_pid(struct pid *pid) for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; hlist_del_rcu(&upid->pid_chain); - if (--upid->ns->nr_hashed == 0) + if (--upid->ns->nr_hashed == 0) { + upid->ns->nr_hashed = -1; schedule_work(&upid->ns->proc_work); + } } spin_unlock_irqrestore(&pidmap_lock, flags); @@ -321,6 +323,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); + if (ns->nr_hashed < 0) + goto out_unlock; for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); @@ -331,6 +335,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) out: return pid; +out_unlock: + spin_unlock(&pidmap_lock); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); -- cgit v1.2.3 From af4b8a83add95ef40716401395b44a1b579965f4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2012 15:03:42 -0700 Subject: pidns: Wait in zap_pid_ns_processes until pid_ns->nr_hashed == 1 Looking at pid_ns->nr_hashed is a bit simpler and it works for disjoint process trees that an unshare or a join of a pid_namespace may create. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 12 ------------ kernel/pid.c | 16 +++++++++++++--- kernel/pid_namespace.c | 15 ++++----------- 3 files changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel/pid.c') diff --git a/kernel/exit.c b/kernel/exit.c index 346616c0092c..d7fe58db4527 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); - /* - * If we are the last child process in a pid namespace to be - * reaped, notify the reaper sleeping zap_pid_ns_processes(). - */ - if (IS_ENABLED(CONFIG_PID_NS)) { - struct task_struct *parent = p->real_parent; - - if ((task_active_pid_ns(parent)->child_reaper == parent) && - list_empty(&parent->children) && - (parent->flags & PF_EXITING)) - wake_up_process(parent); - } } list_del_rcu(&p->thread_group); } diff --git a/kernel/pid.c b/kernel/pid.c index 9c219117af36..6e8da291de49 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -273,10 +273,20 @@ void free_pid(struct pid *pid) spin_lock_irqsave(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; + struct pid_namespace *ns = upid->ns; hlist_del_rcu(&upid->pid_chain); - if (--upid->ns->nr_hashed == 0) { - upid->ns->nr_hashed = -1; - schedule_work(&upid->ns->proc_work); + switch(--ns->nr_hashed) { + case 1: + /* When all that is left in the pid namespace + * is the reaper wake up the reaper. The reaper + * may be sleeping in zap_pid_ns_processes(). + */ + wake_up_process(ns->child_reaper); + break; + case 0: + ns->nr_hashed = -1; + schedule_work(&ns->proc_work); + break; } } spin_unlock_irqrestore(&pidmap_lock, flags); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 84591cfeefc1..3cc29b830e9e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -217,22 +217,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see __unhash_process(). + * Make sure they all go away, see free_pid(). */ for (;;) { - bool need_wait = false; - - read_lock(&tasklist_lock); - if (!list_empty(¤t->children)) { - __set_current_state(TASK_UNINTERRUPTIBLE); - need_wait = true; - } - read_unlock(&tasklist_lock); - - if (!need_wait) + set_current_state(TASK_UNINTERRUPTIBLE); + if (pid_ns->nr_hashed == 1) break; schedule(); } + __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; -- cgit v1.2.3 From 98f842e675f96ffac96e6c50315790912b2812be Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Jun 2011 10:21:48 -0700 Subject: proc: Usable inode numbers for the namespace file descriptors. Assign a unique proc inode to each namespace, and use that inode number to ensure we only allocate at most one proc inode for every namespace in proc. A single proc inode per namespace allows userspace to test to see if two processes are in the same namespace. This has been a long requested feature and only blocked because a naive implementation would put the id in a global space and would ultimately require having a namespace for the names of namespaces, making migration and certain virtualization tricks impossible. We still don't have per superblock inode numbers for proc, which appears necessary for application unaware checkpoint/restart and migrations (if the application is using namespace file descriptors) but that is now allowd by the design if it becomes important. I have preallocated the ipc and uts initial proc inode numbers so their structures can be statically initialized. Signed-off-by: Eric W. Biederman --- fs/mount.h | 1 + fs/namespace.c | 14 ++++++++++++++ fs/proc/namespaces.c | 24 ++++++++++++++---------- include/linux/ipc_namespace.h | 2 ++ include/linux/pid_namespace.h | 1 + include/linux/proc_fs.h | 7 ++++++- include/linux/user_namespace.h | 1 + include/linux/utsname.h | 1 + include/net/net_namespace.h | 2 ++ init/version.c | 2 ++ ipc/msgutil.c | 2 ++ ipc/namespace.c | 16 ++++++++++++++++ kernel/pid.c | 1 + kernel/pid_namespace.c | 12 ++++++++++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 15 +++++++++++++++ kernel/utsname.c | 17 ++++++++++++++++- net/core/net_namespace.c | 24 ++++++++++++++++++++++++ 18 files changed, 132 insertions(+), 12 deletions(-) (limited to 'kernel/pid.c') diff --git a/fs/mount.h b/fs/mount.h index 630fafc616bb..cd5007980400 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -4,6 +4,7 @@ struct mnt_namespace { atomic_t count; + unsigned int proc_inum; struct mount * root; struct list_head list; struct user_namespace *user_ns; diff --git a/fs/namespace.c b/fs/namespace.c index cab78a74aca3..c1bbe86f4920 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2301,6 +2301,7 @@ dput_out: static void free_mnt_ns(struct mnt_namespace *ns) { + proc_free_inum(ns->proc_inum); put_user_ns(ns->user_ns); kfree(ns); } @@ -2317,10 +2318,16 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; + int ret; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); + ret = proc_alloc_inum(&new_ns->proc_inum); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); new_ns->root = NULL; @@ -2799,10 +2806,17 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int mntns_inum(void *ns) +{ + struct mnt_namespace *mnt_ns = ns; + return mnt_ns->proc_inum; +} + const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, + .inum = mntns_inum, }; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 7a6d8d69cdb8..b7a47196c8c3 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -82,7 +82,7 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, return ERR_PTR(-ENOMEM); } - inode = new_inode(sb); + inode = iget_locked(sb, ns_ops->inum(ns)); if (!inode) { dput(dentry); ns_ops->put(ns); @@ -90,13 +90,17 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, } ei = PROC_I(inode); - inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_op = &ns_inode_operations; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &ns_inode_operations; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + unlock_new_inode(inode); + } else { + ns_ops->put(ns); + } d_set_d_op(dentry, &ns_dentry_operations); result = d_instantiate_unique(dentry, inode); @@ -162,12 +166,12 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!ns) goto out_put_task; - snprintf(name, sizeof(name), "%s", ns_ops->name); + snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); len = strlen(name); if (len > buflen) len = buflen; - if (copy_to_user(buffer, ns_ops->name, len)) + if (copy_to_user(buffer, name, len)) len = -EFAULT; ns_ops->put(ns); diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index f03af702a39d..fe771978e877 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -67,6 +67,8 @@ struct ipc_namespace { /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; + + unsigned int proc_inum; }; extern struct ipc_namespace init_ipc_ns; diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 4c96acdb2489..bf285999273a 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -37,6 +37,7 @@ struct pid_namespace { kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ + unsigned int proc_inum; }; extern struct pid_namespace init_pid_ns; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index bf1d000fbba6..2e24018b7cec 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -28,7 +28,11 @@ struct mm_struct; */ enum { - PROC_ROOT_INO = 1, + PROC_ROOT_INO = 1, + PROC_IPC_INIT_INO = 0xEFFFFFFFU, + PROC_UTS_INIT_INO = 0xEFFFFFFEU, + PROC_USER_INIT_INO = 0xEFFFFFFDU, + PROC_PID_INIT_INO = 0xEFFFFFFCU, }; /* @@ -263,6 +267,7 @@ struct proc_ns_operations { void *(*get)(struct task_struct *task); void (*put)(void *ns); int (*install)(struct nsproxy *nsproxy, void *ns); + unsigned int (*inum)(void *ns); }; extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 17651f08d67f..b9bd2e6c73cc 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -25,6 +25,7 @@ struct user_namespace { struct user_namespace *parent; kuid_t owner; kgid_t group; + unsigned int proc_inum; }; extern struct user_namespace init_user_ns; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 221f4a0a7502..239e27733d6c 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -23,6 +23,7 @@ struct uts_namespace { struct kref kref; struct new_utsname name; struct user_namespace *user_ns; + unsigned int proc_inum; }; extern struct uts_namespace init_uts_ns; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index c5a43f56b796..de644bcd8613 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -56,6 +56,8 @@ struct net { struct user_namespace *user_ns; /* Owning user namespace */ + unsigned int proc_inum; + struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; diff --git a/init/version.c b/init/version.c index 86fe0ccb997a..58170f18912d 100644 --- a/init/version.c +++ b/init/version.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef CONFIG_KALLSYMS #define version(a) Version_ ## a @@ -34,6 +35,7 @@ struct uts_namespace init_uts_ns = { .domainname = UTS_DOMAINNAME, }, .user_ns = &init_user_ns, + .proc_inum = PROC_UTS_INIT_INO, }; EXPORT_SYMBOL_GPL(init_uts_ns); diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 26143d377c95..6471f1bdae96 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "util.h" @@ -30,6 +31,7 @@ DEFINE_SPINLOCK(mq_lock); struct ipc_namespace init_ipc_ns = { .count = ATOMIC_INIT(1), .user_ns = &init_user_ns, + .proc_inum = PROC_IPC_INIT_INO, }; atomic_t nr_ipc_ns = ATOMIC_INIT(1); diff --git a/ipc/namespace.c b/ipc/namespace.c index 72c868277793..cf3386a51de2 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -26,9 +26,16 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + atomic_set(&ns->count, 1); err = mq_init_ns(ns); if (err) { + proc_free_inum(ns->proc_inum); kfree(ns); return ERR_PTR(err); } @@ -111,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) */ ipcns_notify(IPCNS_REMOVED); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -172,10 +180,18 @@ static int ipcns_install(struct nsproxy *nsproxy, void *new) return 0; } +static unsigned int ipcns_inum(void *vp) +{ + struct ipc_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, + .inum = ipcns_inum, }; diff --git a/kernel/pid.c b/kernel/pid.c index 6e8da291de49..3026ddae0a34 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -80,6 +80,7 @@ struct pid_namespace init_pid_ns = { .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, + .proc_inum = PROC_PID_INIT_INO, }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 68508d330634..560da0dab230 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -107,6 +107,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_map; + err = proc_alloc_inum(&ns->proc_inum); + if (err) + goto out_free_map; + kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); @@ -133,6 +137,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; + proc_free_inum(ns->proc_inum); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); put_user_ns(ns->user_ns); @@ -345,12 +350,19 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int pidns_inum(void *ns) +{ + struct pid_namespace *pid_ns = ns; + return pid_ns->proc_inum; +} + const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, + .inum = pidns_inum, }; static __init int pid_namespaces_init(void) diff --git a/kernel/user.c b/kernel/user.c index 750acffbe9ec..33acb5e53a5f 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * userns count is 1 for root user, 1 for init_uts_ns, @@ -51,6 +52,7 @@ struct user_namespace init_user_ns = { }, .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, + .proc_inum = PROC_USER_INIT_INO, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 89f6eaed067a..f5975ccf9348 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -58,6 +58,7 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; + int ret; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who @@ -71,6 +72,12 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; + ret = proc_alloc_inum(&ns->proc_inum); + if (ret) { + kmem_cache_free(user_ns_cachep, ns); + return ret; + } + kref_init(&ns->kref); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; @@ -103,6 +110,7 @@ void free_user_ns(struct kref *kref) container_of(kref, struct user_namespace, kref); parent = ns->parent; + proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); put_user_ns(parent); } @@ -808,12 +816,19 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) return commit_creds(cred); } +static unsigned int userns_inum(void *ns) +{ + struct user_namespace *user_ns = ns; + return user_ns->proc_inum; +} + const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, + .inum = userns_inum, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index fdc619eb61ef..f6336d51d64c 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -36,11 +36,18 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + int err; ns = create_uts_ns(); if (!ns) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); @@ -77,6 +84,7 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -114,11 +122,18 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) return 0; } +static unsigned int utsns_inum(void *vp) +{ + struct uts_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .inum = utsns_inum, }; - diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ec2870b44c1f..2e9a3132b8dd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -381,6 +381,21 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +static __net_init int net_ns_net_init(struct net *net) +{ + return proc_alloc_inum(&net->proc_inum); +} + +static __net_exit void net_ns_net_exit(struct net *net) +{ + proc_free_inum(net->proc_inum); +} + +static struct pernet_operations __net_initdata net_ns_ops = { + .init = net_ns_net_init, + .exit = net_ns_net_exit, +}; + static int __init net_ns_init(void) { struct net_generic *ng; @@ -412,6 +427,8 @@ static int __init net_ns_init(void) mutex_unlock(&net_mutex); + register_pernet_subsys(&net_ns_ops); + return 0; } @@ -640,11 +657,18 @@ static int netns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int netns_inum(void *ns) +{ + struct net *net = ns; + return net->proc_inum; +} + const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, + .inum = netns_inum, }; #endif -- cgit v1.2.3