From d91ee87d8d85a0808c01787e8b4a6b48f2ba487b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 May 2016 14:51:59 -0500 Subject: vfs: Pass data, ns, and ns->userns to mount_ns Today what is normally called data (the mount options) is not passed to fill_super through mount_ns. Pass the mount options and the namespace separately to mount_ns so that filesystems such as proc that have mount options, can use mount_ns. Pass the user namespace to mount_ns so that the standard permission check that verifies the mounter has permissions over the namespace can be performed in mount_ns instead of in each filesystems .mount method. Thus removing the duplication between mqueuefs and proc in terms of permission checks. The extra permission check does not currently affect the rpc_pipefs filesystem and the nfsd filesystem as those filesystems do not currently allow unprivileged mounts. Without unpvileged mounts it is guaranteed that the caller has already passed capable(CAP_SYS_ADMIN) which guarantees extra permission check will pass. Update rpc_pipefs and the nfsd filesystem to ensure that the network namespace reference is always taken in fill_super and always put in kill_sb so that the logic is simpler and so that errors originating inside of fill_super do not cause a network namespace leak. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/super.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index d78b9847e6cb..fd65667832e5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -918,12 +918,19 @@ static int ns_set_super(struct super_block *sb, void *data) return set_anon_super(sb, NULL); } -struct dentry *mount_ns(struct file_system_type *fs_type, int flags, - void *data, int (*fill_super)(struct super_block *, void *, int)) +struct dentry *mount_ns(struct file_system_type *fs_type, + int flags, void *data, void *ns, struct user_namespace *user_ns, + int (*fill_super)(struct super_block *, void *, int)) { struct super_block *sb; - sb = sget(fs_type, ns_test_super, ns_set_super, flags, data); + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN + * over the namespace. + */ + if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns); if (IS_ERR(sb)) return ERR_CAST(sb); -- cgit v1.2.3 From 6e4eab577a0cae15b3da9b888cff16fe57981b3e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 24 May 2016 09:29:01 -0500 Subject: fs: Add user namespace member to struct super_block Start marking filesystems with a user namespace owner, s_user_ns. In this change this is only used for permission checks of who may mount a filesystem. Ultimately s_user_ns will be used for translating ids and checking capabilities for filesystems mounted from user namespaces. The default policy for setting s_user_ns is implemented in sget(), which arranges for s_user_ns to be set to current_user_ns() and to ensure that the mounter of the filesystem has CAP_SYS_ADMIN in that user_ns. The guts of sget are split out into another function sget_userns(). The function sget_userns calls alloc_super with the specified user namespace or it verifies the existing superblock that was found has the expected user namespace, and fails with EBUSY when it is not. This failing prevents users with the wrong privileges mounting a filesystem. The reason for the split of sget_userns from sget is that in some cases such as mount_ns and kernfs_mount_ns a different policy for permission checking of mounts and setting s_user_ns is necessary, and the existence of sget_userns() allows those policies to be implemented. The helper mount_ns is expected to be used for filesystems such as proc and mqueuefs which present per namespace information. The function mount_ns is modified to call sget_userns instead of sget to ensure the user namespace owner of the namespace whose information is presented by the filesystem is used on the superblock. For sysfs and cgroup the appropriate permission checks are already in place, and kernfs_mount_ns is modified to call sget_userns so that the init_user_ns is the only user namespace used. For the cgroup filesystem cgroup namespace mounts are bind mounts of a subset of the full cgroup filesystem and as such s_user_ns must be the same for all of them as there is only a single superblock. Mounts of sysfs that vary based on the network namespace could in principle change s_user_ns but it keeps the analysis and implementation of kernfs simpler if that is not supported, and at present there appear to be no benefits from supporting a different s_user_ns on any sysfs mount. Getting the details of setting s_user_ns correct has been a long process. Thanks to Pavel Tikhorirorv who spotted a leak in sget_userns. Thanks to Seth Forshee who has kept the work alive. Thanks-to: Seth Forshee Thanks-to: Pavel Tikhomirov Acked-by: Seth Forshee Signed-off-by: Eric W. Biederman --- fs/kernfs/mount.c | 3 ++- fs/super.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++------ include/linux/fs.h | 12 ++++++++++++ 3 files changed, 60 insertions(+), 7 deletions(-) (limited to 'fs/super.c') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 63534f5f9073..d90d574c15a2 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -241,7 +241,8 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, info->root = root; info->ns = ns; - sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); + sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, + &init_user_ns, info); if (IS_ERR(sb) || sb->s_fs_info != info) kfree(info); if (IS_ERR(sb)) diff --git a/fs/super.c b/fs/super.c index fd65667832e5..874c7e3ebb8f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "internal.h" @@ -165,6 +166,7 @@ static void destroy_super(struct super_block *s) list_lru_destroy(&s->s_inode_lru); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); + put_user_ns(s->s_user_ns); kfree(s->s_subtype); kfree(s->s_options); call_rcu(&s->rcu, destroy_super_rcu); @@ -174,11 +176,13 @@ static void destroy_super(struct super_block *s) * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags + * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ -static struct super_block *alloc_super(struct file_system_type *type, int flags) +static struct super_block *alloc_super(struct file_system_type *type, int flags, + struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; @@ -188,6 +192,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) return NULL; INIT_LIST_HEAD(&s->s_mounts); + s->s_user_ns = get_user_ns(user_ns); if (security_sb_alloc(s)) goto fail; @@ -443,17 +448,18 @@ void generic_shutdown_super(struct super_block *sb) EXPORT_SYMBOL(generic_shutdown_super); /** - * sget - find or create a superblock + * sget_userns - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags + * @user_ns: User namespace for the super_block * @data: argument to each of them */ -struct super_block *sget(struct file_system_type *type, +struct super_block *sget_userns(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), - int flags, + int flags, struct user_namespace *user_ns, void *data) { struct super_block *s = NULL; @@ -466,6 +472,14 @@ retry: hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; + if (user_ns != old->s_user_ns) { + spin_unlock(&sb_lock); + if (s) { + up_write(&s->s_umount); + destroy_super(s); + } + return ERR_PTR(-EBUSY); + } if (!grab_super(old)) goto retry; if (s) { @@ -478,7 +492,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags); + s = alloc_super(type, flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -501,6 +515,31 @@ retry: return s; } +EXPORT_SYMBOL(sget_userns); + +/** + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @flags: mount flags + * @data: argument to each of them + */ +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + int flags, + void *data) +{ + struct user_namespace *user_ns = current_user_ns(); + + /* Ensure the requestor has permissions over the target filesystem */ + if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + return sget_userns(type, test, set, flags, user_ns, data); +} + EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) @@ -930,7 +969,8 @@ struct dentry *mount_ns(struct file_system_type *fs_type, if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns); + sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags, + user_ns, ns); if (IS_ERR(sb)) return ERR_CAST(sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ce006a24f49..9eef64f23a75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1432,6 +1432,13 @@ struct super_block { struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; + /* + * Owning user namespace and default context in which to + * interpret filesystem uids, gids, quotas, device nodes, + * xattrs and security labels. + */ + struct user_namespace *s_user_ns; + /* * Keep the lru lists last in the structure so they always sit on their * own individual cachelines. @@ -2056,6 +2063,11 @@ void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); +struct super_block *sget_userns(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + int flags, struct user_namespace *user_ns, + void *data); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), -- cgit v1.2.3 From a001e74cef34d95ede6535ef521011c612657a3a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 6 Jun 2016 15:48:04 -0500 Subject: mnt: Move the FS_USERNS_MOUNT check into sget_userns Allowing a filesystem to be mounted by other than root in the initial user namespace is a filesystem property not a mount namespace property and as such should be checked in filesystem specific code. Move the FS_USERNS_MOUNT test into super.c:sget_userns(). Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 4 ---- fs/super.c | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/super.c') diff --git a/fs/namespace.c b/fs/namespace.c index 1a69aa786975..2e13f6cfe5df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2397,10 +2397,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, return -ENODEV; if (user_ns != &init_user_ns) { - if (!(type->fs_flags & FS_USERNS_MOUNT)) { - put_filesystem(type); - return -EPERM; - } /* Only in special cases allow devices from mounts * created outside the initial user namespace. */ diff --git a/fs/super.c b/fs/super.c index 874c7e3ebb8f..78790ada7191 100644 --- a/fs/super.c +++ b/fs/super.c @@ -466,6 +466,10 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; + if (!(flags & MS_KERNMOUNT) && + !(type->fs_flags & FS_USERNS_MOUNT) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); retry: spin_lock(&sb_lock); if (test) { -- cgit v1.2.3 From 67690f937c38bbab1d94cb45f6a32e61612834ae Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 18 May 2016 13:50:06 -0500 Subject: userns: Remove implicit MNT_NODEV fragility. Replace the implict setting of MNT_NODEV on mounts that happen with just user namespace permissions with an implicit setting of SB_I_NODEV in s_iflags. The visibility of the implicit MNT_NODEV has caused problems in the past. With this change the fragile case where an implicit MNT_NODEV needs to be preserved in do_remount is removed. Using SB_I_NODEV is much less fragile as s_iflags are set during the original mount and never changed. In do_new_mount with the implicit setting of MNT_NODEV gone, the only code that can affect mnt_flags is fs_fully_visible so simplify the if statement and reduce the indentation of the code to make that clear. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 19 +------------------ fs/super.c | 3 +++ 2 files changed, 4 insertions(+), 18 deletions(-) (limited to 'fs/super.c') diff --git a/fs/namespace.c b/fs/namespace.c index b1da7f8182c4..9786a38d1681 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2185,13 +2185,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, } if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) { - /* Was the nodev implicitly added in mount? */ - if ((mnt->mnt_ns->user_ns != &init_user_ns) && - !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) { - mnt_flags |= MNT_NODEV; - } else { - return -EPERM; - } + return -EPERM; } if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) { @@ -2385,7 +2379,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct vfsmount *mnt; int err; @@ -2396,16 +2389,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, if (!type) return -ENODEV; - if (user_ns != &init_user_ns) { - /* Only in special cases allow devices from mounts - * created outside the initial user namespace. - */ - if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { - flags |= MS_NODEV; - mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; - } - } - mnt = vfs_kern_mount(type, flags, name, data); if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && !mnt->mnt_sb->s_subtype) diff --git a/fs/super.c b/fs/super.c index 78790ada7191..25cdceed2ad3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -206,6 +206,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; + if ((s->s_user_ns != &init_user_ns) && + !(type->fs_flags & FS_USERNS_DEV_MOUNT)) + s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); mutex_init(&s->s_sync_lock); -- cgit v1.2.3 From cc50a07a247e17db76b1f0b0ca06652556e04fa3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 15:44:48 -0500 Subject: userns: Remove the now unnecessary FS_USERNS_DEV_MOUNT flag Now that SB_I_NODEV controls the nodev behavior devpts can just clear this flag during mount. Simplifying the code and making it easier to audit how the code works. While still preserving the invariant that s_iflags is only modified during mount. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/devpts/inode.c | 3 ++- fs/super.c | 3 +-- include/linux/fs.h | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/super.c') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 37c134a132c7..d116453b0276 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -396,6 +396,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; + s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; @@ -480,7 +481,7 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, - .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; /* diff --git a/fs/super.c b/fs/super.c index 25cdceed2ad3..37813bf479cf 100644 --- a/fs/super.c +++ b/fs/super.c @@ -206,8 +206,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; - if ((s->s_user_ns != &init_user_ns) && - !(type->fs_flags & FS_USERNS_DEV_MOUNT)) + if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); diff --git a/include/linux/fs.h b/include/linux/fs.h index e05983170d23..375e37f42cdf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2022,7 +2022,6 @@ struct file_system_type { #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ -#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); -- cgit v1.2.3