[PATCH] get rid of full-hash scan on detaching vfsmounts
Andy Whitcroft
apw at canonical.com
Fri Oct 4 15:56:59 UTC 2013
On Thu, Oct 03, 2013 at 03:59:29PM -0500, Chris J Arges wrote:
> From: Al Viro <viro at zeniv.linux.org.uk>
>
> BugLink: http://bugs.launchpad.net/bugs/1226726
>
> Signed-off-by: Al Viro <viro at zeniv.linux.org.uk>
> (backported from commit 84d17192d2afd52aeba88c71ae4959a015f56a38 upstream)
> Signed-off-by: Dave Chiluk <chiluk at canonical.com>
> Signed-off-by: Chris J Arges <chris.j.arges at canonical.com>
> ---
> fs/mount.h | 7 ++
> fs/namespace.c | 235 ++++++++++++++++++++++++++++++++++----------------------
> fs/pnode.c | 6 +-
> fs/pnode.h | 4 +-
> 4 files changed, 155 insertions(+), 97 deletions(-)
>
> diff --git a/fs/mount.h b/fs/mount.h
> index 4ef36d9..3fc65ed 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -15,6 +15,12 @@ struct mnt_pcp {
> int mnt_writers;
> };
>
> +struct mountpoint {
> + struct list_head m_hash;
> + struct dentry *m_dentry;
> + int m_count;
> +};
> +
> struct mount {
> struct list_head mnt_hash;
> struct mount *mnt_parent;
> @@ -38,6 +44,7 @@ struct mount {
> struct list_head mnt_slave; /* slave list entry */
> struct mount *mnt_master; /* slave is on master->mnt_slave_list */
> struct mnt_namespace *mnt_ns; /* containing namespace */
> + struct mountpoint *mnt_mp; /* where is it mounted */
> #ifdef CONFIG_FSNOTIFY
> struct hlist_head mnt_fsnotify_marks;
> __u32 mnt_fsnotify_mask;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index e1e9274..4d8a2de 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -34,6 +34,7 @@ static int mnt_id_start = 0;
> static int mnt_group_start = 1;
>
> static struct list_head *mount_hashtable __read_mostly;
> +static struct list_head *mountpoint_hashtable __read_mostly;
> static struct kmem_cache *mnt_cache __read_mostly;
> static struct rw_semaphore namespace_sem;
>
> @@ -535,6 +536,51 @@ struct vfsmount *lookup_mnt(struct path *path)
> }
> }
>
> +static struct mountpoint *new_mountpoint(struct dentry *dentry)
> +{
> + struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
> + struct mountpoint *mp;
> +
> + list_for_each_entry(mp, chain, m_hash) {
> + if (mp->m_dentry == dentry) {
> + /* might be worth a WARN_ON() */
> + if (d_unlinked(dentry))
> + return ERR_PTR(-ENOENT);
> + mp->m_count++;
> + return mp;
> + }
> + }
> +
> + mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
> + if (!mp)
> + return ERR_PTR(-ENOMEM);
> +
> + spin_lock(&dentry->d_lock);
> + if (d_unlinked(dentry)) {
> + spin_unlock(&dentry->d_lock);
> + kfree(mp);
> + return ERR_PTR(-ENOENT);
> + }
> + dentry->d_flags |= DCACHE_MOUNTED;
> + spin_unlock(&dentry->d_lock);
> + mp->m_dentry = dentry;
> + mp->m_count = 1;
> + list_add(&mp->m_hash, chain);
> + return mp;
> +}
> +
> +static void put_mountpoint(struct mountpoint *mp)
> +{
> + if (!--mp->m_count) {
> + struct dentry *dentry = mp->m_dentry;
> + spin_lock(&dentry->d_lock);
> + dentry->d_flags &= ~DCACHE_MOUNTED;
> + spin_unlock(&dentry->d_lock);
> + list_del(&mp->m_hash);
> + kfree(mp);
> + }
> +}
> +
> static inline int check_mnt(struct mount *mnt)
> {
> return mnt->mnt_ns == current->nsproxy->mnt_ns;
> @@ -563,27 +609,6 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
> }
>
> /*
> - * Clear dentry's mounted state if it has no remaining mounts.
> - * vfsmount_lock must be held for write.
> - */
> -static void dentry_reset_mounted(struct dentry *dentry)
> -{
> - unsigned u;
> -
> - for (u = 0; u < HASH_SIZE; u++) {
> - struct mount *p;
> -
> - list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
> - if (p->mnt_mountpoint == dentry)
> - return;
> - }
> - }
> - spin_lock(&dentry->d_lock);
> - dentry->d_flags &= ~DCACHE_MOUNTED;
> - spin_unlock(&dentry->d_lock);
> -}
> -
> -/*
> * vfsmount lock must be held for write
> */
> static void detach_mnt(struct mount *mnt, struct path *old_path)
> @@ -594,32 +619,35 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
> mnt->mnt_mountpoint = mnt->mnt.mnt_root;
> list_del_init(&mnt->mnt_child);
> list_del_init(&mnt->mnt_hash);
> - dentry_reset_mounted(old_path->dentry);
> + put_mountpoint(mnt->mnt_mp);
> + mnt->mnt_mp = NULL;
> }
>
> /*
> * vfsmount lock must be held for write
> */
> -void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry,
> +void mnt_set_mountpoint(struct mount *mnt,
> + struct mountpoint *mp,
> struct mount *child_mnt)
> {
> + mp->m_count++;
> mnt_add_count(mnt, 1); /* essentially, that's mntget */
> - child_mnt->mnt_mountpoint = dget(dentry);
> + child_mnt->mnt_mountpoint = dget(mp->m_dentry);
> child_mnt->mnt_parent = mnt;
> - spin_lock(&dentry->d_lock);
> - dentry->d_flags |= DCACHE_MOUNTED;
> - spin_unlock(&dentry->d_lock);
> + child_mnt->mnt_mp = mp;
> }
>
> /*
> * vfsmount lock must be held for write
> */
> -static void attach_mnt(struct mount *mnt, struct path *path)
> +static void attach_mnt(struct mount *mnt,
> + struct mount *parent,
> + struct mountpoint *mp)
> {
> - mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt);
> + mnt_set_mountpoint(parent, mp, mnt);
> list_add_tail(&mnt->mnt_hash, mount_hashtable +
> - hash(path->mnt, path->dentry));
> - list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts);
> + hash(&parent->mnt, mp->m_dentry));
> + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> }
>
> static inline void __mnt_make_longterm(struct mount *mnt)
> @@ -1081,7 +1109,8 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
> list_del_init(&p->mnt_child);
> if (mnt_has_parent(p)) {
> p->mnt_parent->mnt_ghosts++;
> - dentry_reset_mounted(p->mnt_mountpoint);
> + put_mountpoint(p->mnt_mp);
> + p->mnt_mp = NULL;
> }
> change_mnt_propagation(p, MS_PRIVATE);
> }
> @@ -1257,8 +1286,7 @@ static int mount_is_safe(struct path *path)
> struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
> int flag)
> {
> - struct mount *res, *p, *q, *r;
> - struct path path;
> + struct mount *res, *p, *q, *r, *parent;
>
> if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
> return NULL;
> @@ -1284,14 +1312,13 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
> q = q->mnt_parent;
> }
> p = s;
> - path.mnt = &q->mnt;
> - path.dentry = p->mnt_mountpoint;
> + parent = q;
> q = clone_mnt(p, p->mnt.mnt_root, flag);
> if (!q)
> goto Enomem;
> br_write_lock(&vfsmount_lock);
> list_add_tail(&q->mnt_list, &res->mnt_list);
> - attach_mnt(q, &path);
> + attach_mnt(q, parent, p->mnt_mp);
> br_write_unlock(&vfsmount_lock);
> }
> }
> @@ -1453,11 +1480,11 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
> * in allocations.
> */
> static int attach_recursive_mnt(struct mount *source_mnt,
> - struct path *path, struct path *parent_path)
> + struct mount *dest_mnt,
> + struct mountpoint *dest_mp,
> + struct path *parent_path)
> {
> LIST_HEAD(tree_list);
> - struct mount *dest_mnt = real_mount(path->mnt);
> - struct dentry *dest_dentry = path->dentry;
> struct mount *child, *p;
> int err;
>
> @@ -1466,7 +1493,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
> if (err)
> goto out;
> }
> - err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
> + err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
> if (err)
> goto out_cleanup_ids;
>
> @@ -1478,10 +1505,10 @@ static int attach_recursive_mnt(struct mount *source_mnt,
> }
> if (parent_path) {
> detach_mnt(source_mnt, parent_path);
> - attach_mnt(source_mnt, path);
> + attach_mnt(source_mnt, dest_mnt, dest_mp);
> touch_mnt_namespace(source_mnt->mnt_ns);
> } else {
> - mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
> + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
> commit_tree(source_mnt);
> }
>
> @@ -1500,46 +1527,53 @@ static int attach_recursive_mnt(struct mount *source_mnt,
> return err;
> }
>
> -static int lock_mount(struct path *path)
> +static struct mountpoint *lock_mount(struct path *path)
> {
> struct vfsmount *mnt;
> + struct dentry *dentry = path->dentry;
> retry:
> - mutex_lock(&path->dentry->d_inode->i_mutex);
> - if (unlikely(cant_mount(path->dentry))) {
> - mutex_unlock(&path->dentry->d_inode->i_mutex);
> - return -ENOENT;
> + mutex_lock(&dentry->d_inode->i_mutex);
> + if (unlikely(cant_mount(dentry))) {
> + mutex_unlock(&dentry->d_inode->i_mutex);
> + return ERR_PTR(-ENOENT);
> }
> down_write(&namespace_sem);
> mnt = lookup_mnt(path);
> - if (likely(!mnt))
> - return 0;
> + if (likely(!mnt)) {
> + struct mountpoint *mp = new_mountpoint(dentry);
> + if (IS_ERR(mp)) {
> + up_write(&namespace_sem);
> + mutex_unlock(&dentry->d_inode->i_mutex);
> + return mp;
> + }
> + return mp;
> + }
> up_write(&namespace_sem);
> mutex_unlock(&path->dentry->d_inode->i_mutex);
> path_put(path);
> path->mnt = mnt;
> - path->dentry = dget(mnt->mnt_root);
> + dentry = path->dentry = dget(mnt->mnt_root);
> goto retry;
> }
>
> -static void unlock_mount(struct path *path)
> +static void unlock_mount(struct mountpoint *where)
> {
> + struct dentry *dentry = where->m_dentry;
> + put_mountpoint(where);
> up_write(&namespace_sem);
> - mutex_unlock(&path->dentry->d_inode->i_mutex);
> + mutex_unlock(&dentry->d_inode->i_mutex);
> }
>
> -static int graft_tree(struct mount *mnt, struct path *path)
> +static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
> {
> if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
> return -EINVAL;
>
> - if (S_ISDIR(path->dentry->d_inode->i_mode) !=
> + if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
> S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
> return -ENOTDIR;
>
> - if (d_unlinked(path->dentry))
> - return -ENOENT;
> -
> - return attach_recursive_mnt(mnt, path, NULL);
> + return attach_recursive_mnt(mnt, p, mp, NULL);
> }
>
> /*
> @@ -1605,7 +1639,8 @@ static int do_loopback(struct path *path, char *old_name,
> {
> LIST_HEAD(umount_list);
> struct path old_path;
> - struct mount *mnt = NULL, *old;
> + struct mount *mnt = NULL, *old, *parent;
> + struct mountpoint *mp;
> int err = mount_is_safe(path);
> if (err)
> return err;
> @@ -1615,17 +1650,19 @@ static int do_loopback(struct path *path, char *old_name,
> if (err)
> return err;
>
> - err = lock_mount(path);
> - if (err)
> + mp = lock_mount(path);
> + err = PTR_ERR(mp);
> + if (IS_ERR(mp))
> goto out;
>
> old = real_mount(old_path.mnt);
> + parent = real_mount(path->mnt);
>
> err = -EINVAL;
> if (IS_MNT_UNBINDABLE(old))
> goto out2;
>
> - if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old))
> + if (!check_mnt(parent) || !check_mnt(old))
> goto out2;
>
> err = -ENOMEM;
> @@ -1637,14 +1674,14 @@ static int do_loopback(struct path *path, char *old_name,
> if (!mnt)
> goto out2;
>
> - err = graft_tree(mnt, path);
> + err = graft_tree(mnt, parent, mp);
> if (err) {
> br_write_lock(&vfsmount_lock);
> umount_tree(mnt, 0, &umount_list);
> br_write_unlock(&vfsmount_lock);
> }
> out2:
> - unlock_mount(path);
> + unlock_mount(mp);
> release_mounts(&umount_list);
> out:
> path_put(&old_path);
> @@ -1728,6 +1765,7 @@ static int do_move_mount(struct path *path, char *old_name)
> struct path old_path, parent_path;
> struct mount *p;
> struct mount *old;
> + struct mountpoint *mp;
> int err = 0;
> if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
> @@ -1737,8 +1775,9 @@ static int do_move_mount(struct path *path, char *old_name)
> if (err)
> return err;
>
> - err = lock_mount(path);
> - if (err < 0)
> + mp = lock_mount(path);
> + err = PTR_ERR(mp);
> + if (IS_ERR(mp))
> goto out;
>
> old = real_mount(old_path.mnt);
> @@ -1748,9 +1787,6 @@ static int do_move_mount(struct path *path, char *old_name)
> if (!check_mnt(p) || !check_mnt(old))
> goto out1;
>
> - if (d_unlinked(path->dentry))
> - goto out1;
> -
> err = -EINVAL;
> if (old_path.dentry != old_path.mnt->mnt_root)
> goto out1;
> @@ -1777,7 +1813,7 @@ static int do_move_mount(struct path *path, char *old_name)
> if (p == old)
> goto out1;
>
> - err = attach_recursive_mnt(old, path, &parent_path);
> + err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
> if (err)
> goto out1;
>
> @@ -1785,7 +1821,7 @@ static int do_move_mount(struct path *path, char *old_name)
> * automatically */
> list_del_init(&old->mnt_expire);
> out1:
> - unlock_mount(path);
> + unlock_mount(mp);
> out:
> if (!err)
> path_put(&parent_path);
> @@ -1836,17 +1872,26 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
> */
> static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
> {
> + struct mountpoint *mp;
> + struct mount *parent;
> int err;
>
> mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
>
> - err = lock_mount(path);
> - if (err)
> - return err;
> + mp = lock_mount(path);
> + if (IS_ERR(mp))
> + return PTR_ERR(mp);
>
> + parent = real_mount(path->mnt);
> err = -EINVAL;
> - if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt)))
> - goto unlock;
> + if (unlikely(!check_mnt(parent))) {
> + /* that's acceptable only for automounts done in private ns */
> + if (!(mnt_flags & MNT_SHRINKABLE))
> + goto unlock;
> + /* ... and for those we'd better have mountpoint still alive */
> + if (!parent->mnt_ns)
> + goto unlock;
> + }
>
> /* Refuse the same filesystem on the same mount point */
> err = -EBUSY;
> @@ -1859,10 +1904,10 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
> goto unlock;
>
> newmnt->mnt.mnt_flags = mnt_flags;
> - err = graft_tree(newmnt, path);
> + err = graft_tree(newmnt, parent, mp);
>
> unlock:
> - unlock_mount(path);
> + unlock_mount(mp);
> return err;
> }
>
> @@ -2474,7 +2519,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
> const char __user *, put_old)
> {
> struct path new, old, parent_path, root_parent, root;
> - struct mount *new_mnt, *root_mnt;
> + struct mount *new_mnt, *root_mnt, *old_mnt;
> + struct mountpoint *old_mp, *root_mp;
> int error;
>
> if (!capable(CAP_SYS_ADMIN))
> @@ -2493,14 +2539,16 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
> goto out2;
>
> get_fs_root(current->fs, &root);
> - error = lock_mount(&old);
> - if (error)
> + old_mp = lock_mount(&old);
> + error = PTR_ERR(old_mp);
> + if (IS_ERR(old_mp))
> goto out3;
>
> error = -EINVAL;
> new_mnt = real_mount(new.mnt);
> root_mnt = real_mount(root.mnt);
> - if (IS_MNT_SHARED(real_mount(old.mnt)) ||
> + old_mnt = real_mount(old.mnt);
> + if (IS_MNT_SHARED(old_mnt) ||
> IS_MNT_SHARED(new_mnt->mnt_parent) ||
> IS_MNT_SHARED(root_mnt->mnt_parent))
> goto out4;
> @@ -2509,37 +2557,37 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
> error = -ENOENT;
> if (d_unlinked(new.dentry))
> goto out4;
> - if (d_unlinked(old.dentry))
> - goto out4;
> error = -EBUSY;
> - if (new.mnt == root.mnt ||
> - old.mnt == root.mnt)
> + if (new_mnt == root_mnt || old_mnt == root_mnt)
> goto out4; /* loop, on the same file system */
> error = -EINVAL;
> if (root.mnt->mnt_root != root.dentry)
> goto out4; /* not a mountpoint */
> if (!mnt_has_parent(root_mnt))
> goto out4; /* not attached */
> + root_mp = root_mnt->mnt_mp;
> if (new.mnt->mnt_root != new.dentry)
> goto out4; /* not a mountpoint */
> if (!mnt_has_parent(new_mnt))
> goto out4; /* not attached */
> /* make sure we can reach put_old from new_root */
> - if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
> + if (!is_path_reachable(old_mnt, old.dentry, &new))
> goto out4;
> + root_mp->m_count++; /* pin it so it won't go away */
> br_write_lock(&vfsmount_lock);
> detach_mnt(new_mnt, &parent_path);
> detach_mnt(root_mnt, &root_parent);
> /* mount old root on put_old */
> - attach_mnt(root_mnt, &old);
> + attach_mnt(root_mnt, old_mnt, old_mp);
> /* mount new_root on / */
> - attach_mnt(new_mnt, &root_parent);
> + attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
> touch_mnt_namespace(current->nsproxy->mnt_ns);
> br_write_unlock(&vfsmount_lock);
> chroot_fs_refs(&root, &new);
> + put_mountpoint(root_mp);
> error = 0;
> out4:
> - unlock_mount(&old);
> + unlock_mount(old_mp);
> if (!error) {
> path_put(&root_parent);
> path_put(&parent_path);
> @@ -2589,14 +2637,17 @@ void __init mnt_init(void)
> 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
>
> mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
> + mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
>
> - if (!mount_hashtable)
> + if (!mount_hashtable || !mountpoint_hashtable)
> panic("Failed to allocate mount hash table\n");
>
> printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
>
> for (u = 0; u < HASH_SIZE; u++)
> INIT_LIST_HEAD(&mount_hashtable[u]);
> + for (u = 0; u < HASH_SIZE; u++)
> + INIT_LIST_HEAD(&mountpoint_hashtable[u]);
>
> br_lock_init(&vfsmount_lock);
>
> diff --git a/fs/pnode.c b/fs/pnode.c
> index bed378d..ed312e7 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -217,7 +217,7 @@ static struct mount *get_source(struct mount *dest,
> * @source_mnt: source mount.
> * @tree_list : list of heads of trees to be attached.
> */
> -int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
> +int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
> struct mount *source_mnt, struct list_head *tree_list)
> {
> struct mount *m, *child;
> @@ -243,8 +243,8 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
> goto out;
> }
>
> - if (is_subdir(dest_dentry, m->mnt.mnt_root)) {
> - mnt_set_mountpoint(m, dest_dentry, child);
> + if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) {
> + mnt_set_mountpoint(m, dest_mp, child);
> list_add_tail(&child->mnt_hash, tree_list);
> } else {
> /*
> diff --git a/fs/pnode.h b/fs/pnode.h
> index 65c6097..dedf61e 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -30,14 +30,14 @@ static inline void set_mnt_shared(struct mount *mnt)
> }
>
> void change_mnt_propagation(struct mount *, int);
> -int propagate_mnt(struct mount *, struct dentry *, struct mount *,
> +int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
> struct list_head *);
> int propagate_umount(struct list_head *);
> int propagate_mount_busy(struct mount *, int);
> void mnt_release_group_id(struct mount *);
> int get_dominating_id(struct mount *mnt, const struct path *root);
> unsigned int mnt_get_count(struct mount *mnt);
> -void mnt_set_mountpoint(struct mount *, struct dentry *,
> +void mnt_set_mountpoint(struct mount *, struct mountpoint *,
> struct mount *);
> void release_mounts(struct list_head *);
> void umount_tree(struct mount *, int, struct list_head *);
Ye gads, this is huge. If we can test this into the ground, then it seems
reasonable.
Acked-by: Andy Whitcroft <apw at canonical.com>
-apw
More information about the kernel-team mailing list