[PATCH 2/4][DISCO] shiftfs: rework and extend

Christian Brauner christian at brauner.io
Thu Apr 4 13:24:26 UTC 2019


On Wed, Apr 03, 2019 at 10:15:13PM -0500, Tyler Hicks wrote:
> On 2019-03-27 15:11:26, Christian Brauner wrote:
> > From: Christian Brauner <christian at brauner.io>
> > 
> > /* Introduction */
> > The shiftfs filesystem is implemented as a stacking filesystem. Since it is
> > a stacking filesystem it shares concepts with overlayfs and ecryptfs.
> > Usually, shiftfs will be stacked upon another filesystem. The filesystem on
> > top - shiftfs - is referred to as "upper filesystem" or "overlay" and the
> > filesystem it is stacked upon is referred to as "lower filesystem" or
> > "underlay".
> > 
> > /* Marked and Unmarked shiftfs mounts */
> > To use shiftfs it is necessary that a given mount is marked as shiftable via
> > the "mark" mount option. Any mount of shiftfs without the "mark" mount option
> > that is not on top of a shiftfs mount with the "mark" mount option will be
> > refused with EPERM.
> > After a marked shiftfs mount has been performed other shiftfs mounts
> > referencing the marked shiftfs mount can be created. These secondary shiftfs
> > mounts are usually what are of interest.
> > The marked shiftfs mount will take a reference to the underlying mountpoint of
> > the directory it is marking as shiftable. Any unmarked shiftfs mounts
> > referencing this marked shiftfs mount will take a second reference to this
> > directory as well. This ensures that the underlying marked shiftfs mount can be
> > unmounted thereby dropping the reference to the underlying directory without
> > invalidating the mountpoint of said directory since the non-marked shiftfs
> > mount still holds another reference to it.
> > 
> > /* Stacking Depth */
> > Shiftfs tries to keep the stack as flat as possible to avoid hitting the
> > kernel enforced filesystem stacking limit.
> > 
> > /* Permission Model */
> > When the mark shiftfs mount is created shiftfs will record the credentials of
> > the creator of the super block and stash them in the super block. When other
> > non-mark shiftfs mounts are created that reference the mark shiftfs mount they
> > will stash another reference to the creator's credentials. Before calling into
> > the underlying filesystem shiftfs will switch to the creator's credentials and
> > revert to the original credentials after the underlying filesystem operation
> > returns.
> > 
> > /* Mount Options */
> > - mark
> >   When set the mark mount option indicates that the mount in question is
> >   allowed to be shifted. Since shiftfs is mountable by the root user of a
> >   non-initial user namespace, this mount option ensures that the system
> >   administrator has decided that the marked mount is safe to be shifted.
> >   To mark a mount as shiftable CAP_SYS_ADMIN in the user namespace is required.
> > - passthrough={0,1,2,3}
> >   This mount option functions as a bitmask. When set to a non-zero value
> >   shiftfs will try to act as an invisible shim sitting on top of the
> >   underlying filesystem.
> >   - 1: Shiftfs will report the filesystem type of the underlay for stat-like
> >        system calls.
> >   - 2: Shiftfs will passthrough whitelisted ioctl() to the underlay.
> >   - 3: Shiftfs will both use 1 and 2.
> > Note that mount options on a marked mount cannot be changed.
> > 
> > /* Extended Attributes */
> > Shiftfs will make sure to translate extended attributes.
> > 
> > /* Inode Numbers */
> > Shiftfs inode numbers are copied up from the underlying filesystem, i.e.
> > shiftfs inode numbers will be identical to the corresponding underlying
> > filesystem's inode numbers. This has the advantage that inotify and friends
> > should work out of the box.
> > (In essence, shiftfs is nothing but a 1:1 mirror of the underlying filesystem's
> >  dentries and inodes.)
> > 
> > /* Device Support */
> > Shiftfs only supports the creation of pipe and socket devices. Character and
> > block devices cannot be created through shiftfs.
> > 
> > Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
> > Signed-off-by: Seth Forshee <seth.forshee at canonical.com>
> > ---
> >  fs/Kconfig   |   10 +
> >  fs/shiftfs.c | 1847 ++++++++++++++++++++++++++++++++++++++++----------
> >  2 files changed, 1488 insertions(+), 369 deletions(-)
> > 
> > diff --git a/fs/Kconfig b/fs/Kconfig
> > index 22e2e0feba0c..7b9ba073d403 100644
> > --- a/fs/Kconfig
> > +++ b/fs/Kconfig
> > @@ -121,6 +121,16 @@ config SHIFT_FS
> >  	  unprivileged containers can use this to mount root volumes
> >  	  using this technique.
> >  
> > +config SHIFT_FS_POSIX_ACL
> > +	bool "shiftfs POSIX Access Control Lists"
> > +	depends on SHIFT_FS
> > +	select FS_POSIX_ACL
> > +	help
> > +	  POSIX Access Control Lists (ACLs) support permissions for users and
> > +	  groups beyond the owner/group/world scheme.
> > +
> > +	  If you don't know what Access Control Lists are, say N.
> > +
> >  menu "Caches"
> >  
> >  source "fs/fscache/Kconfig"
> > diff --git a/fs/shiftfs.c b/fs/shiftfs.c
> > index f7cada126daa..234af4e31736 100644
> > --- a/fs/shiftfs.c
> > +++ b/fs/shiftfs.c
> > @@ -1,3 +1,4 @@
> > +#include <linux/capability.h>
> >  #include <linux/cred.h>
> >  #include <linux/mount.h>
> >  #include <linux/file.h>
> > @@ -7,83 +8,179 @@
> >  #include <linux/kernel.h>
> >  #include <linux/magic.h>
> >  #include <linux/parser.h>
> > +#include <linux/security.h>
> >  #include <linux/seq_file.h>
> >  #include <linux/statfs.h>
> >  #include <linux/slab.h>
> >  #include <linux/user_namespace.h>
> >  #include <linux/uidgid.h>
> >  #include <linux/xattr.h>
> > +#include <linux/posix_acl.h>
> > +#include <linux/posix_acl_xattr.h>
> > +#include <linux/uio.h>
> >  
> >  struct shiftfs_super_info {
> >  	struct vfsmount *mnt;
> >  	struct user_namespace *userns;
> > +	/* creds of process who created the super block */
> > +	const struct cred *creator_cred;
> >  	bool mark;
> > +	unsigned int passthrough;
> > +	struct shiftfs_super_info *info_mark;
> >  };
> >  
> > -static struct inode *shiftfs_new_inode(struct super_block *sb, umode_t mode,
> > -				       struct dentry *dentry);
> > +struct shiftfs_file_info {
> > +	struct path realpath;
> > +	struct file *realfile;
> > +};
> > +
> > +struct kmem_cache *shiftfs_file_info_cache;
> > +
> > +static void shiftfs_fill_inode(struct inode *inode, unsigned long ino,
> > +			       umode_t mode, dev_t dev, struct dentry *dentry);
> > +
> > +#define SHIFTFS_PASSTHROUGH_NONE 0
> > +#define SHIFTFS_PASSTHROUGH_STAT 1
> > +#define SHIFTFS_PASSTHROUGH_ALL (SHIFTFS_PASSTHROUGH_STAT)
> > +
> > +static inline bool shiftfs_passthrough_statfs(struct shiftfs_super_info *info)
> > +{
> > +	if (!(info->passthrough & SHIFTFS_PASSTHROUGH_STAT))
> > +		return false;
> > +
> > +	if (info->info_mark &&
> > +	    !(info->info_mark->passthrough & SHIFTFS_PASSTHROUGH_STAT))
> > +		return false;
> > +
> > +	return true;
> > +}
> >  
> >  enum {
> >  	OPT_MARK,
> > +	OPT_PASSTHROUGH,
> >  	OPT_LAST,
> >  };
> >  
> >  /* global filesystem options */
> >  static const match_table_t tokens = {
> >  	{ OPT_MARK, "mark" },
> > +	{ OPT_PASSTHROUGH, "passthrough=%u" },
> >  	{ OPT_LAST, NULL }
> >  };
> >  
> > -static const struct cred *shiftfs_get_up_creds(struct super_block *sb)
> > +static const struct cred *shiftfs_override_creds(const struct super_block *sb)
> >  {
> > -	struct shiftfs_super_info *ssi = sb->s_fs_info;
> > -	struct cred *cred = prepare_creds();
> > +	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
> >  
> > -	if (!cred)
> > -		return NULL;
> > +	return override_creds(sbinfo->creator_cred);
> > +}
> > +
> > +static inline void shiftfs_revert_object_creds(const struct cred *oldcred,
> > +					       struct cred *newcred)
> > +{
> > +	revert_creds(oldcred);
> > +	put_cred(newcred);
> > +}
> > +
> > +static int shiftfs_override_object_creds(const struct super_block *sb,
> > +					 const struct cred **oldcred,
> > +					 struct cred **newcred,
> > +					 struct dentry *dentry, umode_t mode,
> > +					 bool hardlink)
> > +{
> > +	kuid_t fsuid = current_fsuid();
> > +	kgid_t fsgid = current_fsgid();
> > +
> > +	*oldcred = shiftfs_override_creds(sb);
> > +
> > +	*newcred = prepare_creds();
> > +	if (!*newcred) {
> > +		revert_creds(*oldcred);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	(*newcred)->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, fsuid));
> > +	(*newcred)->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, fsgid));
> > +
> > +	if (!hardlink) {
> > +		int err = security_dentry_create_files_as(dentry, mode,
> > +							  &dentry->d_name,
> > +							  *oldcred, *newcred);
> > +		if (err) {
> > +			shiftfs_revert_object_creds(*oldcred, *newcred);
> > +			return err;
> > +		}
> > +	}
> >  
> > -	cred->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, cred->fsuid));
> > -	cred->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, cred->fsgid));
> > -	put_user_ns(cred->user_ns);
> > -	cred->user_ns = get_user_ns(ssi->userns);
> > +	put_cred(override_creds(*newcred));
> > +	return 0;
> > +}
> >  
> > -	return cred;
> > +static kuid_t shift_kuid(struct user_namespace *from, struct user_namespace *to,
> > +			 kuid_t kuid)
> > +{
> > +	uid_t uid = from_kuid(from, kuid);
> > +	return make_kuid(to, uid);
> >  }
> >  
> > -static const struct cred *shiftfs_new_creds(const struct cred **newcred,
> > -					    struct super_block *sb)
> > +static kgid_t shift_kgid(struct user_namespace *from, struct user_namespace *to,
> > +			 kgid_t kgid)
> >  {
> > -	const struct cred *cred = shiftfs_get_up_creds(sb);
> > +	gid_t gid = from_kgid(from, kgid);
> > +	return make_kgid(to, gid);
> > +}
> >  
> > -	*newcred = cred;
> > +static void shiftfs_copyattr(struct inode *from, struct inode *to)
> > +{
> > +	struct user_namespace *from_ns = from->i_sb->s_user_ns;
> > +	struct user_namespace *to_ns = to->i_sb->s_user_ns;
> > +
> > +	to->i_uid = shift_kuid(from_ns, to_ns, from->i_uid);
> > +	to->i_gid = shift_kgid(from_ns, to_ns, from->i_gid);
> > +	to->i_mode = from->i_mode;
> > +	to->i_atime = from->i_atime;
> > +	to->i_mtime = from->i_mtime;
> > +	to->i_ctime = from->i_ctime;
> > +	i_size_write(to, i_size_read(from));
> > +}
> >  
> > -	if (cred)
> > -		cred = override_creds(cred);
> > -	else
> > -		printk(KERN_ERR "shiftfs: Credential override failed: no memory\n");
> > +static void shiftfs_copyflags(struct inode *from, struct inode *to)
> > +{
> > +	unsigned int mask = S_SYNC | S_IMMUTABLE | S_APPEND | S_NOATIME;
> >  
> > -	return cred;
> > +	inode_set_flags(to, from->i_flags & mask, mask);
> >  }
> >  
> > -static void shiftfs_old_creds(const struct cred *oldcred,
> > -			      const struct cred **newcred)
> > +static void shiftfs_file_accessed(struct file *file)
> >  {
> > -	if (!*newcred)
> > +	struct inode *upperi, *loweri;
> > +
> > +	if (file->f_flags & O_NOATIME)
> >  		return;
> >  
> > -	revert_creds(oldcred);
> > -	put_cred(*newcred);
> > +	upperi = file_inode(file);
> > +	loweri = upperi->i_private;
> > +
> > +	if (!loweri)
> > +		return;
> > +
> > +	upperi->i_mtime = loweri->i_mtime;
> > +	upperi->i_ctime = loweri->i_ctime;
> > +
> > +	touch_atime(&file->f_path);
> >  }
> >  
> > -static int shiftfs_parse_options(struct shiftfs_super_info *ssi, char *options)
> > +static int shiftfs_parse_mount_options(struct shiftfs_super_info *sbinfo,
> > +				       char *options)
> >  {
> >  	char *p;
> >  	substring_t args[MAX_OPT_ARGS];
> >  
> > -	ssi->mark = false;
> > +	sbinfo->mark = false;
> > +	sbinfo->passthrough = 0;
> >  
> >  	while ((p = strsep(&options, ",")) != NULL) {
> > -		int token;
> > +		int err, intarg, token;
> >  
> >  		if (!*p)
> >  			continue;
> > @@ -91,121 +188,140 @@ static int shiftfs_parse_options(struct shiftfs_super_info *ssi, char *options)
> >  		token = match_token(p, tokens, args);
> >  		switch (token) {
> >  		case OPT_MARK:
> > -			ssi->mark = true;
> > +			sbinfo->mark = true;
> > +			break;
> > +		case OPT_PASSTHROUGH:
> > +			err = match_int(&args[0], &intarg);
> > +			if (err)
> > +				return err;
> > +
> > +			if (intarg & ~SHIFTFS_PASSTHROUGH_ALL)
> > +				return -EINVAL;
> > +
> > +			sbinfo->passthrough = intarg;
> >  			break;
> >  		default:
> >  			return -EINVAL;
> >  		}
> >  	}
> > +
> >  	return 0;
> >  }
> >  
> >  static void shiftfs_d_release(struct dentry *dentry)
> >  {
> > -	struct dentry *real = dentry->d_fsdata;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> >  
> > -	dput(real);
> > +	if (lowerd)
> > +		dput(lowerd);
> >  }
> >  
> >  static struct dentry *shiftfs_d_real(struct dentry *dentry,
> >  				     const struct inode *inode)
> >  {
> > -	struct dentry *real = dentry->d_fsdata;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> > +
> > +	if (inode && d_inode(dentry) == inode)
> > +		return dentry;
> >  
> > -	if (unlikely(real->d_flags & DCACHE_OP_REAL))
> > -		return real->d_op->d_real(real, real->d_inode);
> > +	lowerd = d_real(lowerd, inode);
> > +	if (lowerd && (!inode || inode == d_inode(lowerd)))
> > +		return lowerd;
> >  
> > -	return real;
> > +	WARN(1, "shiftfs_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
> > +	     inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
> > +	return dentry;
> >  }
> >  
> >  static int shiftfs_d_weak_revalidate(struct dentry *dentry, unsigned int flags)
> >  {
> > -	struct dentry *real = dentry->d_fsdata;
> > +	int err = 1;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> >  
> > -	if (d_unhashed(real))
> > +	if (d_is_negative(lowerd) != d_is_negative(dentry))
> >  		return 0;
> >  
> > -	if (!(real->d_flags & DCACHE_OP_WEAK_REVALIDATE))
> > -		return 1;
> > +	if ((lowerd->d_flags & DCACHE_OP_WEAK_REVALIDATE))
> > +		err = lowerd->d_op->d_weak_revalidate(lowerd, flags);
> >  
> > -	return real->d_op->d_weak_revalidate(real, flags);
> > +	if (d_really_is_positive(dentry)) {
> > +		struct inode *inode = d_inode(dentry);
> > +		struct inode *loweri = d_inode(lowerd);
> > +
> > +		shiftfs_copyattr(loweri, inode);
> > +		if (!inode->i_nlink)
> > +			err = 0;
> > +	}
> > +
> > +	return err;
> >  }
> >  
> >  static int shiftfs_d_revalidate(struct dentry *dentry, unsigned int flags)
> >  {
> > -	struct dentry *real = dentry->d_fsdata;
> > -	int ret;
> > +	int err = 1;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> >  
> > -	if (d_unhashed(real))
> > +	if (d_unhashed(lowerd) ||
> > +	    ((d_is_negative(lowerd) != d_is_negative(dentry))))
> >  		return 0;
> >  
> > -	/*
> > -	 * inode state of underlying changed from positive to negative
> > -	 * or vice versa; force a lookup to update our view
> > -	 */
> > -	if (d_is_negative(real) != d_is_negative(dentry))
> > -		return 0;
> > +	if (flags & LOOKUP_RCU)
> > +		return -ECHILD;
> >  
> > -	if (!(real->d_flags & DCACHE_OP_REVALIDATE))
> > -		return 1;
> > +	if ((lowerd->d_flags & DCACHE_OP_REVALIDATE))
> > +		err = lowerd->d_op->d_revalidate(lowerd, flags);
> >  
> > -	ret = real->d_op->d_revalidate(real, flags);
> > +	if (d_really_is_positive(dentry)) {
> > +		struct inode *inode = d_inode(dentry);
> > +		struct inode *loweri = d_inode(lowerd);
> >  
> > -	if (ret == 0 && !(flags & LOOKUP_RCU))
> > -		d_invalidate(real);
> > +		shiftfs_copyattr(loweri, inode);
> > +		if (!inode->i_nlink)
> > +			err = 0;
> > +	}
> >  
> > -	return ret;
> > +	return err;
> >  }
> >  
> >  static const struct dentry_operations shiftfs_dentry_ops = {
> > -	.d_release	= shiftfs_d_release,
> > -	.d_real		= shiftfs_d_real,
> > -	.d_revalidate	= shiftfs_d_revalidate,
> > +	.d_release	   = shiftfs_d_release,
> > +	.d_real		   = shiftfs_d_real,
> > +	.d_revalidate	   = shiftfs_d_revalidate,
> >  	.d_weak_revalidate = shiftfs_d_weak_revalidate,
> >  };
> >  
> > -static int shiftfs_readlink(struct dentry *dentry, char __user *data,
> > -			    int flags)
> > -{
> > -	struct dentry *real = dentry->d_fsdata;
> > -	const struct inode_operations *iop = real->d_inode->i_op;
> > -
> > -	if (iop->readlink)
> > -		return iop->readlink(real, data, flags);
> > -
> > -	return -EINVAL;
> > -}
> > -
> >  static const char *shiftfs_get_link(struct dentry *dentry, struct inode *inode,
> >  				    struct delayed_call *done)
> >  {
> > -	if (dentry) {
> > -		struct dentry *real = dentry->d_fsdata;
> > -		struct inode *reali = real->d_inode;
> > -		const struct inode_operations *iop = reali->i_op;
> > -		const char *res = ERR_PTR(-EPERM);
> > -
> > -		if (iop->get_link)
> > -			res = iop->get_link(real, reali, done);
> > +	const char *p;
> > +	const struct cred *oldcred;
> > +	struct dentry *lowerd;
> >  
> > -		return res;
> > -	} else {
> > -		/* RCU lookup not supported */
> > +	/* RCU lookup not supported */
> > +	if (!dentry)
> >  		return ERR_PTR(-ECHILD);
> > -	}
> > +
> > +	lowerd = dentry->d_fsdata;
> > +	oldcred = shiftfs_override_creds(dentry->d_sb);
> > +	p = vfs_get_link(lowerd, done);
> > +	revert_creds(oldcred);
> > +
> > +	return p;
> >  }
> >  
> >  static int shiftfs_setxattr(struct dentry *dentry, struct inode *inode,
> >  			    const char *name, const void *value,
> >  			    size_t size, int flags)
> >  {
> > -	struct dentry *real = dentry->d_fsdata;
> > -	int err = -EOPNOTSUPP;
> > -	const struct cred *oldcred, *newcred;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> > +	int err;
> > +	const struct cred *oldcred;
> > +
> > +	oldcred = shiftfs_override_creds(dentry->d_sb);
> > +	err = vfs_setxattr(lowerd, name, value, size, flags);
> > +	revert_creds(oldcred);
> >  
> > -	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> > -	err = vfs_setxattr(real, name, value, size, flags);
> > -	shiftfs_old_creds(oldcred, &newcred);
> > +	shiftfs_copyattr(lowerd->d_inode, inode);
> >  
> >  	return err;
> >  }
> > @@ -214,13 +330,13 @@ static int shiftfs_xattr_get(const struct xattr_handler *handler,
> >  			     struct dentry *dentry, struct inode *inode,
> >  			     const char *name, void *value, size_t size)
> >  {
> > -	struct dentry *real = dentry->d_fsdata;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> >  	int err;
> > -	const struct cred *oldcred, *newcred;
> > +	const struct cred *oldcred;
> >  
> > -	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> > -	err = vfs_getxattr(real, name, value, size);
> > -	shiftfs_old_creds(oldcred, &newcred);
> > +	oldcred = shiftfs_override_creds(dentry->d_sb);
> > +	err = vfs_getxattr(lowerd, name, value, size);
> > +	revert_creds(oldcred);
> >  
> >  	return err;
> >  }
> > @@ -228,26 +344,29 @@ static int shiftfs_xattr_get(const struct xattr_handler *handler,
> >  static ssize_t shiftfs_listxattr(struct dentry *dentry, char *list,
> >  				 size_t size)
> >  {
> > -	struct dentry *real = dentry->d_fsdata;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> >  	int err;
> > -	const struct cred *oldcred, *newcred;
> > +	const struct cred *oldcred;
> >  
> > -	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> > -	err = vfs_listxattr(real, list, size);
> > -	shiftfs_old_creds(oldcred, &newcred);
> > +	oldcred = shiftfs_override_creds(dentry->d_sb);
> > +	err = vfs_listxattr(lowerd, list, size);
> > +	revert_creds(oldcred);
> >  
> >  	return err;
> >  }
> >  
> >  static int shiftfs_removexattr(struct dentry *dentry, const char *name)
> >  {
> > -	struct dentry *real = dentry->d_fsdata;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> >  	int err;
> > -	const struct cred *oldcred, *newcred;
> > +	const struct cred *oldcred;
> > +
> > +	oldcred = shiftfs_override_creds(dentry->d_sb);
> > +	err = vfs_removexattr(lowerd, name);
> > +	revert_creds(oldcred);
> >  
> > -	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> > -	err = vfs_removexattr(real, name);
> > -	shiftfs_old_creds(oldcred, &newcred);
> > +	/* update c/mtime */
> > +	shiftfs_copyattr(lowerd->d_inode, d_inode(dentry));
> >  
> >  	return err;
> >  }
> > @@ -262,93 +381,157 @@ static int shiftfs_xattr_set(const struct xattr_handler *handler,
> >  	return shiftfs_setxattr(dentry, inode, name, value, size, flags);
> >  }
> >  
> > -static void shiftfs_fill_inode(struct inode *inode, struct dentry *dentry)
> > +static int shiftfs_inode_test(struct inode *inode, void *data)
> >  {
> > -	struct inode *reali;
> > -
> > -	if (!dentry)
> > -		return;
> > -
> > -	reali = dentry->d_inode;
> > -
> > -	if (!reali->i_op->get_link)
> > -		inode->i_opflags |= IOP_NOFOLLOW;
> > +	return inode->i_private == data;
> > +}
> >  
> > -	inode->i_mapping = reali->i_mapping;
> > -	inode->i_private = dentry;
> > +static int shiftfs_inode_set(struct inode *inode, void *data)
> > +{
> > +	inode->i_private = data;
> > +	return 0;
> >  }
> >  
> > -static int shiftfs_make_object(struct inode *dir, struct dentry *dentry,
> > -			       umode_t mode, const char *symlink,
> > -			       struct dentry *hardlink, bool excl)
> > +static int shiftfs_create_object(struct inode *diri, struct dentry *dentry,
> > +				 umode_t mode, const char *symlink,
> > +				 struct dentry *hardlink, bool excl)
> >  {
> > -	struct dentry *real = dir->i_private, *new = dentry->d_fsdata;
> > -	struct inode *reali = real->d_inode, *newi;
> > -	const struct inode_operations *iop = reali->i_op;
> >  	int err;
> > -	const struct cred *oldcred, *newcred;
> > -	bool op_ok = false;
> > +	const struct cred *oldcred;
> > +	struct cred *newcred;
> > +	void *loweri_iop_ptr = NULL;
> > +	umode_t modei = mode;
> > +	struct super_block *dir_sb = diri->i_sb;
> > +	struct dentry *lowerd_new = dentry->d_fsdata;
> > +	struct inode *inode = NULL, *loweri_dir = diri->i_private;
> > +	const struct inode_operations *loweri_dir_iop = loweri_dir->i_op;
> > +	struct dentry *lowerd_link = NULL;
> >  
> >  	if (hardlink) {
> > -		op_ok = iop->link;
> > +		loweri_iop_ptr = loweri_dir_iop->link;
> >  	} else {
> >  		switch (mode & S_IFMT) {
> >  		case S_IFDIR:
> > -			op_ok = iop->mkdir;
> > +			loweri_iop_ptr = loweri_dir_iop->mkdir;
> >  			break;
> >  		case S_IFREG:
> > -			op_ok = iop->create;
> > +			loweri_iop_ptr = loweri_dir_iop->create;
> >  			break;
> >  		case S_IFLNK:
> > -			op_ok = iop->symlink;
> > +			loweri_iop_ptr = loweri_dir_iop->symlink;
> > +			break;
> > +		case S_IFSOCK:
> > +			/* fall through */
> > +		case S_IFIFO:
> > +			loweri_iop_ptr = loweri_dir_iop->mknod;
> > +			break;
> >  		}
> >  	}
> > -	if (!op_ok)
> > -		return -EINVAL;
> > +	if (!loweri_iop_ptr) {
> > +		err = -EINVAL;
> > +		goto out_iput;
> > +	}
> >  
> > +	inode_lock_nested(loweri_dir, I_MUTEX_PARENT);
> >  
> > -	newi = shiftfs_new_inode(dentry->d_sb, mode, NULL);
> > -	if (!newi)
> > -		return -ENOMEM;
> > +	if (!hardlink) {
> > +		inode = new_inode(dir_sb);
> > +		if (!inode) {
> > +			err = -ENOMEM;
> > +			goto out_iput;
> > +		}
> > +
> > +		/*
> > +		 * new_inode() will have added the new inode to the super
> > +		 * block's list of inodes. Further below we will call
> > +		 * inode_insert5() which would perform the same operation again
> > +		 * thereby corrupting the list. To avoid this raise I_CREATING
> > +		 * in i_state which will cause inode_insert5() to skip this
> > +		 * step. I_CREATING will be cleared by d_instantiate_new()
> > +		 * below.
> > +		 */
> > +		spin_lock(&inode->i_lock);
> > +		inode->i_state |= I_CREATING;
> > +		spin_unlock(&inode->i_lock);
> >  
> > -	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> > +		inode_init_owner(inode, diri, mode);
> > +		modei = inode->i_mode;
> > +	}
> >  
> > -	inode_lock_nested(reali, I_MUTEX_PARENT);
> > +	err = shiftfs_override_object_creds(dentry->d_sb, &oldcred, &newcred,
> > +					    dentry, modei, hardlink != NULL);
> > +	if (err)
> > +		goto out_iput;
> >  
> > -	err = -EINVAL;		/* shut gcc up about uninit var */
> >  	if (hardlink) {
> > -		struct dentry *realhardlink = hardlink->d_fsdata;
> > -
> > -		err = vfs_link(realhardlink, reali, new, NULL);
> > +		lowerd_link = hardlink->d_fsdata;
> > +		err = vfs_link(lowerd_link, loweri_dir, lowerd_new, NULL);
> >  	} else {
> > -		switch (mode & S_IFMT) {
> > +		switch (modei & S_IFMT) {
> >  		case S_IFDIR:
> > -			err = vfs_mkdir(reali, new, mode);
> > +			err = vfs_mkdir(loweri_dir, lowerd_new, modei);
> >  			break;
> >  		case S_IFREG:
> > -			err = vfs_create(reali, new, mode, excl);
> > +			err = vfs_create(loweri_dir, lowerd_new, modei, excl);
> >  			break;
> >  		case S_IFLNK:
> > -			err = vfs_symlink(reali, new, symlink);
> > +			err = vfs_symlink(loweri_dir, lowerd_new, symlink);
> > +			break;
> > +		case S_IFSOCK:
> > +			/* fall through */
> > +		case S_IFIFO:
> > +			err = vfs_mknod(loweri_dir, lowerd_new, modei, 0);
> > +			break;
> > +		default:
> > +			err = -EINVAL;
> > +			break;
> >  		}
> >  	}
> >  
> > -	shiftfs_old_creds(oldcred, &newcred);
> > +	shiftfs_revert_object_creds(oldcred, newcred);
> >  
> > +	if (!err && WARN_ON(!lowerd_new->d_inode))
> > +		err = -EIO;
> >  	if (err)
> > -		goto out_dput;
> > +		goto out_iput;
> > +
> > +	if (hardlink) {
> > +		inode = d_inode(hardlink);
> > +		ihold(inode);
> > +
> > +		/* copy up times from lower inode */
> > +		shiftfs_copyattr(d_inode(lowerd_link), inode);
> > +		set_nlink(d_inode(hardlink), d_inode(lowerd_link)->i_nlink);
> > +		d_instantiate(dentry, inode);
> > +	} else {
> > +		struct inode *inode_tmp;
> > +		struct inode *loweri_new = d_inode(lowerd_new);
> > +
> > +		inode_tmp = inode_insert5(inode, (unsigned long)loweri_new,
> > +					  shiftfs_inode_test, shiftfs_inode_set,
> > +					  loweri_new);
> > +		if (unlikely(inode_tmp != inode)) {
> > +			pr_err_ratelimited("shiftfs: newly created inode found in cache\n");
> > +			iput(inode_tmp);
> > +			err = -EINVAL;
> > +			goto out_iput;
> > +		}
> >  
> > -	shiftfs_fill_inode(newi, new);
> > +		ihold(loweri_new);
> > +		shiftfs_fill_inode(inode, loweri_new->i_ino, loweri_new->i_mode,
> > +				   0, lowerd_new);
> > +		d_instantiate_new(dentry, inode);
> > +	}
> >  
> > -	d_instantiate(dentry, newi);
> > +	shiftfs_copyattr(loweri_dir, diri);
> > +	if (loweri_iop_ptr == loweri_dir_iop->mkdir)
> > +		set_nlink(diri, loweri_dir->i_nlink);
> >  
> > -	new = NULL;
> > -	newi = NULL;
> > +	inode = NULL;
> >  
> > - out_dput:
> > -	dput(new);
> > -	iput(newi);
> > -	inode_unlock(reali);
> > +out_iput:
> > +	iput(inode);
> > +	inode_unlock(loweri_dir);
> >  
> >  	return err;
> >  }
> > @@ -358,7 +541,7 @@ static int shiftfs_create(struct inode *dir, struct dentry *dentry,
> >  {
> >  	mode |= S_IFREG;
> >  
> > -	return shiftfs_make_object(dir, dentry, mode, NULL, NULL, excl);
> > +	return shiftfs_create_object(dir, dentry, mode, NULL, NULL, excl);
> >  }
> >  
> >  static int shiftfs_mkdir(struct inode *dir, struct dentry *dentry,
> > @@ -366,39 +549,52 @@ static int shiftfs_mkdir(struct inode *dir, struct dentry *dentry,
> >  {
> >  	mode |= S_IFDIR;
> >  
> > -	return shiftfs_make_object(dir, dentry, mode, NULL, NULL, false);
> > +	return shiftfs_create_object(dir, dentry, mode, NULL, NULL, false);
> >  }
> >  
> >  static int shiftfs_link(struct dentry *hardlink, struct inode *dir,
> >  			struct dentry *dentry)
> >  {
> > -	return shiftfs_make_object(dir, dentry, 0, NULL, hardlink, false);
> > +	return shiftfs_create_object(dir, dentry, 0, NULL, hardlink, false);
> > +}
> > +
> > +static int shiftfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
> > +			 dev_t rdev)
> > +{
> > +	if (S_ISCHR(mode) || S_ISBLK(mode))
> > +		return -EPERM;
> 
> This isn't a blocker and isn't necessary for the Disco release but I'd
> prefer the test to be switched around:
> 
> if (!S_ISFIFO(mode) && !S_ISSOCK(mode))
> 	return -EPERM;
> 
> > +
> > +	return shiftfs_create_object(dir, dentry, mode, NULL, NULL, false);
> >  }
> >  
> >  static int shiftfs_symlink(struct inode *dir, struct dentry *dentry,
> >  			   const char *symlink)
> >  {
> > -	return shiftfs_make_object(dir, dentry, S_IFLNK, symlink, NULL, false);
> > +	return shiftfs_create_object(dir, dentry, S_IFLNK, symlink, NULL, false);
> >  }
> >  
> >  static int shiftfs_rm(struct inode *dir, struct dentry *dentry, bool rmdir)
> >  {
> > -	struct dentry *real = dir->i_private, *new = dentry->d_fsdata;
> > -	struct inode *reali = real->d_inode;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> > +	struct inode *loweri = dir->i_private;
> >  	int err;
> > -	const struct cred *oldcred, *newcred;
> > -
> > -	inode_lock_nested(reali, I_MUTEX_PARENT);
> > -
> > -	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> > +	const struct cred *oldcred;
> >  
> > +	oldcred = shiftfs_override_creds(dentry->d_sb);
> > +	inode_lock_nested(loweri, I_MUTEX_PARENT);
> >  	if (rmdir)
> > -		err = vfs_rmdir(reali, new);
> > +		err = vfs_rmdir(loweri, lowerd);
> >  	else
> > -		err = vfs_unlink(reali, new, NULL);
> > +		err = vfs_unlink(loweri, lowerd, NULL);
> > +	inode_unlock(loweri);
> > +	revert_creds(oldcred);
> >  
> > -	shiftfs_old_creds(oldcred, &newcred);
> > -	inode_unlock(reali);
> > +	shiftfs_copyattr(loweri, dir);
> > +	set_nlink(d_inode(dentry), loweri->i_nlink);
> > +	if (!err)
> > +		d_drop(dentry);
> > +
> > +	set_nlink(dir, loweri->i_nlink);
> >  
> >  	return err;
> >  }
> > @@ -417,27 +613,30 @@ static int shiftfs_rename(struct inode *olddir, struct dentry *old,
> >  			  struct inode *newdir, struct dentry *new,
> >  			  unsigned int flags)
> >  {
> > -	struct dentry *rodd = olddir->i_private, *rndd = newdir->i_private,
> > -		*realold = old->d_fsdata,
> > -		*realnew = new->d_fsdata, *trap;
> > -	struct inode *realolddir = rodd->d_inode, *realnewdir = rndd->d_inode;
> > +	struct dentry *lowerd_dir_old = old->d_parent->d_fsdata,
> > +		      *lowerd_dir_new = new->d_parent->d_fsdata,
> > +		      *lowerd_old = old->d_fsdata, *lowerd_new = new->d_fsdata,
> > +		      *trapd;
> > +	struct inode *loweri_dir_old = lowerd_dir_old->d_inode,
> > +		     *loweri_dir_new = lowerd_dir_new->d_inode;
> >  	int err = -EINVAL;
> > -	const struct cred *oldcred, *newcred;
> > +	const struct cred *oldcred;
> >  
> > -	trap = lock_rename(rndd, rodd);
> > +	trapd = lock_rename(lowerd_dir_new, lowerd_dir_old);
> >  
> > -	if (trap == realold || trap == realnew)
> > +	if (trapd == lowerd_old || trapd == lowerd_new)
> >  		goto out_unlock;
> >  
> > -	oldcred = shiftfs_new_creds(&newcred, old->d_sb);
> > -
> > -	err = vfs_rename(realolddir, realold, realnewdir,
> > -			 realnew, NULL, flags);
> > +	oldcred = shiftfs_override_creds(old->d_sb);
> > +	err = vfs_rename(loweri_dir_old, lowerd_old, loweri_dir_new, lowerd_new,
> > +			 NULL, flags);
> > +	revert_creds(oldcred);
> >  
> > -	shiftfs_old_creds(oldcred, &newcred);
> > +	shiftfs_copyattr(loweri_dir_old, olddir);
> > +	shiftfs_copyattr(loweri_dir_new, newdir);
> >  
> > - out_unlock:
> > -	unlock_rename(rndd, rodd);
> > +out_unlock:
> > +	unlock_rename(lowerd_dir_new, lowerd_dir_old);
> >  
> >  	return err;
> >  }
> > @@ -445,304 +644,1205 @@ static int shiftfs_rename(struct inode *olddir, struct dentry *old,
> >  static struct dentry *shiftfs_lookup(struct inode *dir, struct dentry *dentry,
> >  				     unsigned int flags)
> >  {
> > -	struct dentry *real = dir->i_private, *new;
> > -	struct inode *reali = real->d_inode, *newi;
> > -	const struct cred *oldcred, *newcred;
> > -
> > -	inode_lock(reali);
> > -	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> > -	new = lookup_one_len(dentry->d_name.name, real, dentry->d_name.len);
> > -	shiftfs_old_creds(oldcred, &newcred);
> > -	inode_unlock(reali);
> > +	struct dentry *new;
> > +	struct inode *newi;
> > +	const struct cred *oldcred;
> > +	struct dentry *lowerd = dentry->d_parent->d_fsdata;
> > +	struct inode *inode = NULL, *loweri = lowerd->d_inode;
> > +
> > +	inode_lock(loweri);
> > +	oldcred = shiftfs_override_creds(dentry->d_sb);
> > +	new = lookup_one_len(dentry->d_name.name, lowerd, dentry->d_name.len);
> > +	revert_creds(oldcred);
> > +	inode_unlock(loweri);
> >  
> >  	if (IS_ERR(new))
> >  		return new;
> >  
> >  	dentry->d_fsdata = new;
> >  
> > -	newi = NULL;
> > -	if (!new->d_inode)
> > +	newi = new->d_inode;
> > +	if (!newi)
> >  		goto out;
> >  
> > -	newi = shiftfs_new_inode(dentry->d_sb, new->d_inode->i_mode, new);
> > -	if (!newi) {
> > +	inode = iget5_locked(dentry->d_sb, (unsigned long)newi,
> > +			     shiftfs_inode_test, shiftfs_inode_set, newi);
> > +	if (!inode) {
> >  		dput(new);
> >  		return ERR_PTR(-ENOMEM);
> >  	}
> > +	if (inode->i_state & I_NEW) {
> > +		/*
> > +		 * inode->i_private set by shiftfs_inode_set(), but we still
> > +		 * need to take a reference
> > +		*/
> > +		ihold(newi);
> > +		shiftfs_fill_inode(inode, newi->i_ino, newi->i_mode, 0, new);
> > +		unlock_new_inode(inode);
> > +	}
> >  
> > - out:
> > -	return d_splice_alias(newi, dentry);
> > +out:
> > +	return d_splice_alias(inode, dentry);
> >  }
> >  
> >  static int shiftfs_permission(struct inode *inode, int mask)
> >  {
> > -	struct dentry *real = inode->i_private;
> > -	struct inode *reali = real->d_inode;
> > -	const struct inode_operations *iop = reali->i_op;
> >  	int err;
> > -	const struct cred *oldcred, *newcred;
> > +	const struct cred *oldcred;
> > +	struct inode *loweri = inode->i_private;
> >  
> > -	if (mask & MAY_NOT_BLOCK)
> > +	if (!loweri) {
> > +		WARN_ON(!(mask & MAY_NOT_BLOCK));
> >  		return -ECHILD;
> > +	}
> >  
> > -	oldcred = shiftfs_new_creds(&newcred, inode->i_sb);
> > -	if (iop->permission)
> > -		err = iop->permission(reali, mask);
> > -	else
> > -		err = generic_permission(reali, mask);
> > -	shiftfs_old_creds(oldcred, &newcred);
> > +	err = generic_permission(inode, mask);
> > +	if (err)
> > +		return err;
> > +
> > +	oldcred = shiftfs_override_creds(inode->i_sb);
> > +	err = inode_permission(loweri, mask);
> > +	revert_creds(oldcred);
> > +
> > +	return err;
> > +}
> > +
> > +static int shiftfs_fiemap(struct inode *inode,
> > +			  struct fiemap_extent_info *fieinfo, u64 start,
> > +			  u64 len)
> > +{
> > +	int err;
> > +	const struct cred *oldcred;
> > +	struct inode *loweri = inode->i_private;
> > +
> > +	if (!loweri->i_op->fiemap)
> > +		return -EOPNOTSUPP;
> > +
> > +	oldcred = shiftfs_override_creds(inode->i_sb);
> > +	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
> > +		filemap_write_and_wait(loweri->i_mapping);
> > +	err = loweri->i_op->fiemap(loweri, fieinfo, start, len);
> > +	revert_creds(oldcred);
> > +
> > +	return err;
> > +}
> > +
> > +static int shiftfs_tmpfile(struct inode *dir, struct dentry *dentry,
> > +			   umode_t mode)
> > +{
> > +	int err;
> > +	const struct cred *oldcred;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> > +	struct inode *loweri = dir->i_private;
> > +
> > +	if (!loweri->i_op->tmpfile)
> > +		return -EOPNOTSUPP;
> > +
> > +	oldcred = shiftfs_override_creds(dir->i_sb);
> > +	err = loweri->i_op->tmpfile(loweri, lowerd, mode);
> > +	revert_creds(oldcred);
> >  
> >  	return err;
> >  }
> >  
> >  static int shiftfs_setattr(struct dentry *dentry, struct iattr *attr)
> >  {
> > -	struct dentry *real = dentry->d_fsdata;
> > -	struct inode *reali = real->d_inode;
> > -	const struct inode_operations *iop = reali->i_op;
> > +	struct dentry *lowerd = dentry->d_fsdata;
> > +	struct inode *loweri = lowerd->d_inode;
> >  	struct iattr newattr = *attr;
> > -	const struct cred *oldcred, *newcred;
> > +	const struct cred *oldcred;
> >  	struct super_block *sb = dentry->d_sb;
> >  	int err;
> >  
> > +	err = setattr_prepare(dentry, attr);
> > +	if (err)
> > +		return err;
> > +
> >  	newattr.ia_uid = KUIDT_INIT(from_kuid(sb->s_user_ns, attr->ia_uid));
> >  	newattr.ia_gid = KGIDT_INIT(from_kgid(sb->s_user_ns, attr->ia_gid));
> >  
> > -	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
> > -	inode_lock(reali);
> > -	if (iop->setattr)
> > -		err = iop->setattr(real, &newattr);
> > -	else
> > -		err = simple_setattr(real, &newattr);
> > -	inode_unlock(reali);
> > -	shiftfs_old_creds(oldcred, &newcred);
> > +	inode_lock(loweri);
> > +	oldcred = shiftfs_override_creds(dentry->d_sb);
> > +	err = notify_change(lowerd, attr, NULL);
> > +	revert_creds(oldcred);
> > +	inode_unlock(loweri);
> >  
> > -	if (err)
> > -		return err;
> > +	shiftfs_copyattr(loweri, d_inode(dentry));
> >  
> > -	/* all OK, reflect the change on our inode */
> > -	setattr_copy(d_inode(dentry), attr);
> > -	return 0;
> > +	return err;
> >  }
> >  
> >  static int shiftfs_getattr(const struct path *path, struct kstat *stat,
> >  			   u32 request_mask, unsigned int query_flags)
> >  {
> >  	struct inode *inode = path->dentry->d_inode;
> > -	struct dentry *real = path->dentry->d_fsdata;
> > -	struct inode *reali = real->d_inode;
> > -	const struct inode_operations *iop = reali->i_op;
> > -	struct path newpath = { .mnt = path->dentry->d_sb->s_fs_info, .dentry = real };
> > -	int err = 0;
> > -
> > -	if (iop->getattr)
> > -		err = iop->getattr(&newpath, stat, request_mask, query_flags);
> > -	else
> > -		generic_fillattr(reali, stat);
> > +	struct dentry *lowerd = path->dentry->d_fsdata;
> > +	struct inode *loweri = lowerd->d_inode;
> > +	struct shiftfs_super_info *info = path->dentry->d_sb->s_fs_info;
> > +	struct path newpath = { .mnt = info->mnt, .dentry = lowerd };
> > +	struct user_namespace *from_ns = loweri->i_sb->s_user_ns;
> > +	struct user_namespace *to_ns = inode->i_sb->s_user_ns;
> > +	const struct cred *oldcred;
> > +	int err;
> > +
> > +	oldcred = shiftfs_override_creds(inode->i_sb);
> > +	err = vfs_getattr(&newpath, stat, request_mask, query_flags);
> > +	revert_creds(oldcred);
> >  
> >  	if (err)
> >  		return err;
> >  
> >  	/* transform the underlying id */
> > -	stat->uid = make_kuid(inode->i_sb->s_user_ns, __kuid_val(stat->uid));
> > -	stat->gid = make_kgid(inode->i_sb->s_user_ns, __kgid_val(stat->gid));
> > +	stat->uid = shift_kuid(from_ns, to_ns, stat->uid);
> > +	stat->gid = shift_kgid(from_ns, to_ns, stat->gid);
> >  	return 0;
> >  }
> >  
> > -static const struct inode_operations shiftfs_inode_ops = {
> > -	.lookup		= shiftfs_lookup,
> > -	.getattr	= shiftfs_getattr,
> > -	.setattr	= shiftfs_setattr,
> > -	.permission	= shiftfs_permission,
> > -	.mkdir		= shiftfs_mkdir,
> > -	.symlink	= shiftfs_symlink,
> > -	.get_link	= shiftfs_get_link,
> > -	.readlink	= shiftfs_readlink,
> > -	.unlink		= shiftfs_unlink,
> > -	.rmdir		= shiftfs_rmdir,
> > -	.rename		= shiftfs_rename,
> > -	.link		= shiftfs_link,
> > -	.create		= shiftfs_create,
> > -	.mknod		= NULL,	/* no special files currently */
> > -	.listxattr	= shiftfs_listxattr,
> > -};
> > +#ifdef CONFIG_SHIFT_FS_POSIX_ACL
> >  
> > -static struct inode *shiftfs_new_inode(struct super_block *sb, umode_t mode,
> > -				       struct dentry *dentry)
> > +static int
> > +shift_acl_ids(struct user_namespace *from, struct user_namespace *to,
> > +	      struct posix_acl *acl)
> >  {
> > -	struct inode *inode;
> > -
> > -	inode = new_inode(sb);
> > -	if (!inode)
> > -		return NULL;
> > -
> > -	/*
> > -	 * our inode is completely vestigial.  All lookups, getattr
> > -	 * and permission checks are done on the underlying inode, so
> > -	 * what the user sees is entirely from the underlying inode.
> > -	 */
> > -	mode &= S_IFMT;
> > +	int i;
> > +
> > +	for (i = 0; i < acl->a_count; i++) {
> > +		struct posix_acl_entry *e = &acl->a_entries[i];
> > +		switch(e->e_tag) {
> > +		case ACL_USER:
> > +			e->e_uid = shift_kuid(from, to, e->e_uid);
> > +			if (!uid_valid(e->e_uid))
> > +				return -EOVERFLOW;
> > +			break;
> > +		case ACL_GROUP:
> > +			e->e_gid = shift_kgid(from, to, e->e_gid);
> > +			if (!gid_valid(e->e_gid))
> > +				return -EOVERFLOW;
> > +			break;
> > +		}
> > +	}
> > +	return 0;
> > +}
> >  
> > -	inode->i_ino = get_next_ino();
> > -	inode->i_mode = mode;
> > -	inode->i_flags |= S_NOATIME | S_NOCMTIME;
> > +static void
> > +shift_acl_xattr_ids(struct user_namespace *from, struct user_namespace *to,
> > +		    void *value, size_t size)
> > +{
> > +	struct posix_acl_xattr_header *header = value;
> > +	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
> > +	int count;
> > +	kuid_t kuid;
> > +	kgid_t kgid;
> >  
> > -	inode->i_op = &shiftfs_inode_ops;
> > +	if (!value)
> > +		return;
> > +	if (size < sizeof(struct posix_acl_xattr_header))
> > +		return;
> > +	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
> > +		return;
> >  
> > -	shiftfs_fill_inode(inode, dentry);
> > +	count = posix_acl_xattr_count(size);
> > +	if (count < 0)
> > +		return;
> > +	if (count == 0)
> > +		return;
> >  
> > -	return inode;
> > +	for (end = entry + count; entry != end; entry++) {
> > +		switch(le16_to_cpu(entry->e_tag)) {
> > +		case ACL_USER:
> > +			kuid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
> > +			kuid = shift_kuid(from, to, kuid);
> > +			entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, kuid));
> > +			break;
> > +		case ACL_GROUP:
> > +			kgid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
> > +			kgid = shift_kgid(from, to, kgid);
> > +			entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, kgid));
> > +			break;
> > +		default:
> > +			break;
> > +		}
> > +	}
> >  }
> >  
> > -static int shiftfs_show_options(struct seq_file *m, struct dentry *dentry)
> > +static struct posix_acl *shiftfs_get_acl(struct inode *inode, int type)
> >  {
> > -	struct super_block *sb = dentry->d_sb;
> > -	struct shiftfs_super_info *ssi = sb->s_fs_info;
> > +	struct inode *loweri = inode->i_private;
> > +	const struct cred *oldcred;
> > +	struct posix_acl *lower_acl, *acl = NULL;
> > +	struct user_namespace *from_ns = loweri->i_sb->s_user_ns;
> > +	struct user_namespace *to_ns = inode->i_sb->s_user_ns;
> > +	int size;
> > +	int err;
> >  
> > -	if (ssi->mark)
> > -		seq_show_option(m, "mark", NULL);
> > +	if (!IS_POSIXACL(loweri))
> > +		return NULL;
> >  
> > -	return 0;
> > -}
> > +	oldcred = shiftfs_override_creds(inode->i_sb);
> > +	lower_acl = get_acl(loweri, type);
> > +	revert_creds(oldcred);
> >  
> > -static int shiftfs_statfs(struct dentry *dentry, struct kstatfs *buf)
> > -{
> > -	struct super_block *sb = dentry->d_sb;
> > -	struct shiftfs_super_info *ssi = sb->s_fs_info;
> > -	struct dentry *root = sb->s_root;
> > -	struct dentry *realroot = root->d_fsdata;
> > -	struct path realpath = { .mnt = ssi->mnt, .dentry = realroot };
> > -	int err;
> > +	if (lower_acl && !IS_ERR(lower_acl)) {
> > +		/* XXX: export posix_acl_clone? */
> > +		size = sizeof(struct posix_acl) +
> > +		       lower_acl->a_count * sizeof(struct posix_acl_entry);
> > +		acl = kmemdup(lower_acl, size, GFP_KERNEL);
> > +		posix_acl_release(lower_acl);
> >  
> > -	err = vfs_statfs(&realpath, buf);
> > -	if (err)
> > -		return err;
> > +		if (!acl)
> > +			return ERR_PTR(-ENOMEM);
> >  
> > -	buf->f_type = sb->s_magic;
> > +		refcount_set(&acl->a_refcount, 1);
> >  
> > -	return 0;
> > +		err = shift_acl_ids(from_ns, to_ns, acl);
> > +		if (err) {
> > +			kfree(acl);
> > +			return ERR_PTR(err);
> > +		}
> > +	}
> > +
> > +	return acl;
> >  }
> >  
> > -static void shiftfs_put_super(struct super_block *sb)
> > +static int
> > +shiftfs_posix_acl_xattr_get(const struct xattr_handler *handler,
> > +			   struct dentry *dentry, struct inode *inode,
> > +			   const char *name, void *buffer, size_t size)
> >  {
> > -	struct shiftfs_super_info *ssi = sb->s_fs_info;
> > +	struct inode *loweri = inode->i_private;
> > +	int ret;
> > +
> > +	ret = shiftfs_xattr_get(NULL, dentry, inode, handler->name,
> > +				buffer, size);
> > +	if (ret < 0)
> > +		return ret;
> >  
> > -	mntput(ssi->mnt);
> > -	put_user_ns(ssi->userns);
> > -	kfree(ssi);
> > +	inode_lock(loweri);
> > +	shift_acl_xattr_ids(loweri->i_sb->s_user_ns, inode->i_sb->s_user_ns,
> > +			    buffer, size);
> > +	inode_unlock(loweri);
> > +	return ret;
> >  }
> >  
> > -static const struct xattr_handler shiftfs_xattr_handler = {
> > -	.prefix = "",
> > -	.get    = shiftfs_xattr_get,
> > -	.set    = shiftfs_xattr_set,
> > -};
> > +static int
> > +shiftfs_posix_acl_xattr_set(const struct xattr_handler *handler,
> > +			    struct dentry *dentry, struct inode *inode,
> > +			    const char *name, const void *value,
> > +			    size_t size, int flags)
> > +{
> > +	struct inode *loweri = inode->i_private;
> > +	int err;
> >  
> > -const struct xattr_handler *shiftfs_xattr_handlers[] = {
> > -	&shiftfs_xattr_handler,
> > -	NULL
> > -};
> > +	if (!IS_POSIXACL(loweri) || !loweri->i_op->set_acl)
> > +		return -EOPNOTSUPP;
> > +	if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
> > +		return value ? -EACCES : 0;
> > +	if (!inode_owner_or_capable(inode))
> > +		return -EPERM;
> > +
> > +	if (value) {
> > +		shift_acl_xattr_ids(inode->i_sb->s_user_ns,
> > +				    loweri->i_sb->s_user_ns,
> > +				    (void *)value, size);
> > +		err = shiftfs_setxattr(dentry, inode, handler->name, value,
> > +				       size, flags);
> > +	} else {
> > +		err = shiftfs_removexattr(dentry, handler->name);
> > +	}
> >  
> > -static const struct super_operations shiftfs_super_ops = {
> > -	.put_super	= shiftfs_put_super,
> > -	.show_options	= shiftfs_show_options,
> > -	.statfs		= shiftfs_statfs,
> > +	if (!err)
> > +		shiftfs_copyattr(loweri, inode);
> > +
> > +	return err;
> > +}
> > +
> > +static const struct xattr_handler
> > +shiftfs_posix_acl_access_xattr_handler = {
> > +	.name = XATTR_NAME_POSIX_ACL_ACCESS,
> > +	.flags = ACL_TYPE_ACCESS,
> > +	.get = shiftfs_posix_acl_xattr_get,
> > +	.set = shiftfs_posix_acl_xattr_set,
> >  };
> >  
> > -struct shiftfs_data {
> > -	void *data;
> > -	const char *path;
> > +static const struct xattr_handler
> > +shiftfs_posix_acl_default_xattr_handler = {
> > +	.name = XATTR_NAME_POSIX_ACL_DEFAULT,
> > +	.flags = ACL_TYPE_DEFAULT,
> > +	.get = shiftfs_posix_acl_xattr_get,
> > +	.set = shiftfs_posix_acl_xattr_set,
> >  };
> >  
> > -static int shiftfs_fill_super(struct super_block *sb, void *raw_data,
> > -			      int silent)
> > -{
> > -	struct shiftfs_data *data = raw_data;
> > -	char *name = kstrdup(data->path, GFP_KERNEL);
> > -	int err = -ENOMEM;
> > -	struct shiftfs_super_info *ssi = NULL;
> > -	struct path path;
> > -	struct dentry *dentry;
> > +#else /* !CONFIG_SHIFT_FS_POSIX_ACL */
> >  
> > -	if (!name)
> > -		goto out;
> > +#define shiftfs_get_acl NULL
> >  
> > -	ssi = kzalloc(sizeof(*ssi), GFP_KERNEL);
> > -	if (!ssi)
> > -		goto out;
> > +#endif /* CONFIG_SHIFT_FS_POSIX_ACL */
> >  
> > -	err = -EPERM;
> > -	err = shiftfs_parse_options(ssi, data->data);
> > +static const struct inode_operations shiftfs_dir_inode_operations = {
> > +	.lookup		= shiftfs_lookup,
> > +	.mkdir		= shiftfs_mkdir,
> > +	.symlink	= shiftfs_symlink,
> > +	.unlink		= shiftfs_unlink,
> > +	.rmdir		= shiftfs_rmdir,
> > +	.rename		= shiftfs_rename,
> > +	.link		= shiftfs_link,
> > +	.setattr	= shiftfs_setattr,
> > +	.create		= shiftfs_create,
> > +	.mknod		= shiftfs_mknod,
> > +	.permission	= shiftfs_permission,
> > +	.getattr	= shiftfs_getattr,
> > +	.listxattr	= shiftfs_listxattr,
> > +	.get_acl	= shiftfs_get_acl,
> > +};
> > +
> > +static const struct inode_operations shiftfs_file_inode_operations = {
> > +	.fiemap		= shiftfs_fiemap,
> > +	.getattr	= shiftfs_getattr,
> > +	.get_acl	= shiftfs_get_acl,
> > +	.listxattr	= shiftfs_listxattr,
> > +	.permission	= shiftfs_permission,
> > +	.setattr	= shiftfs_setattr,
> > +	.tmpfile	= shiftfs_tmpfile,
> > +};
> > +
> > +static const struct inode_operations shiftfs_special_inode_operations = {
> > +	.getattr	= shiftfs_getattr,
> > +	.get_acl	= shiftfs_get_acl,
> > +	.listxattr	= shiftfs_listxattr,
> > +	.permission	= shiftfs_permission,
> > +	.setattr	= shiftfs_setattr,
> > +};
> > +
> > +static const struct inode_operations shiftfs_symlink_inode_operations = {
> > +	.getattr	= shiftfs_getattr,
> > +	.get_link	= shiftfs_get_link,
> > +	.listxattr	= shiftfs_listxattr,
> > +	.setattr	= shiftfs_setattr,
> > +};
> > +
> > +static struct file *shiftfs_open_realfile(const struct file *file,
> > +					  struct path *realpath)
> > +{
> > +	struct file *lowerf;
> > +	const struct cred *oldcred;
> > +	struct inode *inode = file_inode(file);
> > +	struct inode *loweri = realpath->dentry->d_inode;
> > +	struct shiftfs_super_info *info = inode->i_sb->s_fs_info;
> > +
> > +	oldcred = shiftfs_override_creds(inode->i_sb);
> > +	/* XXX: open_with_fake_path() not guaranteed to stay around, if
> > +	 * removed use dentry_open() */
> > +	lowerf = open_with_fake_path(realpath, file->f_flags, loweri, info->creator_cred);
> > +	revert_creds(oldcred);
> > +
> > +	return lowerf;
> > +}
> > +
> > +#define SHIFTFS_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT)
> > +
> > +static int shiftfs_change_flags(struct file *file, unsigned int flags)
> > +{
> > +	struct inode *inode = file_inode(file);
> > +	int err;
> > +
> > +	/* if some flag changed that cannot be changed then something's amiss */
> > +	if (WARN_ON((file->f_flags ^ flags) & ~SHIFTFS_SETFL_MASK))
> > +		return -EIO;
> > +
> > +	flags &= SHIFTFS_SETFL_MASK;
> > +
> > +	if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
> > +		return -EPERM;
> > +
> > +	if (flags & O_DIRECT) {
> > +		if (!file->f_mapping->a_ops ||
> > +		    !file->f_mapping->a_ops->direct_IO)
> > +			return -EINVAL;
> > +	}
> > +
> > +	if (file->f_op->check_flags) {
> > +		err = file->f_op->check_flags(flags);
> > +		if (err)
> > +			return err;
> > +	}
> > +
> > +	spin_lock(&file->f_lock);
> > +	file->f_flags = (file->f_flags & ~SHIFTFS_SETFL_MASK) | flags;
> > +	spin_unlock(&file->f_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +static int shiftfs_real_fdget(const struct file *file, struct fd *lowerfd)
> > +{
> > +	struct shiftfs_file_info *file_info = file->private_data;
> > +	struct file *realfile = file_info->realfile;
> > +
> > +	lowerfd->flags = 0;
> > +	lowerfd->file = realfile;
> > +
> > +	/* Did the flags change since open? */
> > +	if (unlikely(file->f_flags & lowerfd->file->f_flags))
> 
> Is this the right bitwise operation? Maybe I'm misunderstanding the
> intent but I'd have thought you would want to XOR the two f_flags to
> detect any changes.
> 
> > +		return shiftfs_change_flags(lowerfd->file, file->f_flags);
> > +
> > +	return 0;
> > +}
> > +
> > +static int shiftfs_open(struct inode *inode, struct file *file)
> > +{
> > +	struct shiftfs_super_info *ssi = inode->i_sb->s_fs_info;
> > +	struct shiftfs_file_info *file_info;
> > +	struct file *realfile;
> > +	struct path *realpath;
> > +
> > +	file_info = kmem_cache_zalloc(shiftfs_file_info_cache, GFP_KERNEL);
> > +	if (!file_info)
> > +		return -ENOMEM;
> > +	file->private_data = file_info;
> > +
> > +	realpath = &file_info->realpath;
> > +	realpath->mnt = ssi->mnt;
> > +	realpath->dentry = file->f_path.dentry->d_fsdata;
> > +
> > +	realfile = shiftfs_open_realfile(file, realpath);
> > +	if (IS_ERR(realfile)) {
> > +		kfree(file_info);
> 
> This should be a call to kmem_cache_free() instead of kfree().
> 
> Also, I think you'll want to set file->private_data to NULL in this
> error path so that we don't try to re-free it in shiftfs_release().

I think we can simply move the initialization of private_data to after
the check of shiftfs_open_realfile()'s return value; that way we fix all
of that without assigning twice.

> 
> > +		return PTR_ERR(realfile);
> > +	}
> > +
> > +	file_info->realfile = realfile;
> > +	return 0;
> > +}
> > +
> > +static int shiftfs_release(struct inode *inode, struct file *file)
> > +{
> > +	struct shiftfs_file_info *file_info = file->private_data;
> > +
> > +	fput(file_info->realfile);
> 
> As alluded to above, I think shiftfs_release() can be reached when
> shiftfs_open() fails. Check dentry_open() for such a sequence. In that
> case, you'll only want to call fput() when file_info and
> file_info->realfile are both non-NULL.
> 
> > +	kmem_cache_free(shiftfs_file_info_cache, file_info);
> 
> May as well only do this when file_info is non-NULL, too.
> 
> Tyler
> 
> > +	return 0;
> > +}
> > +
> > +static loff_t shiftfs_llseek(struct file *file, loff_t offset, int whence)
> > +{
> > +	struct inode *realinode = file_inode(file)->i_private;
> > +
> > +	return generic_file_llseek_size(file, offset, whence,
> > +					realinode->i_sb->s_maxbytes,
> > +					i_size_read(realinode));
> > +}
> > +
> > +/* XXX: Need to figure out what to do about atime updates, maybe other
> > + * timestamps too ... ref. ovl_file_accessed() */
> > +
> > +static rwf_t shiftfs_iocb_to_rwf(struct kiocb *iocb)
> > +{
> > +	int ifl = iocb->ki_flags;
> > +	rwf_t flags = 0;
> > +
> > +	if (ifl & IOCB_NOWAIT)
> > +		flags |= RWF_NOWAIT;
> > +	if (ifl & IOCB_HIPRI)
> > +		flags |= RWF_HIPRI;
> > +	if (ifl & IOCB_DSYNC)
> > +		flags |= RWF_DSYNC;
> > +	if (ifl & IOCB_SYNC)
> > +		flags |= RWF_SYNC;
> > +
> > +	return flags;
> > +}
> > +
> > +static ssize_t shiftfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
> > +{
> > +	struct file *file = iocb->ki_filp;
> > +	struct fd lowerfd;
> > +	const struct cred *oldcred;
> > +	ssize_t ret;
> > +
> > +	if (!iov_iter_count(iter))
> > +		return 0;
> > +
> > +	ret = shiftfs_real_fdget(file, &lowerfd);
> > +	if (ret)
> > +		return ret;
> > +
> > +	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
> > +	ret = vfs_iter_read(lowerfd.file, iter, &iocb->ki_pos,
> > +			    shiftfs_iocb_to_rwf(iocb));
> > +	revert_creds(oldcred);
> > +
> > +	shiftfs_file_accessed(file);
> > +
> > +	fdput(lowerfd);
> > +	return ret;
> > +}
> > +
> > +static ssize_t shiftfs_write_iter(struct kiocb *iocb, struct iov_iter *iter)
> > +{
> > +	struct file *file = iocb->ki_filp;
> > +	struct inode *inode = file_inode(file);
> > +	struct fd lowerfd;
> > +	const struct cred *oldcred;
> > +	ssize_t ret;
> > +
> > +	if (!iov_iter_count(iter))
> > +		return 0;
> > +
> > +	inode_lock(inode);
> > +	/* Update mode */
> > +	shiftfs_copyattr(inode->i_private, inode);
> > +	ret = file_remove_privs(file);
> > +	if (ret)
> > +		goto out_unlock;
> > +
> > +	ret = shiftfs_real_fdget(file, &lowerfd);
> > +	if (ret)
> > +		goto out_unlock;
> > +
> > +	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
> > +	file_start_write(lowerfd.file);
> > +	ret = vfs_iter_write(lowerfd.file, iter, &iocb->ki_pos,
> > +			     shiftfs_iocb_to_rwf(iocb));
> > +	file_end_write(lowerfd.file);
> > +	revert_creds(oldcred);
> > +
> > +	/* Update size */
> > +	shiftfs_copyattr(inode->i_private, inode);
> > +
> > +	fdput(lowerfd);
> > +
> > +out_unlock:
> > +	inode_unlock(inode);
> > +	return ret;
> > +}
> > +
> > +static int shiftfs_fsync(struct file *file, loff_t start, loff_t end,
> > +			 int datasync)
> > +{
> > +	struct fd lowerfd;
> > +	const struct cred *oldcred;
> > +	int ret;
> > +
> > +	ret = shiftfs_real_fdget(file, &lowerfd);
> > +	if (ret)
> > +		return ret;
> > +
> > +	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
> > +	ret = vfs_fsync_range(lowerfd.file, start, end, datasync);
> > +	revert_creds(oldcred);
> > +
> > +	fdput(lowerfd);
> > +	return ret;
> > +}
> > +
> > +static int shiftfs_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +	struct shiftfs_file_info *file_info = file->private_data;
> > +	struct file *realfile = file_info->realfile;
> > +	const struct cred *oldcred;
> > +	int ret;
> > +
> > +	if (!realfile->f_op->mmap)
> > +		return -ENODEV;
> > +
> > +	if (WARN_ON(file != vma->vm_file))
> > +		return -EIO;
> > +
> > +	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
> > +	vma->vm_file = get_file(realfile);
> > +	ret = call_mmap(vma->vm_file, vma);
> > +	revert_creds(oldcred);
> > +
> > +	shiftfs_file_accessed(file);
> > +
> > +	if (ret)
> > +		fput(realfile); /* Drop refcount from new vm_file value */
> > +	else
> > +		fput(file); /* Drop refcount from previous vm_file value */
> > +
> > +	return ret;
> > +}
> > +
> > +static long shiftfs_fallocate(struct file *file, int mode, loff_t offset,
> > +			      loff_t len)
> > +{
> > +	struct inode *inode = file_inode(file);
> > +	struct inode *loweri = inode->i_private;
> > +	struct fd lowerfd;
> > +	const struct cred *oldcred;
> > +	int ret;
> > +
> > +	ret = shiftfs_real_fdget(file, &lowerfd);
> > +	if (ret)
> > +		return ret;
> > +
> > +	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
> > +	ret = vfs_fallocate(lowerfd.file, mode, offset, len);
> > +	revert_creds(oldcred);
> > +
> > +	/* Update size */
> > +	shiftfs_copyattr(loweri, inode);
> > +
> > +	fdput(lowerfd);
> > +	return ret;
> > +}
> > +
> > +static int shiftfs_fadvise(struct file *file, loff_t offset, loff_t len,
> > +			   int advice)
> > +{
> > +	struct fd lowerfd;
> > +	const struct cred *oldcred;
> > +	int ret;
> > +
> > +	ret = shiftfs_real_fdget(file, &lowerfd);
> > +	if (ret)
> > +		return ret;
> > +
> > +	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
> > +	ret = vfs_fadvise(lowerfd.file, offset, len, advice);
> > +	revert_creds(oldcred);
> > +
> > +	fdput(lowerfd);
> > +	return ret;
> > +}
> > +
> > +static int shiftfs_override_ioctl_creds(const struct super_block *sb,
> > +					const struct cred **oldcred,
> > +					struct cred **newcred)
> > +{
> > +	kuid_t fsuid = current_fsuid();
> > +	kgid_t fsgid = current_fsgid();
> > +
> > +	*oldcred = shiftfs_override_creds(sb);
> > +
> > +	*newcred = prepare_creds();
> > +	if (!*newcred) {
> > +		revert_creds(*oldcred);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	(*newcred)->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, fsuid));
> > +	(*newcred)->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, fsgid));
> > +
> > +	/* clear all caps to prevent bypassing capable() checks */
> > +	cap_clear((*newcred)->cap_bset);
> > +	cap_clear((*newcred)->cap_effective);
> > +	cap_clear((*newcred)->cap_inheritable);
> > +	cap_clear((*newcred)->cap_permitted);
> > +
> > +	put_cred(override_creds(*newcred));
> > +	return 0;
> > +}
> > +
> > +static inline void shiftfs_revert_ioctl_creds(const struct cred *oldcred,
> > +					      struct cred *newcred)
> > +{
> > +	return shiftfs_revert_object_creds(oldcred, newcred);
> > +}
> > +
> > +static long shiftfs_real_ioctl(struct file *file, unsigned int cmd,
> > +			       unsigned long arg)
> > +{
> > +	long ret = 0;
> > +	struct fd lowerfd;
> > +	struct cred *newcred;
> > +	const struct cred *oldcred;
> > +	struct super_block *sb = file->f_path.dentry->d_sb;
> > +
> > +	ret = shiftfs_real_fdget(file, &lowerfd);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = shiftfs_override_ioctl_creds(sb, &oldcred, &newcred);
> > +	if (ret)
> > +		goto out_fdput;
> > +
> > +	ret = vfs_ioctl(lowerfd.file, cmd, arg);
> > +
> > +	shiftfs_revert_ioctl_creds(oldcred, newcred);
> > +
> > +	shiftfs_copyattr(file_inode(lowerfd.file), file_inode(file));
> > +	shiftfs_copyflags(file_inode(lowerfd.file), file_inode(file));
> > +
> > +out_fdput:
> > +	fdput(lowerfd);
> > +
> > +	return ret;
> > +}
> > +
> > +static long shiftfs_ioctl(struct file *file, unsigned int cmd,
> > +			  unsigned long arg)
> > +{
> > +	switch (cmd) {
> > +	case FS_IOC_GETVERSION:
> > +		/* fall through */
> > +	case FS_IOC_GETFLAGS:
> > +		/* fall through */
> > +	case FS_IOC_SETFLAGS:
> > +		break;
> > +	default:
> > +		return -ENOTTY;
> > +	}
> > +
> > +	return shiftfs_real_ioctl(file, cmd, arg);
> > +}
> > +
> > +static long shiftfs_compat_ioctl(struct file *file, unsigned int cmd,
> > +				 unsigned long arg)
> > +{
> > +	switch (cmd) {
> > +	case FS_IOC32_GETVERSION:
> > +		/* fall through */
> > +	case FS_IOC32_GETFLAGS:
> > +		/* fall through */
> > +	case FS_IOC32_SETFLAGS:
> > +		break;
> > +	default:
> > +		return -ENOIOCTLCMD;
> > +	}
> > +
> > +	return shiftfs_real_ioctl(file, cmd, arg);
> > +}
> > +
> > +enum shiftfs_copyop {
> > +	SHIFTFS_COPY,
> > +	SHIFTFS_CLONE,
> > +	SHIFTFS_DEDUPE,
> > +};
> > +
> > +static ssize_t shiftfs_copyfile(struct file *file_in, loff_t pos_in,
> > +				struct file *file_out, loff_t pos_out, u64 len,
> > +				unsigned int flags, enum shiftfs_copyop op)
> > +{
> > +	ssize_t ret;
> > +	struct fd real_in, real_out;
> > +	const struct cred *oldcred;
> > +	struct inode *inode_out = file_inode(file_out);
> > +	struct inode *loweri = inode_out->i_private;
> > +
> > +	ret = shiftfs_real_fdget(file_out, &real_out);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = shiftfs_real_fdget(file_in, &real_in);
> > +	if (ret) {
> > +		fdput(real_out);
> > +		return ret;
> > +	}
> > +
> > +	oldcred = shiftfs_override_creds(inode_out->i_sb);
> > +	switch (op) {
> > +	case SHIFTFS_COPY:
> > +		ret = vfs_copy_file_range(real_in.file, pos_in, real_out.file,
> > +					  pos_out, len, flags);
> > +		break;
> > +
> > +	case SHIFTFS_CLONE:
> > +		ret = vfs_clone_file_range(real_in.file, pos_in, real_out.file,
> > +					   pos_out, len, flags);
> > +		break;
> > +
> > +	case SHIFTFS_DEDUPE:
> > +		ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
> > +						real_out.file, pos_out, len,
> > +						flags);
> > +		break;
> > +	}
> > +	revert_creds(oldcred);
> > +
> > +	/* Update size */
> > +	shiftfs_copyattr(loweri, inode_out);
> > +
> > +	fdput(real_in);
> > +	fdput(real_out);
> > +
> > +	return ret;
> > +}
> > +
> > +static ssize_t shiftfs_copy_file_range(struct file *file_in, loff_t pos_in,
> > +				       struct file *file_out, loff_t pos_out,
> > +				       size_t len, unsigned int flags)
> > +{
> > +	return shiftfs_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
> > +				SHIFTFS_COPY);
> > +}
> > +
> > +static loff_t shiftfs_remap_file_range(struct file *file_in, loff_t pos_in,
> > +				       struct file *file_out, loff_t pos_out,
> > +				       loff_t len, unsigned int remap_flags)
> > +{
> > +	enum shiftfs_copyop op;
> > +
> > +	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
> > +		return -EINVAL;
> > +
> > +	if (remap_flags & REMAP_FILE_DEDUP)
> > +		op = SHIFTFS_DEDUPE;
> > +	else
> > +		op = SHIFTFS_CLONE;
> > +
> > +	return shiftfs_copyfile(file_in, pos_in, file_out, pos_out, len,
> > +				remap_flags, op);
> > +}
> > +
> > +static int shiftfs_iterate_shared(struct file *file, struct dir_context *ctx)
> > +{
> > +	const struct cred *oldcred;
> > +	int err = -ENOTDIR;
> > +	struct shiftfs_file_info *file_info = file->private_data;
> > +	struct file *realfile = file_info->realfile;
> > +
> > +	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
> > +	err = iterate_dir(realfile, ctx);
> > +	revert_creds(oldcred);
> > +
> > +	return err;
> > +}
> > +
> > +const struct file_operations shiftfs_file_operations = {
> > +	.open			= shiftfs_open,
> > +	.release		= shiftfs_release,
> > +	.llseek			= shiftfs_llseek,
> > +	.read_iter		= shiftfs_read_iter,
> > +	.write_iter		= shiftfs_write_iter,
> > +	.fsync			= shiftfs_fsync,
> > +	.mmap			= shiftfs_mmap,
> > +	.fallocate		= shiftfs_fallocate,
> > +	.fadvise		= shiftfs_fadvise,
> > +	.unlocked_ioctl		= shiftfs_ioctl,
> > +	.compat_ioctl		= shiftfs_compat_ioctl,
> > +	.copy_file_range	= shiftfs_copy_file_range,
> > +	.remap_file_range	= shiftfs_remap_file_range,
> > +};
> > +
> > +const struct file_operations shiftfs_dir_operations = {
> > +	.compat_ioctl		= shiftfs_compat_ioctl,
> > +	.fsync			= shiftfs_fsync,
> > +	.iterate_shared		= shiftfs_iterate_shared,
> > +	.llseek			= shiftfs_llseek,
> > +	.open			= shiftfs_open,
> > +	.read			= generic_read_dir,
> > +	.release		= shiftfs_release,
> > +	.unlocked_ioctl		= shiftfs_ioctl,
> > +};
> > +
> > +static const struct address_space_operations shiftfs_aops = {
> > +	/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
> > +	.direct_IO	= noop_direct_IO,
> > +};
> > +
> > +static void shiftfs_fill_inode(struct inode *inode, unsigned long ino,
> > +			       umode_t mode, dev_t dev, struct dentry *dentry)
> > +{
> > +	struct inode *loweri;
> > +
> > +	inode->i_ino = ino;
> > +	inode->i_flags |= S_NOCMTIME;
> > +
> > +	mode &= S_IFMT;
> > +	inode->i_mode = mode;
> > +	switch (mode & S_IFMT) {
> > +	case S_IFDIR:
> > +		inode->i_op = &shiftfs_dir_inode_operations;
> > +		inode->i_fop = &shiftfs_dir_operations;
> > +		break;
> > +	case S_IFLNK:
> > +		inode->i_op = &shiftfs_symlink_inode_operations;
> > +		break;
> > +	case S_IFREG:
> > +		inode->i_op = &shiftfs_file_inode_operations;
> > +		inode->i_fop = &shiftfs_file_operations;
> > +		inode->i_mapping->a_ops = &shiftfs_aops;
> > +		break;
> > +	default:
> > +		inode->i_op = &shiftfs_special_inode_operations;
> > +		init_special_inode(inode, mode, dev);
> > +		break;
> > +	}
> > +
> > +	if (!dentry)
> > +		return;
> > +
> > +	loweri = dentry->d_inode;
> > +	if (!loweri->i_op->get_link)
> > +		inode->i_opflags |= IOP_NOFOLLOW;
> > +
> > +	shiftfs_copyattr(loweri, inode);
> > +	shiftfs_copyflags(loweri, inode);
> > +	set_nlink(inode, loweri->i_nlink);
> > +}
> > +
> > +static int shiftfs_show_options(struct seq_file *m, struct dentry *dentry)
> > +{
> > +	struct super_block *sb = dentry->d_sb;
> > +	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
> > +
> > +	if (sbinfo->mark)
> > +		seq_show_option(m, "mark", NULL);
> > +
> > +	if (sbinfo->passthrough)
> > +		seq_printf(m, ",passthrough=%u", sbinfo->passthrough);
> > +
> > +	return 0;
> > +}
> > +
> > +static int shiftfs_statfs(struct dentry *dentry, struct kstatfs *buf)
> > +{
> > +	struct super_block *sb = dentry->d_sb;
> > +	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
> > +	struct dentry *root = sb->s_root;
> > +	struct dentry *realroot = root->d_fsdata;
> > +	struct path realpath = { .mnt = sbinfo->mnt, .dentry = realroot };
> > +	int err;
> > +
> > +	err = vfs_statfs(&realpath, buf);
> >  	if (err)
> > -		goto out;
> > +		return err;
> >  
> > -	/* to mark a mount point, must be real root */
> > -	if (ssi->mark && !capable(CAP_SYS_ADMIN))
> > -		goto out;
> > +	if (!shiftfs_passthrough_statfs(sbinfo))
> > +		buf->f_type = sb->s_magic;
> >  
> > -	/* else to mount a mark, must be userns admin */
> > -	if (!ssi->mark && !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> > -		goto out;
> > +	return 0;
> > +}
> >  
> > -	err = kern_path(name, LOOKUP_FOLLOW, &path);
> > +static void shiftfs_evict_inode(struct inode *inode)
> > +{
> > +	struct inode *loweri = inode->i_private;
> > +
> > +	clear_inode(inode);
> > +
> > +	if (loweri)
> > +		iput(loweri);
> > +}
> > +
> > +static void shiftfs_put_super(struct super_block *sb)
> > +{
> > +	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
> > +
> > +	if (sbinfo) {
> > +		mntput(sbinfo->mnt);
> > +		put_cred(sbinfo->creator_cred);
> > +		kfree(sbinfo);
> > +	}
> > +}
> > +
> > +static const struct xattr_handler shiftfs_xattr_handler = {
> > +	.prefix = "",
> > +	.get    = shiftfs_xattr_get,
> > +	.set    = shiftfs_xattr_set,
> > +};
> > +
> > +const struct xattr_handler *shiftfs_xattr_handlers[] = {
> > +#ifdef CONFIG_SHIFT_FS_POSIX_ACL
> > +	&shiftfs_posix_acl_access_xattr_handler,
> > +	&shiftfs_posix_acl_default_xattr_handler,
> > +#endif
> > +	&shiftfs_xattr_handler,
> > +	NULL
> > +};
> > +
> > +static inline bool passthrough_is_subset(int old_flags, int new_flags)
> > +{
> > +	if ((new_flags & old_flags) != new_flags)
> > +		return false;
> > +
> > +	return true;
> > +}
> > +
> > +static int shiftfs_remount(struct super_block *sb, int *flags, char *data)
> > +{
> > +	int err;
> > +	struct shiftfs_super_info new = {};
> > +	struct shiftfs_super_info *info = sb->s_fs_info;
> > +
> > +	err = shiftfs_parse_mount_options(&new, data);
> >  	if (err)
> > -		goto out;
> > +		return err;
> > +
> > +	/* Mark mount option cannot be changed. */
> > +	if (info->mark || (info->mark != new.mark))
> > +		return -EPERM;
> > +
> > +	if (info->passthrough != new.passthrough) {
> > +		/* Don't allow exceeding passthrough options of mark mount. */
> > +		if (!passthrough_is_subset(info->info_mark->passthrough,
> > +					   info->passthrough))
> > +			return -EPERM;
> > +
> > +		info->passthrough = new.passthrough;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static const struct super_operations shiftfs_super_ops = {
> > +	.put_super	= shiftfs_put_super,
> > +	.show_options	= shiftfs_show_options,
> > +	.statfs		= shiftfs_statfs,
> > +	.remount_fs	= shiftfs_remount,
> > +	.evict_inode	= shiftfs_evict_inode,
> > +};
> > +
> > +struct shiftfs_data {
> > +	void *data;
> > +	const char *path;
> > +};
> > +
> > +static int shiftfs_fill_super(struct super_block *sb, void *raw_data,
> > +			      int silent)
> > +{
> > +	int err;
> > +	struct path path = {};
> > +	struct shiftfs_super_info *sbinfo_mp;
> > +	char *name = NULL;
> > +	struct inode *inode = NULL;
> > +	struct dentry *dentry = NULL;
> > +	struct shiftfs_data *data = raw_data;
> > +	struct shiftfs_super_info *sbinfo = NULL;
> > +
> > +	if (!data->path)
> > +		return -EINVAL;
> >  
> > -	err = -EPERM;
> > +	sb->s_fs_info = kzalloc(sizeof(*sbinfo), GFP_KERNEL);
> > +	if (!sb->s_fs_info)
> > +		return -ENOMEM;
> > +	sbinfo = sb->s_fs_info;
> > +
> > +	err = shiftfs_parse_mount_options(sbinfo, data->data);
> > +	if (err)
> > +		return err;
> > +
> > +	/* to mount a mark, must be userns admin */
> > +	if (!sbinfo->mark && !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> > +		return -EPERM;
> > +
> > +	name = kstrdup(data->path, GFP_KERNEL);
> > +	if (!name)
> > +		return -ENOMEM;
> > +
> > +	err = kern_path(name, LOOKUP_FOLLOW, &path);
> > +	if (err)
> > +		goto out_free_name;
> >  
> >  	if (!S_ISDIR(path.dentry->d_inode->i_mode)) {
> >  		err = -ENOTDIR;
> > -		goto out_put;
> > +		goto out_put_path;
> >  	}
> >  
> > -	sb->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
> > -	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
> > -		printk(KERN_ERR "shiftfs: maximum stacking depth exceeded\n");
> > -		err = -EINVAL;
> > -		goto out_put;
> > -	}
> > +	if (sbinfo->mark) {
> > +		struct super_block *lower_sb = path.mnt->mnt_sb;
> > +
> > +		/* to mark a mount point, must be root wrt lower s_user_ns */
> > +		if (!ns_capable(lower_sb->s_user_ns, CAP_SYS_ADMIN)) {
> > +			err = -EPERM;
> > +			goto out_put_path;
> > +		}
> >  
> > -	if (ssi->mark) {
> >  		/*
> >  		 * this part is visible unshifted, so make sure no
> >  		 * executables that could be used to give suid
> >  		 * privileges
> >  		 */
> >  		sb->s_iflags = SB_I_NOEXEC;
> > -		ssi->mnt = path.mnt;
> > -		dentry = path.dentry;
> > -	} else {
> > -		struct shiftfs_super_info *mp_ssi;
> >  
> >  		/*
> > -		 * this leg executes if we're admin capable in
> > -		 * the namespace, so be very careful
> > +		 * Handle nesting of shiftfs mounts by referring this mark
> > +		 * mount back to the original mark mount. This is more
> > +		 * efficient and alleviates concerns about stack depth.
> >  		 */
> > +		if (lower_sb->s_magic == SHIFTFS_MAGIC) {
> > +			sbinfo_mp = lower_sb->s_fs_info;
> > +
> > +			/* Doesn't make sense to mark a mark mount */
> > +			if (sbinfo_mp->mark) {
> > +				err = -EINVAL;
> > +				goto out_put_path;
> > +			}
> > +
> > +			if (!passthrough_is_subset(sbinfo_mp->passthrough,
> > +						   sbinfo->passthrough)) {
> > +				err = -EPERM;
> > +				goto out_put_path;
> > +			}
> > +
> > +			sbinfo->mnt = mntget(sbinfo_mp->mnt);
> > +			dentry = dget(path.dentry->d_fsdata);
> > +		} else {
> > +			sbinfo->mnt = mntget(path.mnt);
> > +			dentry = dget(path.dentry);
> > +		}
> > +
> > +		sbinfo->creator_cred = prepare_creds();
> > +		if (!sbinfo->creator_cred) {
> > +			err = -ENOMEM;
> > +			goto out_put_path;
> > +		}
> > +	} else {
> > +		/*
> > +		 * This leg executes if we're admin capable in the namespace,
> > +		 * so be very careful.
> > +		 */
> > +		err = -EPERM;
> >  		if (path.dentry->d_sb->s_magic != SHIFTFS_MAGIC)
> > -			goto out_put;
> > -		mp_ssi = path.dentry->d_sb->s_fs_info;
> > -		if (!mp_ssi->mark)
> > -			goto out_put;
> > -		ssi->mnt = mntget(mp_ssi->mnt);
> > +			goto out_put_path;
> > +
> > +		sbinfo_mp = path.dentry->d_sb->s_fs_info;
> > +		if (!sbinfo_mp->mark)
> > +			goto out_put_path;
> > +
> > +		if (!passthrough_is_subset(sbinfo_mp->passthrough,
> > +					   sbinfo->passthrough))
> > +			goto out_put_path;
> > +
> > +		sbinfo->mnt = mntget(sbinfo_mp->mnt);
> > +		sbinfo->creator_cred = get_cred(sbinfo_mp->creator_cred);
> >  		dentry = dget(path.dentry->d_fsdata);
> > -		path_put(&path);
> > +		sbinfo->info_mark = sbinfo_mp;
> > +	}
> > +
> > +	sb->s_stack_depth = dentry->d_sb->s_stack_depth + 1;
> > +	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
> > +		printk(KERN_ERR "shiftfs: maximum stacking depth exceeded\n");
> > +		err = -EINVAL;
> > +		goto out_put_path;
> > +	}
> > +
> > +	inode = new_inode(sb);
> > +	if (!inode) {
> > +		err = -ENOMEM;
> > +		goto out_put_path;
> >  	}
> > -	ssi->userns = get_user_ns(dentry->d_sb->s_user_ns);
> > -	sb->s_fs_info = ssi;
> > +	shiftfs_fill_inode(inode, dentry->d_inode->i_ino, S_IFDIR, 0, dentry);
> > +
> > +	ihold(dentry->d_inode);
> > +	inode->i_private = dentry->d_inode;
> > +
> >  	sb->s_magic = SHIFTFS_MAGIC;
> >  	sb->s_op = &shiftfs_super_ops;
> >  	sb->s_xattr = shiftfs_xattr_handlers;
> >  	sb->s_d_op = &shiftfs_dentry_ops;
> > -	sb->s_root = d_make_root(shiftfs_new_inode(sb, S_IFDIR, dentry));
> > +	sb->s_flags |= SB_POSIXACL;
> > +	sb->s_root = d_make_root(inode);
> > +	if (!sb->s_root) {
> > +		err = -ENOMEM;
> > +		goto out_put_path;
> > +	}
> > +
> >  	sb->s_root->d_fsdata = dentry;
> > +	sbinfo->userns = get_user_ns(dentry->d_sb->s_user_ns);
> > +	shiftfs_copyattr(dentry->d_inode, sb->s_root->d_inode);
> >  
> > -	return 0;
> > +	dentry = NULL;
> > +	err = 0;
> >  
> > - out_put:
> > +out_put_path:
> >  	path_put(&path);
> > - out:
> > +
> > +out_free_name:
> >  	kfree(name);
> > -	kfree(ssi);
> > +
> > +	dput(dentry);
> > +
> >  	return err;
> >  }
> >  
> > @@ -764,17 +1864,26 @@ static struct file_system_type shiftfs_type = {
> >  
> >  static int __init shiftfs_init(void)
> >  {
> > +	shiftfs_file_info_cache = kmem_cache_create(
> > +		"shiftfs_file_info_cache", sizeof(struct shiftfs_file_info), 0,
> > +		SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | SLAB_MEM_SPREAD, NULL);
> > +	if (!shiftfs_file_info_cache)
> > +		return -ENOMEM;
> > +
> >  	return register_filesystem(&shiftfs_type);
> >  }
> >  
> >  static void __exit shiftfs_exit(void)
> >  {
> >  	unregister_filesystem(&shiftfs_type);
> > +	kmem_cache_destroy(shiftfs_file_info_cache);
> >  }
> >  
> >  MODULE_ALIAS_FS("shiftfs");
> >  MODULE_AUTHOR("James Bottomley");
> > -MODULE_DESCRIPTION("uid/gid shifting bind filesystem");
> > +MODULE_AUTHOR("Seth Forshee <seth.forshee at canonical.com>");
> > +MODULE_AUTHOR("Christian Brauner <christian.brauner at ubuntu.com>");
> > +MODULE_DESCRIPTION("id shifting filesystem");
> >  MODULE_LICENSE("GPL v2");
> >  module_init(shiftfs_init)
> >  module_exit(shiftfs_exit)
> > -- 
> > 2.20.1
> > 
> > 
> > -- 
> > kernel-team mailing list
> > kernel-team at lists.ubuntu.com
> > https://lists.ubuntu.com/mailman/listinfo/kernel-team



More information about the kernel-team mailing list