[PATCH 2/4][DISCO] shiftfs: rework and extend

Christian Brauner christian at brauner.io
Wed Mar 27 14:11:26 UTC 2019


From: Christian Brauner <christian at brauner.io>

/* Introduction */
The shiftfs filesystem is implemented as a stacking filesystem. Since it is
a stacking filesystem it shares concepts with overlayfs and ecryptfs.
Usually, shiftfs will be stacked upon another filesystem. The filesystem on
top - shiftfs - is referred to as "upper filesystem" or "overlay" and the
filesystem it is stacked upon is referred to as "lower filesystem" or
"underlay".

/* Marked and Unmarked shiftfs mounts */
To use shiftfs it is necessary that a given mount is marked as shiftable via
the "mark" mount option. Any shiftfs mount without the "mark" mount option that
is not stacked on top of a shiftfs mount with the "mark" mount option will be
refused with EPERM.
After a marked shiftfs mount has been performed other shiftfs mounts
referencing the marked shiftfs mount can be created. These secondary shiftfs
mounts are usually what are of interest.
The marked shiftfs mount will take a reference to the underlying mountpoint of
the directory it is marking as shiftable. Any unmarked shiftfs mounts
referencing this marked shiftfs mount will take a second reference to this
directory as well. This ensures that the underlying marked shiftfs mount can be
unmounted thereby dropping the reference to the underlying directory without
invalidating the mountpoint of said directory since the non-marked shiftfs
mount still holds another reference to it.

/* Stacking Depth */
Shiftfs tries to keep the stack as flat as possible to avoid hitting the
kernel enforced filesystem stacking limit.

/* Permission Model */
When the marked shiftfs mount is created, shiftfs will record the credentials
of the creator of the super block and stash them in the super block. When other
non-marked shiftfs mounts are created that reference the marked shiftfs mount,
they will stash another reference to the creator's credentials. Before calling
into the underlying filesystem, shiftfs will switch to the creator's credentials
and revert to the original credentials after the underlying filesystem
operation returns.

/* Mount Options */
- mark
  When set the mark mount option indicates that the mount in question is
  allowed to be shifted. Since shiftfs is mountable by user namespace root in a
  non-initial user namespace, this mount option ensures that the system
  administrator has decided that the marked mount is safe to be shifted.
  To mark a mount as shiftable CAP_SYS_ADMIN in the user namespace is required.
- passthrough={0,1,2,3}
  This mount option functions as a bitmask. When set to a non-zero value
  shiftfs will try to act as an invisible shim sitting on top of the
  underlying filesystem.
  - 1: Shiftfs will report the filesystem type of the underlay for stat-like
       system calls.
  - 2: Shiftfs will passthrough whitelisted ioctl() to the underlay.
  - 3: Shiftfs will both use 1 and 2.
Note that mount options on a marked mount cannot be changed.

/* Extended Attributes */
Shiftfs will make sure to translate extended attributes.

/* Inode Numbers */
Shiftfs inode numbers are copied up from the underlying filesystem, i.e.
shiftfs inode numbers will be identical to the corresponding underlying
filesystem's inode numbers. This has the advantage that inotify and friends
should work out of the box.
(In essence, shiftfs is nothing but a 1:1 mirror of the underlying filesystem's
 dentries and inodes.)

/* Device Support */
Shiftfs only supports the creation of pipe (FIFO) and socket special files.
Character and block devices cannot be created through shiftfs.

Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
Signed-off-by: Seth Forshee <seth.forshee at canonical.com>
---
 fs/Kconfig   |   10 +
 fs/shiftfs.c | 1847 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 1488 insertions(+), 369 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 22e2e0feba0c..7b9ba073d403 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -121,6 +121,16 @@ config SHIFT_FS
 	  unprivileged containers can use this to mount root volumes
 	  using this technique.
 
+config SHIFT_FS_POSIX_ACL
+	bool "shiftfs POSIX Access Control Lists"
+	depends on SHIFT_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  If you don't know what Access Control Lists are, say N.
+
 menu "Caches"
 
 source "fs/fscache/Kconfig"
diff --git a/fs/shiftfs.c b/fs/shiftfs.c
index f7cada126daa..234af4e31736 100644
--- a/fs/shiftfs.c
+++ b/fs/shiftfs.c
@@ -1,3 +1,4 @@
+#include <linux/capability.h>
 #include <linux/cred.h>
 #include <linux/mount.h>
 #include <linux/file.h>
@@ -7,83 +8,179 @@
 #include <linux/kernel.h>
 #include <linux/magic.h>
 #include <linux/parser.h>
+#include <linux/security.h>
 #include <linux/seq_file.h>
 #include <linux/statfs.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
 #include <linux/uidgid.h>
 #include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
 
 struct shiftfs_super_info {
 	struct vfsmount *mnt;
 	struct user_namespace *userns;
+	/* creds of process who created the super block */
+	const struct cred *creator_cred;
 	bool mark;
+	unsigned int passthrough;
+	struct shiftfs_super_info *info_mark;
 };
 
-static struct inode *shiftfs_new_inode(struct super_block *sb, umode_t mode,
-				       struct dentry *dentry);
+struct shiftfs_file_info {
+	struct path realpath;
+	struct file *realfile;
+};
+
+struct kmem_cache *shiftfs_file_info_cache;
+
+static void shiftfs_fill_inode(struct inode *inode, unsigned long ino,
+			       umode_t mode, dev_t dev, struct dentry *dentry);
+
+#define SHIFTFS_PASSTHROUGH_NONE 0
+#define SHIFTFS_PASSTHROUGH_STAT 1
+#define SHIFTFS_PASSTHROUGH_ALL (SHIFTFS_PASSTHROUGH_STAT)
+
+static inline bool shiftfs_passthrough_statfs(struct shiftfs_super_info *info)
+{
+	if (!(info->passthrough & SHIFTFS_PASSTHROUGH_STAT))
+		return false;
+
+	if (info->info_mark &&
+	    !(info->info_mark->passthrough & SHIFTFS_PASSTHROUGH_STAT))
+		return false;
+
+	return true;
+}
 
 enum {
 	OPT_MARK,
+	OPT_PASSTHROUGH,
 	OPT_LAST,
 };
 
 /* global filesystem options */
 static const match_table_t tokens = {
 	{ OPT_MARK, "mark" },
+	{ OPT_PASSTHROUGH, "passthrough=%u" },
 	{ OPT_LAST, NULL }
 };
 
-static const struct cred *shiftfs_get_up_creds(struct super_block *sb)
+static const struct cred *shiftfs_override_creds(const struct super_block *sb)
 {
-	struct shiftfs_super_info *ssi = sb->s_fs_info;
-	struct cred *cred = prepare_creds();
+	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
 
-	if (!cred)
-		return NULL;
+	return override_creds(sbinfo->creator_cred);
+}
+
+static inline void shiftfs_revert_object_creds(const struct cred *oldcred,
+					       struct cred *newcred)
+{
+	revert_creds(oldcred);
+	put_cred(newcred);
+}
+
+static int shiftfs_override_object_creds(const struct super_block *sb,
+					 const struct cred **oldcred,
+					 struct cred **newcred,
+					 struct dentry *dentry, umode_t mode,
+					 bool hardlink)
+{
+	kuid_t fsuid = current_fsuid();
+	kgid_t fsgid = current_fsgid();
+
+	*oldcred = shiftfs_override_creds(sb);
+
+	*newcred = prepare_creds();
+	if (!*newcred) {
+		revert_creds(*oldcred);
+		return -ENOMEM;
+	}
+
+	(*newcred)->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, fsuid));
+	(*newcred)->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, fsgid));
+
+	if (!hardlink) {
+		int err = security_dentry_create_files_as(dentry, mode,
+							  &dentry->d_name,
+							  *oldcred, *newcred);
+		if (err) {
+			shiftfs_revert_object_creds(*oldcred, *newcred);
+			return err;
+		}
+	}
 
-	cred->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, cred->fsuid));
-	cred->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, cred->fsgid));
-	put_user_ns(cred->user_ns);
-	cred->user_ns = get_user_ns(ssi->userns);
+	put_cred(override_creds(*newcred));
+	return 0;
+}
 
-	return cred;
+static kuid_t shift_kuid(struct user_namespace *from, struct user_namespace *to,
+			 kuid_t kuid)
+{
+	uid_t uid = from_kuid(from, kuid);
+	return make_kuid(to, uid);
 }
 
-static const struct cred *shiftfs_new_creds(const struct cred **newcred,
-					    struct super_block *sb)
+static kgid_t shift_kgid(struct user_namespace *from, struct user_namespace *to,
+			 kgid_t kgid)
 {
-	const struct cred *cred = shiftfs_get_up_creds(sb);
+	gid_t gid = from_kgid(from, kgid);
+	return make_kgid(to, gid);
+}
 
-	*newcred = cred;
+static void shiftfs_copyattr(struct inode *from, struct inode *to)
+{
+	struct user_namespace *from_ns = from->i_sb->s_user_ns;
+	struct user_namespace *to_ns = to->i_sb->s_user_ns;
+
+	to->i_uid = shift_kuid(from_ns, to_ns, from->i_uid);
+	to->i_gid = shift_kgid(from_ns, to_ns, from->i_gid);
+	to->i_mode = from->i_mode;
+	to->i_atime = from->i_atime;
+	to->i_mtime = from->i_mtime;
+	to->i_ctime = from->i_ctime;
+	i_size_write(to, i_size_read(from));
+}
 
-	if (cred)
-		cred = override_creds(cred);
-	else
-		printk(KERN_ERR "shiftfs: Credential override failed: no memory\n");
+static void shiftfs_copyflags(struct inode *from, struct inode *to)
+{
+	unsigned int mask = S_SYNC | S_IMMUTABLE | S_APPEND | S_NOATIME;
 
-	return cred;
+	inode_set_flags(to, from->i_flags & mask, mask);
 }
 
-static void shiftfs_old_creds(const struct cred *oldcred,
-			      const struct cred **newcred)
+static void shiftfs_file_accessed(struct file *file)
 {
-	if (!*newcred)
+	struct inode *upperi, *loweri;
+
+	if (file->f_flags & O_NOATIME)
 		return;
 
-	revert_creds(oldcred);
-	put_cred(*newcred);
+	upperi = file_inode(file);
+	loweri = upperi->i_private;
+
+	if (!loweri)
+		return;
+
+	upperi->i_mtime = loweri->i_mtime;
+	upperi->i_ctime = loweri->i_ctime;
+
+	touch_atime(&file->f_path);
 }
 
-static int shiftfs_parse_options(struct shiftfs_super_info *ssi, char *options)
+static int shiftfs_parse_mount_options(struct shiftfs_super_info *sbinfo,
+				       char *options)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
 
-	ssi->mark = false;
+	sbinfo->mark = false;
+	sbinfo->passthrough = 0;
 
 	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
+		int err, intarg, token;
 
 		if (!*p)
 			continue;
@@ -91,121 +188,140 @@ static int shiftfs_parse_options(struct shiftfs_super_info *ssi, char *options)
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case OPT_MARK:
-			ssi->mark = true;
+			sbinfo->mark = true;
+			break;
+		case OPT_PASSTHROUGH:
+			err = match_int(&args[0], &intarg);
+			if (err)
+				return err;
+
+			if (intarg & ~SHIFTFS_PASSTHROUGH_ALL)
+				return -EINVAL;
+
+			sbinfo->passthrough = intarg;
 			break;
 		default:
 			return -EINVAL;
 		}
 	}
+
 	return 0;
 }
 
 static void shiftfs_d_release(struct dentry *dentry)
 {
-	struct dentry *real = dentry->d_fsdata;
+	struct dentry *lowerd = dentry->d_fsdata;
 
-	dput(real);
+	if (lowerd)
+		dput(lowerd);
 }
 
 static struct dentry *shiftfs_d_real(struct dentry *dentry,
 				     const struct inode *inode)
 {
-	struct dentry *real = dentry->d_fsdata;
+	struct dentry *lowerd = dentry->d_fsdata;
+
+	if (inode && d_inode(dentry) == inode)
+		return dentry;
 
-	if (unlikely(real->d_flags & DCACHE_OP_REAL))
-		return real->d_op->d_real(real, real->d_inode);
+	lowerd = d_real(lowerd, inode);
+	if (lowerd && (!inode || inode == d_inode(lowerd)))
+		return lowerd;
 
-	return real;
+	WARN(1, "shiftfs_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
+	     inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
+	return dentry;
 }
 
 static int shiftfs_d_weak_revalidate(struct dentry *dentry, unsigned int flags)
 {
-	struct dentry *real = dentry->d_fsdata;
+	int err = 1;
+	struct dentry *lowerd = dentry->d_fsdata;
 
-	if (d_unhashed(real))
+	if (d_is_negative(lowerd) != d_is_negative(dentry))
 		return 0;
 
-	if (!(real->d_flags & DCACHE_OP_WEAK_REVALIDATE))
-		return 1;
+	if ((lowerd->d_flags & DCACHE_OP_WEAK_REVALIDATE))
+		err = lowerd->d_op->d_weak_revalidate(lowerd, flags);
 
-	return real->d_op->d_weak_revalidate(real, flags);
+	if (d_really_is_positive(dentry)) {
+		struct inode *inode = d_inode(dentry);
+		struct inode *loweri = d_inode(lowerd);
+
+		shiftfs_copyattr(loweri, inode);
+		if (!inode->i_nlink)
+			err = 0;
+	}
+
+	return err;
 }
 
 static int shiftfs_d_revalidate(struct dentry *dentry, unsigned int flags)
 {
-	struct dentry *real = dentry->d_fsdata;
-	int ret;
+	int err = 1;
+	struct dentry *lowerd = dentry->d_fsdata;
 
-	if (d_unhashed(real))
+	if (d_unhashed(lowerd) ||
+	    ((d_is_negative(lowerd) != d_is_negative(dentry))))
 		return 0;
 
-	/*
-	 * inode state of underlying changed from positive to negative
-	 * or vice versa; force a lookup to update our view
-	 */
-	if (d_is_negative(real) != d_is_negative(dentry))
-		return 0;
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
 
-	if (!(real->d_flags & DCACHE_OP_REVALIDATE))
-		return 1;
+	if ((lowerd->d_flags & DCACHE_OP_REVALIDATE))
+		err = lowerd->d_op->d_revalidate(lowerd, flags);
 
-	ret = real->d_op->d_revalidate(real, flags);
+	if (d_really_is_positive(dentry)) {
+		struct inode *inode = d_inode(dentry);
+		struct inode *loweri = d_inode(lowerd);
 
-	if (ret == 0 && !(flags & LOOKUP_RCU))
-		d_invalidate(real);
+		shiftfs_copyattr(loweri, inode);
+		if (!inode->i_nlink)
+			err = 0;
+	}
 
-	return ret;
+	return err;
 }
 
 static const struct dentry_operations shiftfs_dentry_ops = {
-	.d_release	= shiftfs_d_release,
-	.d_real		= shiftfs_d_real,
-	.d_revalidate	= shiftfs_d_revalidate,
+	.d_release	   = shiftfs_d_release,
+	.d_real		   = shiftfs_d_real,
+	.d_revalidate	   = shiftfs_d_revalidate,
 	.d_weak_revalidate = shiftfs_d_weak_revalidate,
 };
 
-static int shiftfs_readlink(struct dentry *dentry, char __user *data,
-			    int flags)
-{
-	struct dentry *real = dentry->d_fsdata;
-	const struct inode_operations *iop = real->d_inode->i_op;
-
-	if (iop->readlink)
-		return iop->readlink(real, data, flags);
-
-	return -EINVAL;
-}
-
 static const char *shiftfs_get_link(struct dentry *dentry, struct inode *inode,
 				    struct delayed_call *done)
 {
-	if (dentry) {
-		struct dentry *real = dentry->d_fsdata;
-		struct inode *reali = real->d_inode;
-		const struct inode_operations *iop = reali->i_op;
-		const char *res = ERR_PTR(-EPERM);
-
-		if (iop->get_link)
-			res = iop->get_link(real, reali, done);
+	const char *p;
+	const struct cred *oldcred;
+	struct dentry *lowerd;
 
-		return res;
-	} else {
-		/* RCU lookup not supported */
+	/* RCU lookup not supported */
+	if (!dentry)
 		return ERR_PTR(-ECHILD);
-	}
+
+	lowerd = dentry->d_fsdata;
+	oldcred = shiftfs_override_creds(dentry->d_sb);
+	p = vfs_get_link(lowerd, done);
+	revert_creds(oldcred);
+
+	return p;
 }
 
 static int shiftfs_setxattr(struct dentry *dentry, struct inode *inode,
 			    const char *name, const void *value,
 			    size_t size, int flags)
 {
-	struct dentry *real = dentry->d_fsdata;
-	int err = -EOPNOTSUPP;
-	const struct cred *oldcred, *newcred;
+	struct dentry *lowerd = dentry->d_fsdata;
+	int err;
+	const struct cred *oldcred;
+
+	oldcred = shiftfs_override_creds(dentry->d_sb);
+	err = vfs_setxattr(lowerd, name, value, size, flags);
+	revert_creds(oldcred);
 
-	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-	err = vfs_setxattr(real, name, value, size, flags);
-	shiftfs_old_creds(oldcred, &newcred);
+	shiftfs_copyattr(lowerd->d_inode, inode);
 
 	return err;
 }
@@ -214,13 +330,13 @@ static int shiftfs_xattr_get(const struct xattr_handler *handler,
 			     struct dentry *dentry, struct inode *inode,
 			     const char *name, void *value, size_t size)
 {
-	struct dentry *real = dentry->d_fsdata;
+	struct dentry *lowerd = dentry->d_fsdata;
 	int err;
-	const struct cred *oldcred, *newcred;
+	const struct cred *oldcred;
 
-	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-	err = vfs_getxattr(real, name, value, size);
-	shiftfs_old_creds(oldcred, &newcred);
+	oldcred = shiftfs_override_creds(dentry->d_sb);
+	err = vfs_getxattr(lowerd, name, value, size);
+	revert_creds(oldcred);
 
 	return err;
 }
@@ -228,26 +344,29 @@ static int shiftfs_xattr_get(const struct xattr_handler *handler,
 static ssize_t shiftfs_listxattr(struct dentry *dentry, char *list,
 				 size_t size)
 {
-	struct dentry *real = dentry->d_fsdata;
+	struct dentry *lowerd = dentry->d_fsdata;
 	int err;
-	const struct cred *oldcred, *newcred;
+	const struct cred *oldcred;
 
-	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-	err = vfs_listxattr(real, list, size);
-	shiftfs_old_creds(oldcred, &newcred);
+	oldcred = shiftfs_override_creds(dentry->d_sb);
+	err = vfs_listxattr(lowerd, list, size);
+	revert_creds(oldcred);
 
 	return err;
 }
 
 static int shiftfs_removexattr(struct dentry *dentry, const char *name)
 {
-	struct dentry *real = dentry->d_fsdata;
+	struct dentry *lowerd = dentry->d_fsdata;
 	int err;
-	const struct cred *oldcred, *newcred;
+	const struct cred *oldcred;
+
+	oldcred = shiftfs_override_creds(dentry->d_sb);
+	err = vfs_removexattr(lowerd, name);
+	revert_creds(oldcred);
 
-	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-	err = vfs_removexattr(real, name);
-	shiftfs_old_creds(oldcred, &newcred);
+	/* update c/mtime */
+	shiftfs_copyattr(lowerd->d_inode, d_inode(dentry));
 
 	return err;
 }
@@ -262,93 +381,157 @@ static int shiftfs_xattr_set(const struct xattr_handler *handler,
 	return shiftfs_setxattr(dentry, inode, name, value, size, flags);
 }
 
-static void shiftfs_fill_inode(struct inode *inode, struct dentry *dentry)
+static int shiftfs_inode_test(struct inode *inode, void *data)
 {
-	struct inode *reali;
-
-	if (!dentry)
-		return;
-
-	reali = dentry->d_inode;
-
-	if (!reali->i_op->get_link)
-		inode->i_opflags |= IOP_NOFOLLOW;
+	return inode->i_private == data;
+}
 
-	inode->i_mapping = reali->i_mapping;
-	inode->i_private = dentry;
+static int shiftfs_inode_set(struct inode *inode, void *data)
+{
+	inode->i_private = data;
+	return 0;
 }
 
-static int shiftfs_make_object(struct inode *dir, struct dentry *dentry,
-			       umode_t mode, const char *symlink,
-			       struct dentry *hardlink, bool excl)
+static int shiftfs_create_object(struct inode *diri, struct dentry *dentry,
+				 umode_t mode, const char *symlink,
+				 struct dentry *hardlink, bool excl)
 {
-	struct dentry *real = dir->i_private, *new = dentry->d_fsdata;
-	struct inode *reali = real->d_inode, *newi;
-	const struct inode_operations *iop = reali->i_op;
 	int err;
-	const struct cred *oldcred, *newcred;
-	bool op_ok = false;
+	const struct cred *oldcred;
+	struct cred *newcred;
+	void *loweri_iop_ptr = NULL;
+	umode_t modei = mode;
+	struct super_block *dir_sb = diri->i_sb;
+	struct dentry *lowerd_new = dentry->d_fsdata;
+	struct inode *inode = NULL, *loweri_dir = diri->i_private;
+	const struct inode_operations *loweri_dir_iop = loweri_dir->i_op;
+	struct dentry *lowerd_link = NULL;
 
 	if (hardlink) {
-		op_ok = iop->link;
+		loweri_iop_ptr = loweri_dir_iop->link;
 	} else {
 		switch (mode & S_IFMT) {
 		case S_IFDIR:
-			op_ok = iop->mkdir;
+			loweri_iop_ptr = loweri_dir_iop->mkdir;
 			break;
 		case S_IFREG:
-			op_ok = iop->create;
+			loweri_iop_ptr = loweri_dir_iop->create;
 			break;
 		case S_IFLNK:
-			op_ok = iop->symlink;
+			loweri_iop_ptr = loweri_dir_iop->symlink;
+			break;
+		case S_IFSOCK:
+			/* fall through */
+		case S_IFIFO:
+			loweri_iop_ptr = loweri_dir_iop->mknod;
+			break;
 		}
 	}
-	if (!op_ok)
-		return -EINVAL;
+	if (!loweri_iop_ptr) {
+		err = -EINVAL;
+		goto out_iput;
+	}
 
+	inode_lock_nested(loweri_dir, I_MUTEX_PARENT);
 
-	newi = shiftfs_new_inode(dentry->d_sb, mode, NULL);
-	if (!newi)
-		return -ENOMEM;
+	if (!hardlink) {
+		inode = new_inode(dir_sb);
+		if (!inode) {
+			err = -ENOMEM;
+			goto out_iput;
+		}
+
+		/*
+		 * new_inode() will have added the new inode to the super
+		 * block's list of inodes. Further below we will call
+		 * inode_insert5() which would perform the same operation again
+		 * thereby corrupting the list. To avoid this raise I_CREATING
+		 * in i_state which will cause inode_insert5() to skip this
+		 * step. I_CREATING will be cleared by d_instantiate_new()
+		 * below.
+		 */
+		spin_lock(&inode->i_lock);
+		inode->i_state |= I_CREATING;
+		spin_unlock(&inode->i_lock);
 
-	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+		inode_init_owner(inode, diri, mode);
+		modei = inode->i_mode;
+	}
 
-	inode_lock_nested(reali, I_MUTEX_PARENT);
+	err = shiftfs_override_object_creds(dentry->d_sb, &oldcred, &newcred,
+					    dentry, modei, hardlink != NULL);
+	if (err)
+		goto out_iput;
 
-	err = -EINVAL;		/* shut gcc up about uninit var */
 	if (hardlink) {
-		struct dentry *realhardlink = hardlink->d_fsdata;
-
-		err = vfs_link(realhardlink, reali, new, NULL);
+		lowerd_link = hardlink->d_fsdata;
+		err = vfs_link(lowerd_link, loweri_dir, lowerd_new, NULL);
 	} else {
-		switch (mode & S_IFMT) {
+		switch (modei & S_IFMT) {
 		case S_IFDIR:
-			err = vfs_mkdir(reali, new, mode);
+			err = vfs_mkdir(loweri_dir, lowerd_new, modei);
 			break;
 		case S_IFREG:
-			err = vfs_create(reali, new, mode, excl);
+			err = vfs_create(loweri_dir, lowerd_new, modei, excl);
 			break;
 		case S_IFLNK:
-			err = vfs_symlink(reali, new, symlink);
+			err = vfs_symlink(loweri_dir, lowerd_new, symlink);
+			break;
+		case S_IFSOCK:
+			/* fall through */
+		case S_IFIFO:
+			err = vfs_mknod(loweri_dir, lowerd_new, modei, 0);
+			break;
+		default:
+			err = -EINVAL;
+			break;
 		}
 	}
 
-	shiftfs_old_creds(oldcred, &newcred);
+	shiftfs_revert_object_creds(oldcred, newcred);
 
+	if (!err && WARN_ON(!lowerd_new->d_inode))
+		err = -EIO;
 	if (err)
-		goto out_dput;
+		goto out_iput;
+
+	if (hardlink) {
+		inode = d_inode(hardlink);
+		ihold(inode);
+
+		/* copy up times from lower inode */
+		shiftfs_copyattr(d_inode(lowerd_link), inode);
+		set_nlink(d_inode(hardlink), d_inode(lowerd_link)->i_nlink);
+		d_instantiate(dentry, inode);
+	} else {
+		struct inode *inode_tmp;
+		struct inode *loweri_new = d_inode(lowerd_new);
+
+		inode_tmp = inode_insert5(inode, (unsigned long)loweri_new,
+					  shiftfs_inode_test, shiftfs_inode_set,
+					  loweri_new);
+		if (unlikely(inode_tmp != inode)) {
+			pr_err_ratelimited("shiftfs: newly created inode found in cache\n");
+			iput(inode_tmp);
+			err = -EINVAL;
+			goto out_iput;
+		}
 
-	shiftfs_fill_inode(newi, new);
+		ihold(loweri_new);
+		shiftfs_fill_inode(inode, loweri_new->i_ino, loweri_new->i_mode,
+				   0, lowerd_new);
+		d_instantiate_new(dentry, inode);
+	}
 
-	d_instantiate(dentry, newi);
+	shiftfs_copyattr(loweri_dir, diri);
+	if (loweri_iop_ptr == loweri_dir_iop->mkdir)
+		set_nlink(diri, loweri_dir->i_nlink);
 
-	new = NULL;
-	newi = NULL;
+	inode = NULL;
 
- out_dput:
-	dput(new);
-	iput(newi);
-	inode_unlock(reali);
+out_iput:
+	iput(inode);
+	inode_unlock(loweri_dir);
 
 	return err;
 }
@@ -358,7 +541,7 @@ static int shiftfs_create(struct inode *dir, struct dentry *dentry,
 {
 	mode |= S_IFREG;
 
-	return shiftfs_make_object(dir, dentry, mode, NULL, NULL, excl);
+	return shiftfs_create_object(dir, dentry, mode, NULL, NULL, excl);
 }
 
 static int shiftfs_mkdir(struct inode *dir, struct dentry *dentry,
@@ -366,39 +549,52 @@ static int shiftfs_mkdir(struct inode *dir, struct dentry *dentry,
 {
 	mode |= S_IFDIR;
 
-	return shiftfs_make_object(dir, dentry, mode, NULL, NULL, false);
+	return shiftfs_create_object(dir, dentry, mode, NULL, NULL, false);
 }
 
 static int shiftfs_link(struct dentry *hardlink, struct inode *dir,
 			struct dentry *dentry)
 {
-	return shiftfs_make_object(dir, dentry, 0, NULL, hardlink, false);
+	return shiftfs_create_object(dir, dentry, 0, NULL, hardlink, false);
+}
+
+static int shiftfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+			 dev_t rdev)
+{
+	if (S_ISCHR(mode) || S_ISBLK(mode))
+		return -EPERM;
+
+	return shiftfs_create_object(dir, dentry, mode, NULL, NULL, false);
 }
 
 static int shiftfs_symlink(struct inode *dir, struct dentry *dentry,
 			   const char *symlink)
 {
-	return shiftfs_make_object(dir, dentry, S_IFLNK, symlink, NULL, false);
+	return shiftfs_create_object(dir, dentry, S_IFLNK, symlink, NULL, false);
 }
 
 static int shiftfs_rm(struct inode *dir, struct dentry *dentry, bool rmdir)
 {
-	struct dentry *real = dir->i_private, *new = dentry->d_fsdata;
-	struct inode *reali = real->d_inode;
+	struct dentry *lowerd = dentry->d_fsdata;
+	struct inode *loweri = dir->i_private;
 	int err;
-	const struct cred *oldcred, *newcred;
-
-	inode_lock_nested(reali, I_MUTEX_PARENT);
-
-	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
+	const struct cred *oldcred;
 
+	oldcred = shiftfs_override_creds(dentry->d_sb);
+	inode_lock_nested(loweri, I_MUTEX_PARENT);
 	if (rmdir)
-		err = vfs_rmdir(reali, new);
+		err = vfs_rmdir(loweri, lowerd);
 	else
-		err = vfs_unlink(reali, new, NULL);
+		err = vfs_unlink(loweri, lowerd, NULL);
+	inode_unlock(loweri);
+	revert_creds(oldcred);
 
-	shiftfs_old_creds(oldcred, &newcred);
-	inode_unlock(reali);
+	shiftfs_copyattr(loweri, dir);
+	set_nlink(d_inode(dentry), loweri->i_nlink);
+	if (!err)
+		d_drop(dentry);
+
+	set_nlink(dir, loweri->i_nlink);
 
 	return err;
 }
@@ -417,27 +613,30 @@ static int shiftfs_rename(struct inode *olddir, struct dentry *old,
 			  struct inode *newdir, struct dentry *new,
 			  unsigned int flags)
 {
-	struct dentry *rodd = olddir->i_private, *rndd = newdir->i_private,
-		*realold = old->d_fsdata,
-		*realnew = new->d_fsdata, *trap;
-	struct inode *realolddir = rodd->d_inode, *realnewdir = rndd->d_inode;
+	struct dentry *lowerd_dir_old = old->d_parent->d_fsdata,
+		      *lowerd_dir_new = new->d_parent->d_fsdata,
+		      *lowerd_old = old->d_fsdata, *lowerd_new = new->d_fsdata,
+		      *trapd;
+	struct inode *loweri_dir_old = lowerd_dir_old->d_inode,
+		     *loweri_dir_new = lowerd_dir_new->d_inode;
 	int err = -EINVAL;
-	const struct cred *oldcred, *newcred;
+	const struct cred *oldcred;
 
-	trap = lock_rename(rndd, rodd);
+	trapd = lock_rename(lowerd_dir_new, lowerd_dir_old);
 
-	if (trap == realold || trap == realnew)
+	if (trapd == lowerd_old || trapd == lowerd_new)
 		goto out_unlock;
 
-	oldcred = shiftfs_new_creds(&newcred, old->d_sb);
-
-	err = vfs_rename(realolddir, realold, realnewdir,
-			 realnew, NULL, flags);
+	oldcred = shiftfs_override_creds(old->d_sb);
+	err = vfs_rename(loweri_dir_old, lowerd_old, loweri_dir_new, lowerd_new,
+			 NULL, flags);
+	revert_creds(oldcred);
 
-	shiftfs_old_creds(oldcred, &newcred);
+	shiftfs_copyattr(loweri_dir_old, olddir);
+	shiftfs_copyattr(loweri_dir_new, newdir);
 
- out_unlock:
-	unlock_rename(rndd, rodd);
+out_unlock:
+	unlock_rename(lowerd_dir_new, lowerd_dir_old);
 
 	return err;
 }
@@ -445,304 +644,1205 @@ static int shiftfs_rename(struct inode *olddir, struct dentry *old,
 static struct dentry *shiftfs_lookup(struct inode *dir, struct dentry *dentry,
 				     unsigned int flags)
 {
-	struct dentry *real = dir->i_private, *new;
-	struct inode *reali = real->d_inode, *newi;
-	const struct cred *oldcred, *newcred;
-
-	inode_lock(reali);
-	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-	new = lookup_one_len(dentry->d_name.name, real, dentry->d_name.len);
-	shiftfs_old_creds(oldcred, &newcred);
-	inode_unlock(reali);
+	struct dentry *new;
+	struct inode *newi;
+	const struct cred *oldcred;
+	struct dentry *lowerd = dentry->d_parent->d_fsdata;
+	struct inode *inode = NULL, *loweri = lowerd->d_inode;
+
+	inode_lock(loweri);
+	oldcred = shiftfs_override_creds(dentry->d_sb);
+	new = lookup_one_len(dentry->d_name.name, lowerd, dentry->d_name.len);
+	revert_creds(oldcred);
+	inode_unlock(loweri);
 
 	if (IS_ERR(new))
 		return new;
 
 	dentry->d_fsdata = new;
 
-	newi = NULL;
-	if (!new->d_inode)
+	newi = new->d_inode;
+	if (!newi)
 		goto out;
 
-	newi = shiftfs_new_inode(dentry->d_sb, new->d_inode->i_mode, new);
-	if (!newi) {
+	inode = iget5_locked(dentry->d_sb, (unsigned long)newi,
+			     shiftfs_inode_test, shiftfs_inode_set, newi);
+	if (!inode) {
 		dput(new);
 		return ERR_PTR(-ENOMEM);
 	}
+	if (inode->i_state & I_NEW) {
+		/*
+		 * inode->i_private is set by shiftfs_inode_set(), but we
+		 * still need to take a reference on the lower inode.
+		 */
+		ihold(newi);
+		shiftfs_fill_inode(inode, newi->i_ino, newi->i_mode, 0, new);
+		unlock_new_inode(inode);
+	}
 
- out:
-	return d_splice_alias(newi, dentry);
+out:
+	return d_splice_alias(inode, dentry);
 }
 
 static int shiftfs_permission(struct inode *inode, int mask)
 {
-	struct dentry *real = inode->i_private;
-	struct inode *reali = real->d_inode;
-	const struct inode_operations *iop = reali->i_op;
 	int err;
-	const struct cred *oldcred, *newcred;
+	const struct cred *oldcred;
+	struct inode *loweri = inode->i_private;
 
-	if (mask & MAY_NOT_BLOCK)
+	if (!loweri) {
+		WARN_ON(!(mask & MAY_NOT_BLOCK));
 		return -ECHILD;
+	}
 
-	oldcred = shiftfs_new_creds(&newcred, inode->i_sb);
-	if (iop->permission)
-		err = iop->permission(reali, mask);
-	else
-		err = generic_permission(reali, mask);
-	shiftfs_old_creds(oldcred, &newcred);
+	err = generic_permission(inode, mask);
+	if (err)
+		return err;
+
+	oldcred = shiftfs_override_creds(inode->i_sb);
+	err = inode_permission(loweri, mask);
+	revert_creds(oldcred);
+
+	return err;
+}
+
+static int shiftfs_fiemap(struct inode *inode,
+			  struct fiemap_extent_info *fieinfo, u64 start,
+			  u64 len)
+{
+	int err;
+	const struct cred *oldcred;
+	struct inode *loweri = inode->i_private;
+
+	if (!loweri->i_op->fiemap)
+		return -EOPNOTSUPP;
+
+	oldcred = shiftfs_override_creds(inode->i_sb);
+	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
+		filemap_write_and_wait(loweri->i_mapping);
+	err = loweri->i_op->fiemap(loweri, fieinfo, start, len);
+	revert_creds(oldcred);
+
+	return err;
+}
+
+static int shiftfs_tmpfile(struct inode *dir, struct dentry *dentry,
+			   umode_t mode)
+{
+	int err;
+	const struct cred *oldcred;
+	struct dentry *lowerd = dentry->d_fsdata;
+	struct inode *loweri = dir->i_private;
+
+	if (!loweri->i_op->tmpfile)
+		return -EOPNOTSUPP;
+
+	oldcred = shiftfs_override_creds(dir->i_sb);
+	err = loweri->i_op->tmpfile(loweri, lowerd, mode);
+	revert_creds(oldcred);
 
 	return err;
 }
 
 static int shiftfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
-	struct dentry *real = dentry->d_fsdata;
-	struct inode *reali = real->d_inode;
-	const struct inode_operations *iop = reali->i_op;
+	struct dentry *lowerd = dentry->d_fsdata;
+	struct inode *loweri = lowerd->d_inode;
 	struct iattr newattr = *attr;
-	const struct cred *oldcred, *newcred;
+	const struct cred *oldcred;
 	struct super_block *sb = dentry->d_sb;
 	int err;
 
+	err = setattr_prepare(dentry, attr);
+	if (err)
+		return err;
+
 	newattr.ia_uid = KUIDT_INIT(from_kuid(sb->s_user_ns, attr->ia_uid));
 	newattr.ia_gid = KGIDT_INIT(from_kgid(sb->s_user_ns, attr->ia_gid));
 
-	oldcred = shiftfs_new_creds(&newcred, dentry->d_sb);
-	inode_lock(reali);
-	if (iop->setattr)
-		err = iop->setattr(real, &newattr);
-	else
-		err = simple_setattr(real, &newattr);
-	inode_unlock(reali);
-	shiftfs_old_creds(oldcred, &newcred);
+	inode_lock(loweri);
+	oldcred = shiftfs_override_creds(dentry->d_sb);
+	err = notify_change(lowerd, attr, NULL);
+	revert_creds(oldcred);
+	inode_unlock(loweri);
 
-	if (err)
-		return err;
+	shiftfs_copyattr(loweri, d_inode(dentry));
 
-	/* all OK, reflect the change on our inode */
-	setattr_copy(d_inode(dentry), attr);
-	return 0;
+	return err;
 }
 
 static int shiftfs_getattr(const struct path *path, struct kstat *stat,
 			   u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = path->dentry->d_inode;
-	struct dentry *real = path->dentry->d_fsdata;
-	struct inode *reali = real->d_inode;
-	const struct inode_operations *iop = reali->i_op;
-	struct path newpath = { .mnt = path->dentry->d_sb->s_fs_info, .dentry = real };
-	int err = 0;
-
-	if (iop->getattr)
-		err = iop->getattr(&newpath, stat, request_mask, query_flags);
-	else
-		generic_fillattr(reali, stat);
+	struct dentry *lowerd = path->dentry->d_fsdata;
+	struct inode *loweri = lowerd->d_inode;
+	struct shiftfs_super_info *info = path->dentry->d_sb->s_fs_info;
+	struct path newpath = { .mnt = info->mnt, .dentry = lowerd };
+	struct user_namespace *from_ns = loweri->i_sb->s_user_ns;
+	struct user_namespace *to_ns = inode->i_sb->s_user_ns;
+	const struct cred *oldcred;
+	int err;
+
+	oldcred = shiftfs_override_creds(inode->i_sb);
+	err = vfs_getattr(&newpath, stat, request_mask, query_flags);
+	revert_creds(oldcred);
 
 	if (err)
 		return err;
 
 	/* transform the underlying id */
-	stat->uid = make_kuid(inode->i_sb->s_user_ns, __kuid_val(stat->uid));
-	stat->gid = make_kgid(inode->i_sb->s_user_ns, __kgid_val(stat->gid));
+	stat->uid = shift_kuid(from_ns, to_ns, stat->uid);
+	stat->gid = shift_kgid(from_ns, to_ns, stat->gid);
 	return 0;
 }
 
-static const struct inode_operations shiftfs_inode_ops = {
-	.lookup		= shiftfs_lookup,
-	.getattr	= shiftfs_getattr,
-	.setattr	= shiftfs_setattr,
-	.permission	= shiftfs_permission,
-	.mkdir		= shiftfs_mkdir,
-	.symlink	= shiftfs_symlink,
-	.get_link	= shiftfs_get_link,
-	.readlink	= shiftfs_readlink,
-	.unlink		= shiftfs_unlink,
-	.rmdir		= shiftfs_rmdir,
-	.rename		= shiftfs_rename,
-	.link		= shiftfs_link,
-	.create		= shiftfs_create,
-	.mknod		= NULL,	/* no special files currently */
-	.listxattr	= shiftfs_listxattr,
-};
+#ifdef CONFIG_SHIFT_FS_POSIX_ACL
 
-static struct inode *shiftfs_new_inode(struct super_block *sb, umode_t mode,
-				       struct dentry *dentry)
+static int
+shift_acl_ids(struct user_namespace *from, struct user_namespace *to,
+	      struct posix_acl *acl)
 {
-	struct inode *inode;
-
-	inode = new_inode(sb);
-	if (!inode)
-		return NULL;
-
-	/*
-	 * our inode is completely vestigial.  All lookups, getattr
-	 * and permission checks are done on the underlying inode, so
-	 * what the user sees is entirely from the underlying inode.
-	 */
-	mode &= S_IFMT;
+	int i;
+
+	for (i = 0; i < acl->a_count; i++) {
+		struct posix_acl_entry *e = &acl->a_entries[i];
+		switch(e->e_tag) {
+		case ACL_USER:
+			e->e_uid = shift_kuid(from, to, e->e_uid);
+			if (!uid_valid(e->e_uid))
+				return -EOVERFLOW;
+			break;
+		case ACL_GROUP:
+			e->e_gid = shift_kgid(from, to, e->e_gid);
+			if (!gid_valid(e->e_gid))
+				return -EOVERFLOW;
+			break;
+		}
+	}
+	return 0;
+}
 
-	inode->i_ino = get_next_ino();
-	inode->i_mode = mode;
-	inode->i_flags |= S_NOATIME | S_NOCMTIME;
+static void
+shift_acl_xattr_ids(struct user_namespace *from, struct user_namespace *to,
+		    void *value, size_t size)
+{
+	struct posix_acl_xattr_header *header = value;
+	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
+	int count;
+	kuid_t kuid;
+	kgid_t kgid;
 
-	inode->i_op = &shiftfs_inode_ops;
+	if (!value)
+		return;
+	if (size < sizeof(struct posix_acl_xattr_header))
+		return;
+	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+		return;
 
-	shiftfs_fill_inode(inode, dentry);
+	count = posix_acl_xattr_count(size);
+	if (count < 0)
+		return;
+	if (count == 0)
+		return;
 
-	return inode;
+	for (end = entry + count; entry != end; entry++) {
+		switch(le16_to_cpu(entry->e_tag)) {
+		case ACL_USER:
+			kuid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
+			kuid = shift_kuid(from, to, kuid);
+			entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, kuid));
+			break;
+		case ACL_GROUP:
+			kgid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
+			kgid = shift_kgid(from, to, kgid);
+			entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, kgid));
+			break;
+		default:
+			break;
+		}
+	}
 }
 
-static int shiftfs_show_options(struct seq_file *m, struct dentry *dentry)
+static struct posix_acl *shiftfs_get_acl(struct inode *inode, int type)
 {
-	struct super_block *sb = dentry->d_sb;
-	struct shiftfs_super_info *ssi = sb->s_fs_info;
+	struct inode *loweri = inode->i_private;
+	const struct cred *oldcred;
+	struct posix_acl *lower_acl, *acl = NULL;
+	struct user_namespace *from_ns = loweri->i_sb->s_user_ns;
+	struct user_namespace *to_ns = inode->i_sb->s_user_ns;
+	int size;
+	int err;
 
-	if (ssi->mark)
-		seq_show_option(m, "mark", NULL);
+	if (!IS_POSIXACL(loweri))
+		return NULL;
 
-	return 0;
-}
+	oldcred = shiftfs_override_creds(inode->i_sb);
+	lower_acl = get_acl(loweri, type);
+	revert_creds(oldcred);
 
-static int shiftfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	struct super_block *sb = dentry->d_sb;
-	struct shiftfs_super_info *ssi = sb->s_fs_info;
-	struct dentry *root = sb->s_root;
-	struct dentry *realroot = root->d_fsdata;
-	struct path realpath = { .mnt = ssi->mnt, .dentry = realroot };
-	int err;
+	if (lower_acl && !IS_ERR(lower_acl)) {
+		/* XXX: export posix_acl_clone? */
+		size = sizeof(struct posix_acl) +
+		       lower_acl->a_count * sizeof(struct posix_acl_entry);
+		acl = kmemdup(lower_acl, size, GFP_KERNEL);
+		posix_acl_release(lower_acl);
 
-	err = vfs_statfs(&realpath, buf);
-	if (err)
-		return err;
+		if (!acl)
+			return ERR_PTR(-ENOMEM);
 
-	buf->f_type = sb->s_magic;
+		refcount_set(&acl->a_refcount, 1);
 
-	return 0;
+		err = shift_acl_ids(from_ns, to_ns, acl);
+		if (err) {
+			kfree(acl);
+			return ERR_PTR(err);
+		}
+	}
+
+	return acl;
 }
 
-static void shiftfs_put_super(struct super_block *sb)
+static int
+shiftfs_posix_acl_xattr_get(const struct xattr_handler *handler,
+			   struct dentry *dentry, struct inode *inode,
+			   const char *name, void *buffer, size_t size)
 {
-	struct shiftfs_super_info *ssi = sb->s_fs_info;
+	struct inode *loweri = inode->i_private;
+	int ret;
+
+	ret = shiftfs_xattr_get(NULL, dentry, inode, handler->name,
+				buffer, size);
+	if (ret < 0)
+		return ret;
 
-	mntput(ssi->mnt);
-	put_user_ns(ssi->userns);
-	kfree(ssi);
+	inode_lock(loweri);
+	shift_acl_xattr_ids(loweri->i_sb->s_user_ns, inode->i_sb->s_user_ns,
+			    buffer, size);
+	inode_unlock(loweri);
+	return ret;
 }
 
-static const struct xattr_handler shiftfs_xattr_handler = {
-	.prefix = "",
-	.get    = shiftfs_xattr_get,
-	.set    = shiftfs_xattr_set,
-};
+static int
+shiftfs_posix_acl_xattr_set(const struct xattr_handler *handler,
+			    struct dentry *dentry, struct inode *inode,
+			    const char *name, const void *value,
+			    size_t size, int flags)
+{
+	struct inode *loweri = inode->i_private;
+	int err;
 
-const struct xattr_handler *shiftfs_xattr_handlers[] = {
-	&shiftfs_xattr_handler,
-	NULL
-};
+	if (!IS_POSIXACL(loweri) || !loweri->i_op->set_acl)
+		return -EOPNOTSUPP;
+	if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+		return value ? -EACCES : 0;
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+
+	if (value) {
+		shift_acl_xattr_ids(inode->i_sb->s_user_ns,
+				    loweri->i_sb->s_user_ns,
+				    (void *)value, size);
+		err = shiftfs_setxattr(dentry, inode, handler->name, value,
+				       size, flags);
+	} else {
+		err = shiftfs_removexattr(dentry, handler->name);
+	}
 
-static const struct super_operations shiftfs_super_ops = {
-	.put_super	= shiftfs_put_super,
-	.show_options	= shiftfs_show_options,
-	.statfs		= shiftfs_statfs,
+	if (!err)
+		shiftfs_copyattr(loweri, inode);
+
+	return err;
+}
+
+static const struct xattr_handler
+shiftfs_posix_acl_access_xattr_handler = {
+	.name = XATTR_NAME_POSIX_ACL_ACCESS,
+	.flags = ACL_TYPE_ACCESS,
+	.get = shiftfs_posix_acl_xattr_get,
+	.set = shiftfs_posix_acl_xattr_set,
 };
 
-struct shiftfs_data {
-	void *data;
-	const char *path;
+static const struct xattr_handler
+shiftfs_posix_acl_default_xattr_handler = {
+	.name = XATTR_NAME_POSIX_ACL_DEFAULT,
+	.flags = ACL_TYPE_DEFAULT,
+	.get = shiftfs_posix_acl_xattr_get,
+	.set = shiftfs_posix_acl_xattr_set,
 };
 
-static int shiftfs_fill_super(struct super_block *sb, void *raw_data,
-			      int silent)
-{
-	struct shiftfs_data *data = raw_data;
-	char *name = kstrdup(data->path, GFP_KERNEL);
-	int err = -ENOMEM;
-	struct shiftfs_super_info *ssi = NULL;
-	struct path path;
-	struct dentry *dentry;
+#else /* !CONFIG_SHIFT_FS_POSIX_ACL */
 
-	if (!name)
-		goto out;
+#define shiftfs_get_acl NULL
 
-	ssi = kzalloc(sizeof(*ssi), GFP_KERNEL);
-	if (!ssi)
-		goto out;
+#endif /* CONFIG_SHIFT_FS_POSIX_ACL */
 
-	err = -EPERM;
-	err = shiftfs_parse_options(ssi, data->data);
+static const struct inode_operations shiftfs_dir_inode_operations = {
+	.lookup		= shiftfs_lookup,
+	.mkdir		= shiftfs_mkdir,
+	.symlink	= shiftfs_symlink,
+	.unlink		= shiftfs_unlink,
+	.rmdir		= shiftfs_rmdir,
+	.rename		= shiftfs_rename,
+	.link		= shiftfs_link,
+	.setattr	= shiftfs_setattr,
+	.create		= shiftfs_create,
+	.mknod		= shiftfs_mknod,
+	.permission	= shiftfs_permission,
+	.getattr	= shiftfs_getattr,
+	.listxattr	= shiftfs_listxattr,
+	.get_acl	= shiftfs_get_acl,
+};
+
+static const struct inode_operations shiftfs_file_inode_operations = {
+	.fiemap		= shiftfs_fiemap,
+	.getattr	= shiftfs_getattr,
+	.get_acl	= shiftfs_get_acl,
+	.listxattr	= shiftfs_listxattr,
+	.permission	= shiftfs_permission,
+	.setattr	= shiftfs_setattr,
+	.tmpfile	= shiftfs_tmpfile,
+};
+
+static const struct inode_operations shiftfs_special_inode_operations = {
+	.getattr	= shiftfs_getattr,
+	.get_acl	= shiftfs_get_acl,
+	.listxattr	= shiftfs_listxattr,
+	.permission	= shiftfs_permission,
+	.setattr	= shiftfs_setattr,
+};
+
+static const struct inode_operations shiftfs_symlink_inode_operations = {
+	.getattr	= shiftfs_getattr,
+	.get_link	= shiftfs_get_link,
+	.listxattr	= shiftfs_listxattr,
+	.setattr	= shiftfs_setattr,
+};
+
+static struct file *shiftfs_open_realfile(const struct file *file,
+					  struct path *realpath)
+{
+	struct file *lowerf;
+	const struct cred *oldcred;
+	struct inode *inode = file_inode(file);
+	struct inode *loweri = realpath->dentry->d_inode;
+	struct shiftfs_super_info *info = inode->i_sb->s_fs_info;
+
+	oldcred = shiftfs_override_creds(inode->i_sb);
+	/* XXX: open_with_fake_path() is not guaranteed to stay around; if
+	 * it is removed, use dentry_open() instead. */
+	lowerf = open_with_fake_path(realpath, file->f_flags, loweri, info->creator_cred);
+	revert_creds(oldcred);
+
+	return lowerf;
+}
+
+#define SHIFTFS_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT)
+
+static int shiftfs_change_flags(struct file *file, unsigned int flags)
+{
+	struct inode *inode = file_inode(file);
+	int err;
+
+	/* if some flag changed that cannot be changed then something's amiss */
+	if (WARN_ON((file->f_flags ^ flags) & ~SHIFTFS_SETFL_MASK))
+		return -EIO;
+
+	flags &= SHIFTFS_SETFL_MASK;
+
+	if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
+		return -EPERM;
+
+	if (flags & O_DIRECT) {
+		if (!file->f_mapping->a_ops ||
+		    !file->f_mapping->a_ops->direct_IO)
+			return -EINVAL;
+	}
+
+	if (file->f_op->check_flags) {
+		err = file->f_op->check_flags(flags);
+		if (err)
+			return err;
+	}
+
+	spin_lock(&file->f_lock);
+	file->f_flags = (file->f_flags & ~SHIFTFS_SETFL_MASK) | flags;
+	spin_unlock(&file->f_lock);
+
+	return 0;
+}
+
+static int shiftfs_real_fdget(const struct file *file, struct fd *lowerfd)
+{
+	struct shiftfs_file_info *file_info = file->private_data;
+	struct file *realfile = file_info->realfile;
+
+	lowerfd->flags = 0;
+	lowerfd->file = realfile;
+
+	/* Did the flags change since open? */
+	if (unlikely(file->f_flags & lowerfd->file->f_flags))
+		return shiftfs_change_flags(lowerfd->file, file->f_flags);
+
+	return 0;
+}
+
+static int shiftfs_open(struct inode *inode, struct file *file)
+{
+	struct shiftfs_super_info *ssi = inode->i_sb->s_fs_info;
+	struct shiftfs_file_info *file_info;
+	struct file *realfile;
+	struct path *realpath;
+
+	file_info = kmem_cache_zalloc(shiftfs_file_info_cache, GFP_KERNEL);
+	if (!file_info)
+		return -ENOMEM;
+	file->private_data = file_info;
+
+	realpath = &file_info->realpath;
+	realpath->mnt = ssi->mnt;
+	realpath->dentry = file->f_path.dentry->d_fsdata;
+
+	realfile = shiftfs_open_realfile(file, realpath);
+	if (IS_ERR(realfile)) {
+		kfree(file_info);
+		return PTR_ERR(realfile);
+	}
+
+	file_info->realfile = realfile;
+	return 0;
+}
+
+static int shiftfs_release(struct inode *inode, struct file *file)
+{
+	struct shiftfs_file_info *file_info = file->private_data;
+
+	fput(file_info->realfile);
+	kmem_cache_free(shiftfs_file_info_cache, file_info);
+	return 0;
+}
+
+static loff_t shiftfs_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *realinode = file_inode(file)->i_private;
+
+	return generic_file_llseek_size(file, offset, whence,
+					realinode->i_sb->s_maxbytes,
+					i_size_read(realinode));
+}
+
+/* XXX: Need to figure out what to do about atime updates, maybe other
+ * timestamps too ... ref. ovl_file_accessed() */
+
+static rwf_t shiftfs_iocb_to_rwf(struct kiocb *iocb)
+{
+	int ifl = iocb->ki_flags;
+	rwf_t flags = 0;
+
+	if (ifl & IOCB_NOWAIT)
+		flags |= RWF_NOWAIT;
+	if (ifl & IOCB_HIPRI)
+		flags |= RWF_HIPRI;
+	if (ifl & IOCB_DSYNC)
+		flags |= RWF_DSYNC;
+	if (ifl & IOCB_SYNC)
+		flags |= RWF_SYNC;
+
+	return flags;
+}
+
+static ssize_t shiftfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct fd lowerfd;
+	const struct cred *oldcred;
+	ssize_t ret;
+
+	if (!iov_iter_count(iter))
+		return 0;
+
+	ret = shiftfs_real_fdget(file, &lowerfd);
+	if (ret)
+		return ret;
+
+	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
+	ret = vfs_iter_read(lowerfd.file, iter, &iocb->ki_pos,
+			    shiftfs_iocb_to_rwf(iocb));
+	revert_creds(oldcred);
+
+	shiftfs_file_accessed(file);
+
+	fdput(lowerfd);
+	return ret;
+}
+
+static ssize_t shiftfs_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct fd lowerfd;
+	const struct cred *oldcred;
+	ssize_t ret;
+
+	if (!iov_iter_count(iter))
+		return 0;
+
+	inode_lock(inode);
+	/* Update mode */
+	shiftfs_copyattr(inode->i_private, inode);
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out_unlock;
+
+	ret = shiftfs_real_fdget(file, &lowerfd);
+	if (ret)
+		goto out_unlock;
+
+	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
+	file_start_write(lowerfd.file);
+	ret = vfs_iter_write(lowerfd.file, iter, &iocb->ki_pos,
+			     shiftfs_iocb_to_rwf(iocb));
+	file_end_write(lowerfd.file);
+	revert_creds(oldcred);
+
+	/* Update size */
+	shiftfs_copyattr(inode->i_private, inode);
+
+	fdput(lowerfd);
+
+out_unlock:
+	inode_unlock(inode);
+	return ret;
+}
+
+static int shiftfs_fsync(struct file *file, loff_t start, loff_t end,
+			 int datasync)
+{
+	struct fd lowerfd;
+	const struct cred *oldcred;
+	int ret;
+
+	ret = shiftfs_real_fdget(file, &lowerfd);
+	if (ret)
+		return ret;
+
+	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
+	ret = vfs_fsync_range(lowerfd.file, start, end, datasync);
+	revert_creds(oldcred);
+
+	fdput(lowerfd);
+	return ret;
+}
+
+static int shiftfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct shiftfs_file_info *file_info = file->private_data;
+	struct file *realfile = file_info->realfile;
+	const struct cred *oldcred;
+	int ret;
+
+	if (!realfile->f_op->mmap)
+		return -ENODEV;
+
+	if (WARN_ON(file != vma->vm_file))
+		return -EIO;
+
+	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
+	vma->vm_file = get_file(realfile);
+	ret = call_mmap(vma->vm_file, vma);
+	revert_creds(oldcred);
+
+	shiftfs_file_accessed(file);
+
+	if (ret)
+		fput(realfile); /* Drop refcount from new vm_file value */
+	else
+		fput(file); /* Drop refcount from previous vm_file value */
+
+	return ret;
+}
+
+static long shiftfs_fallocate(struct file *file, int mode, loff_t offset,
+			      loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	struct inode *loweri = inode->i_private;
+	struct fd lowerfd;
+	const struct cred *oldcred;
+	int ret;
+
+	ret = shiftfs_real_fdget(file, &lowerfd);
+	if (ret)
+		return ret;
+
+	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
+	ret = vfs_fallocate(lowerfd.file, mode, offset, len);
+	revert_creds(oldcred);
+
+	/* Update size */
+	shiftfs_copyattr(loweri, inode);
+
+	fdput(lowerfd);
+	return ret;
+}
+
+static int shiftfs_fadvise(struct file *file, loff_t offset, loff_t len,
+			   int advice)
+{
+	struct fd lowerfd;
+	const struct cred *oldcred;
+	int ret;
+
+	ret = shiftfs_real_fdget(file, &lowerfd);
+	if (ret)
+		return ret;
+
+	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
+	ret = vfs_fadvise(lowerfd.file, offset, len, advice);
+	revert_creds(oldcred);
+
+	fdput(lowerfd);
+	return ret;
+}
+
+static int shiftfs_override_ioctl_creds(const struct super_block *sb,
+					const struct cred **oldcred,
+					struct cred **newcred)
+{
+	kuid_t fsuid = current_fsuid();
+	kgid_t fsgid = current_fsgid();
+
+	*oldcred = shiftfs_override_creds(sb);
+
+	*newcred = prepare_creds();
+	if (!*newcred) {
+		revert_creds(*oldcred);
+		return -ENOMEM;
+	}
+
+	(*newcred)->fsuid = KUIDT_INIT(from_kuid(sb->s_user_ns, fsuid));
+	(*newcred)->fsgid = KGIDT_INIT(from_kgid(sb->s_user_ns, fsgid));
+
+	/* clear all caps to prevent bypassing capable() checks */
+	cap_clear((*newcred)->cap_bset);
+	cap_clear((*newcred)->cap_effective);
+	cap_clear((*newcred)->cap_inheritable);
+	cap_clear((*newcred)->cap_permitted);
+
+	put_cred(override_creds(*newcred));
+	return 0;
+}
+
+static inline void shiftfs_revert_ioctl_creds(const struct cred *oldcred,
+					      struct cred *newcred)
+{
+	return shiftfs_revert_object_creds(oldcred, newcred);
+}
+
+static long shiftfs_real_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg)
+{
+	long ret = 0;
+	struct fd lowerfd;
+	struct cred *newcred;
+	const struct cred *oldcred;
+	struct super_block *sb = file->f_path.dentry->d_sb;
+
+	ret = shiftfs_real_fdget(file, &lowerfd);
+	if (ret)
+		return ret;
+
+	ret = shiftfs_override_ioctl_creds(sb, &oldcred, &newcred);
+	if (ret)
+		goto out_fdput;
+
+	ret = vfs_ioctl(lowerfd.file, cmd, arg);
+
+	shiftfs_revert_ioctl_creds(oldcred, newcred);
+
+	shiftfs_copyattr(file_inode(lowerfd.file), file_inode(file));
+	shiftfs_copyflags(file_inode(lowerfd.file), file_inode(file));
+
+out_fdput:
+	fdput(lowerfd);
+
+	return ret;
+}
+
+static long shiftfs_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC_GETVERSION:
+		/* fall through */
+	case FS_IOC_GETFLAGS:
+		/* fall through */
+	case FS_IOC_SETFLAGS:
+		break;
+	default:
+		return -ENOTTY;
+	}
+
+	return shiftfs_real_ioctl(file, cmd, arg);
+}
+
+static long shiftfs_compat_ioctl(struct file *file, unsigned int cmd,
+				 unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC32_GETVERSION:
+		/* fall through */
+	case FS_IOC32_GETFLAGS:
+		/* fall through */
+	case FS_IOC32_SETFLAGS:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return shiftfs_real_ioctl(file, cmd, arg);
+}
+
+enum shiftfs_copyop {
+	SHIFTFS_COPY,
+	SHIFTFS_CLONE,
+	SHIFTFS_DEDUPE,
+};
+
+static ssize_t shiftfs_copyfile(struct file *file_in, loff_t pos_in,
+				struct file *file_out, loff_t pos_out, u64 len,
+				unsigned int flags, enum shiftfs_copyop op)
+{
+	ssize_t ret;
+	struct fd real_in, real_out;
+	const struct cred *oldcred;
+	struct inode *inode_out = file_inode(file_out);
+	struct inode *loweri = inode_out->i_private;
+
+	ret = shiftfs_real_fdget(file_out, &real_out);
+	if (ret)
+		return ret;
+
+	ret = shiftfs_real_fdget(file_in, &real_in);
+	if (ret) {
+		fdput(real_out);
+		return ret;
+	}
+
+	oldcred = shiftfs_override_creds(inode_out->i_sb);
+	switch (op) {
+	case SHIFTFS_COPY:
+		ret = vfs_copy_file_range(real_in.file, pos_in, real_out.file,
+					  pos_out, len, flags);
+		break;
+
+	case SHIFTFS_CLONE:
+		ret = vfs_clone_file_range(real_in.file, pos_in, real_out.file,
+					   pos_out, len, flags);
+		break;
+
+	case SHIFTFS_DEDUPE:
+		ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
+						real_out.file, pos_out, len,
+						flags);
+		break;
+	}
+	revert_creds(oldcred);
+
+	/* Update size */
+	shiftfs_copyattr(loweri, inode_out);
+
+	fdput(real_in);
+	fdput(real_out);
+
+	return ret;
+}
+
+static ssize_t shiftfs_copy_file_range(struct file *file_in, loff_t pos_in,
+				       struct file *file_out, loff_t pos_out,
+				       size_t len, unsigned int flags)
+{
+	return shiftfs_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
+				SHIFTFS_COPY);
+}
+
+static loff_t shiftfs_remap_file_range(struct file *file_in, loff_t pos_in,
+				       struct file *file_out, loff_t pos_out,
+				       loff_t len, unsigned int remap_flags)
+{
+	enum shiftfs_copyop op;
+
+	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+		return -EINVAL;
+
+	if (remap_flags & REMAP_FILE_DEDUP)
+		op = SHIFTFS_DEDUPE;
+	else
+		op = SHIFTFS_CLONE;
+
+	return shiftfs_copyfile(file_in, pos_in, file_out, pos_out, len,
+				remap_flags, op);
+}
+
+static int shiftfs_iterate_shared(struct file *file, struct dir_context *ctx)
+{
+	const struct cred *oldcred;
+	int err = -ENOTDIR;
+	struct shiftfs_file_info *file_info = file->private_data;
+	struct file *realfile = file_info->realfile;
+
+	oldcred = shiftfs_override_creds(file->f_path.dentry->d_sb);
+	err = iterate_dir(realfile, ctx);
+	revert_creds(oldcred);
+
+	return err;
+}
+
+const struct file_operations shiftfs_file_operations = {
+	.open			= shiftfs_open,
+	.release		= shiftfs_release,
+	.llseek			= shiftfs_llseek,
+	.read_iter		= shiftfs_read_iter,
+	.write_iter		= shiftfs_write_iter,
+	.fsync			= shiftfs_fsync,
+	.mmap			= shiftfs_mmap,
+	.fallocate		= shiftfs_fallocate,
+	.fadvise		= shiftfs_fadvise,
+	.unlocked_ioctl		= shiftfs_ioctl,
+	.compat_ioctl		= shiftfs_compat_ioctl,
+	.copy_file_range	= shiftfs_copy_file_range,
+	.remap_file_range	= shiftfs_remap_file_range,
+};
+
+const struct file_operations shiftfs_dir_operations = {
+	.compat_ioctl		= shiftfs_compat_ioctl,
+	.fsync			= shiftfs_fsync,
+	.iterate_shared		= shiftfs_iterate_shared,
+	.llseek			= shiftfs_llseek,
+	.open			= shiftfs_open,
+	.read			= generic_read_dir,
+	.release		= shiftfs_release,
+	.unlocked_ioctl		= shiftfs_ioctl,
+};
+
+static const struct address_space_operations shiftfs_aops = {
+	/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
+	.direct_IO	= noop_direct_IO,
+};
+
+static void shiftfs_fill_inode(struct inode *inode, unsigned long ino,
+			       umode_t mode, dev_t dev, struct dentry *dentry)
+{
+	struct inode *loweri;
+
+	inode->i_ino = ino;
+	inode->i_flags |= S_NOCMTIME;
+
+	mode &= S_IFMT;
+	inode->i_mode = mode;
+	switch (mode & S_IFMT) {
+	case S_IFDIR:
+		inode->i_op = &shiftfs_dir_inode_operations;
+		inode->i_fop = &shiftfs_dir_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &shiftfs_symlink_inode_operations;
+		break;
+	case S_IFREG:
+		inode->i_op = &shiftfs_file_inode_operations;
+		inode->i_fop = &shiftfs_file_operations;
+		inode->i_mapping->a_ops = &shiftfs_aops;
+		break;
+	default:
+		inode->i_op = &shiftfs_special_inode_operations;
+		init_special_inode(inode, mode, dev);
+		break;
+	}
+
+	if (!dentry)
+		return;
+
+	loweri = dentry->d_inode;
+	if (!loweri->i_op->get_link)
+		inode->i_opflags |= IOP_NOFOLLOW;
+
+	shiftfs_copyattr(loweri, inode);
+	shiftfs_copyflags(loweri, inode);
+	set_nlink(inode, loweri->i_nlink);
+}
+
+static int shiftfs_show_options(struct seq_file *m, struct dentry *dentry)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
+
+	if (sbinfo->mark)
+		seq_show_option(m, "mark", NULL);
+
+	if (sbinfo->passthrough)
+		seq_printf(m, ",passthrough=%u", sbinfo->passthrough);
+
+	return 0;
+}
+
+static int shiftfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
+	struct dentry *root = sb->s_root;
+	struct dentry *realroot = root->d_fsdata;
+	struct path realpath = { .mnt = sbinfo->mnt, .dentry = realroot };
+	int err;
+
+	err = vfs_statfs(&realpath, buf);
 	if (err)
-		goto out;
+		return err;
 
-	/* to mark a mount point, must be real root */
-	if (ssi->mark && !capable(CAP_SYS_ADMIN))
-		goto out;
+	if (!shiftfs_passthrough_statfs(sbinfo))
+		buf->f_type = sb->s_magic;
 
-	/* else to mount a mark, must be userns admin */
-	if (!ssi->mark && !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
-		goto out;
+	return 0;
+}
 
-	err = kern_path(name, LOOKUP_FOLLOW, &path);
+static void shiftfs_evict_inode(struct inode *inode)
+{
+	struct inode *loweri = inode->i_private;
+
+	clear_inode(inode);
+
+	if (loweri)
+		iput(loweri);
+}
+
+static void shiftfs_put_super(struct super_block *sb)
+{
+	struct shiftfs_super_info *sbinfo = sb->s_fs_info;
+
+	if (sbinfo) {
+		mntput(sbinfo->mnt);
+		put_cred(sbinfo->creator_cred);
+		kfree(sbinfo);
+	}
+}
+
+static const struct xattr_handler shiftfs_xattr_handler = {
+	.prefix = "",
+	.get    = shiftfs_xattr_get,
+	.set    = shiftfs_xattr_set,
+};
+
+const struct xattr_handler *shiftfs_xattr_handlers[] = {
+#ifdef CONFIG_SHIFT_FS_POSIX_ACL
+	&shiftfs_posix_acl_access_xattr_handler,
+	&shiftfs_posix_acl_default_xattr_handler,
+#endif
+	&shiftfs_xattr_handler,
+	NULL
+};
+
+static inline bool passthrough_is_subset(int old_flags, int new_flags)
+{
+	if ((new_flags & old_flags) != new_flags)
+		return false;
+
+	return true;
+}
+
+static int shiftfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	int err;
+	struct shiftfs_super_info new = {};
+	struct shiftfs_super_info *info = sb->s_fs_info;
+
+	err = shiftfs_parse_mount_options(&new, data);
 	if (err)
-		goto out;
+		return err;
+
+	/* Mark mount option cannot be changed. */
+	if (info->mark || (info->mark != new.mark))
+		return -EPERM;
+
+	if (info->passthrough != new.passthrough) {
+		/* Don't allow exceeding passthrough options of mark mount. */
+		if (!passthrough_is_subset(info->info_mark->passthrough,
+					   info->passthrough))
+			return -EPERM;
+
+		info->passthrough = new.passthrough;
+	}
+
+	return 0;
+}
+
+static const struct super_operations shiftfs_super_ops = {
+	.put_super	= shiftfs_put_super,
+	.show_options	= shiftfs_show_options,
+	.statfs		= shiftfs_statfs,
+	.remount_fs	= shiftfs_remount,
+	.evict_inode	= shiftfs_evict_inode,
+};
+
+struct shiftfs_data {
+	void *data;
+	const char *path;
+};
+
+static int shiftfs_fill_super(struct super_block *sb, void *raw_data,
+			      int silent)
+{
+	int err;
+	struct path path = {};
+	struct shiftfs_super_info *sbinfo_mp;
+	char *name = NULL;
+	struct inode *inode = NULL;
+	struct dentry *dentry = NULL;
+	struct shiftfs_data *data = raw_data;
+	struct shiftfs_super_info *sbinfo = NULL;
+
+	if (!data->path)
+		return -EINVAL;
 
-	err = -EPERM;
+	sb->s_fs_info = kzalloc(sizeof(*sbinfo), GFP_KERNEL);
+	if (!sb->s_fs_info)
+		return -ENOMEM;
+	sbinfo = sb->s_fs_info;
+
+	err = shiftfs_parse_mount_options(sbinfo, data->data);
+	if (err)
+		return err; /* NOTE(review): sbinfo not freed here - verify */
+
+	/* non-mark mounts require CAP_SYS_ADMIN in the caller's userns */
+	if (!sbinfo->mark && !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		return -EPERM;
+
+	name = kstrdup(data->path, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	err = kern_path(name, LOOKUP_FOLLOW, &path);
+	if (err)
+		goto out_free_name;
 
 	if (!S_ISDIR(path.dentry->d_inode->i_mode)) {
 		err = -ENOTDIR;
-		goto out_put;
+		goto out_put_path;
 	}
 
-	sb->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
-	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
-		printk(KERN_ERR "shiftfs: maximum stacking depth exceeded\n");
-		err = -EINVAL;
-		goto out_put;
-	}
+	if (sbinfo->mark) {
+		struct super_block *lower_sb = path.mnt->mnt_sb;
+
+		/* to mark a mount point, must be root wrt lower s_user_ns */
+		if (!ns_capable(lower_sb->s_user_ns, CAP_SYS_ADMIN)) {
+			err = -EPERM;
+			goto out_put_path;
+		}
 
-	if (ssi->mark) {
 		/*
 		 * this part is visible unshifted, so make sure no
 		 * executables that could be used to give suid
 		 * privileges
 		 */
 		sb->s_iflags = SB_I_NOEXEC;
-		ssi->mnt = path.mnt;
-		dentry = path.dentry;
-	} else {
-		struct shiftfs_super_info *mp_ssi;
 
 		/*
-		 * this leg executes if we're admin capable in
-		 * the namespace, so be very careful
+		 * Handle nesting of shiftfs mounts by referring this mark
+		 * mount back to the original mark mount. This is more
+		 * efficient and alleviates concerns about stack depth.
 		 */
+		if (lower_sb->s_magic == SHIFTFS_MAGIC) {
+			sbinfo_mp = lower_sb->s_fs_info;
+
+			/* Doesn't make sense to mark a mark mount */
+			if (sbinfo_mp->mark) {
+				err = -EINVAL;
+				goto out_put_path;
+			}
+
+			if (!passthrough_is_subset(sbinfo_mp->passthrough,
+						   sbinfo->passthrough)) {
+				err = -EPERM;
+				goto out_put_path;
+			}
+
+			sbinfo->mnt = mntget(sbinfo_mp->mnt);
+			dentry = dget(path.dentry->d_fsdata);
+		} else {
+			sbinfo->mnt = mntget(path.mnt);
+			dentry = dget(path.dentry);
+		}
+
+		sbinfo->creator_cred = prepare_creds();
+		if (!sbinfo->creator_cred) {
+			err = -ENOMEM;
+			goto out_put_path;
+		}
+	} else {
+		/*
+		 * This leg executes if we're admin capable in the namespace,
+		 * so be very careful.
+		 */
+		err = -EPERM;
 		if (path.dentry->d_sb->s_magic != SHIFTFS_MAGIC)
-			goto out_put;
-		mp_ssi = path.dentry->d_sb->s_fs_info;
-		if (!mp_ssi->mark)
-			goto out_put;
-		ssi->mnt = mntget(mp_ssi->mnt);
+			goto out_put_path;
+
+		sbinfo_mp = path.dentry->d_sb->s_fs_info;
+		if (!sbinfo_mp->mark)
+			goto out_put_path;
+
+		if (!passthrough_is_subset(sbinfo_mp->passthrough,
+					   sbinfo->passthrough))
+			goto out_put_path;
+
+		sbinfo->mnt = mntget(sbinfo_mp->mnt);
+		sbinfo->creator_cred = get_cred(sbinfo_mp->creator_cred);
 		dentry = dget(path.dentry->d_fsdata);
-		path_put(&path);
+		sbinfo->info_mark = sbinfo_mp; /* used by shiftfs_remount() */
+	}
+
+	sb->s_stack_depth = dentry->d_sb->s_stack_depth + 1;
+	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+		printk(KERN_ERR "shiftfs: maximum stacking depth exceeded\n");
+		err = -EINVAL;
+		goto out_put_path;
+	}
+
+	inode = new_inode(sb);
+	if (!inode) {
+		err = -ENOMEM;
+		goto out_put_path;
 	}
-	ssi->userns = get_user_ns(dentry->d_sb->s_user_ns);
-	sb->s_fs_info = ssi;
+	shiftfs_fill_inode(inode, dentry->d_inode->i_ino, S_IFDIR, 0, dentry);
+
+	ihold(dentry->d_inode);
+	inode->i_private = dentry->d_inode;
+
 	sb->s_magic = SHIFTFS_MAGIC;
 	sb->s_op = &shiftfs_super_ops;
 	sb->s_xattr = shiftfs_xattr_handlers;
 	sb->s_d_op = &shiftfs_dentry_ops;
-	sb->s_root = d_make_root(shiftfs_new_inode(sb, S_IFDIR, dentry));
+	sb->s_flags |= SB_POSIXACL;
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root) {
+		err = -ENOMEM;
+		goto out_put_path;
+	}
+
 	sb->s_root->d_fsdata = dentry;
+	sbinfo->userns = get_user_ns(dentry->d_sb->s_user_ns);
+	shiftfs_copyattr(dentry->d_inode, sb->s_root->d_inode);
 
-	return 0;
+	dentry = NULL; /* reference now held via s_root->d_fsdata */
+	err = 0;
 
- out_put:
+out_put_path:
 	path_put(&path);
- out:
+
+out_free_name:
 	kfree(name);
-	kfree(ssi);
+
+	dput(dentry); /* NULL unless a dget() reference must be dropped */
+
 	return err;
 }
 
@@ -764,17 +1864,37 @@ static struct file_system_type shiftfs_type = {
 
+/*
+ * Module init: create the slab cache for struct shiftfs_file_info and then
+ * register the filesystem type. Returns 0 on success or a negative errno.
+ */
 static int __init shiftfs_init(void)
 {
-	return register_filesystem(&shiftfs_type);
+	int err;
+
+	shiftfs_file_info_cache = kmem_cache_create(
+		"shiftfs_file_info_cache", sizeof(struct shiftfs_file_info), 0,
+		SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!shiftfs_file_info_cache)
+		return -ENOMEM;
+
+	err = register_filesystem(&shiftfs_type);
+	/* Registration failed: destroy the cache instead of leaking it. */
+	if (err)
+		kmem_cache_destroy(shiftfs_file_info_cache);
+
+	return err;
 }
 
 static void __exit shiftfs_exit(void)
 {
 	unregister_filesystem(&shiftfs_type);
+	kmem_cache_destroy(shiftfs_file_info_cache); /* from shiftfs_init() */
 }
 
 MODULE_ALIAS_FS("shiftfs");
 MODULE_AUTHOR("James Bottomley");
-MODULE_DESCRIPTION("uid/gid shifting bind filesystem");
+MODULE_AUTHOR("Seth Forshee <seth.forshee@canonical.com>");
+MODULE_AUTHOR("Christian Brauner <christian.brauner@ubuntu.com>");
+MODULE_DESCRIPTION("id shifting filesystem");
 MODULE_LICENSE("GPL v2");
 module_init(shiftfs_init)
 module_exit(shiftfs_exit)
-- 
2.20.1




More information about the kernel-team mailing list