APPLIED[F]: [SRU][F][PATCH 1/1] ceph: fix inode number handling on arches with 32-bit ino_t

Ian May ian.may at canonical.com
Thu Oct 22 18:15:15 UTC 2020


Applied to Focal/master-next

Thanks,
Ian

On 2020-10-14 21:12:01 , frank.heimes at canonical.com wrote:
> From: Tuan Hoang <tmhoang at linux.ibm.com>
> 
> BugLink: https://bugs.launchpad.net/bugs/1899582
> 
> Upstream commit : ebce3eb2f7ef9f6ef01a60874ebd232450107c9a
> 
> Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> historical reasons).
> 
> I think the current handling of inode numbers in the ceph driver is
> wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> just fine when userland code is compiled with LFS support (the common
> case these days).
> 
> What we really want to do is just use 64-bit numbers everywhere, unless
> someone has mounted with the ino32 mount option. In that case, we want
> to ensure that we hash the inode number down to something that will fit
> in 32 bits before presenting the value to userland.
> 
> Add new helper functions that do this, and only do the conversion before
> presenting these values to userland in getattr and readdir.
> 
> The inode table hashvalue is changed to just cast the inode number to
> unsigned long, as low-order bits are the most likely to vary anyway.
> 
> While it's not strictly required, we do want to put something in
> inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> the size of the ino_t type.
> 
> NOTE: This is a user-visible change on 32-bit arches:
> 
> 1/ inode numbers will be seen to have changed between kernel versions.
>    32-bit arches will see large inode numbers now instead of the hashed
>    ones they saw before.
> 
> 2/ any really old software not built with LFS support may start failing
>    stat() calls with -EOVERFLOW on inode numbers >2^32. Nothing much we
>    can do about these, but hopefully the intersection of people running
>    such code on ceph will be very small.
> 
> The workaround for both problems is to mount with "-o ino32".
> 
> [ idryomov: changelog tweak ]
> 
> URL: https://tracker.ceph.com/issues/46828
> Reported-by: Ulrich Weigand <Ulrich.Weigand at de.ibm.com>
> Reported-and-Tested-by: Tuan Hoang1 <Tuan.Hoang1 at ibm.com>
> Signed-off-by: Jeff Layton <jlayton at kernel.org>
> Reviewed-by: "Yan, Zheng" <zyan at redhat.com>
> Signed-off-by: Ilya Dryomov <idryomov at gmail.com>
> (backported from commit ebce3eb2f7ef9f6ef01a60874ebd232450107c9a)
> Signed-off-by: Frank Heimes <frank.heimes at canonical.com>
> ---
>  fs/ceph/caps.c    | 12 ++++----
>  fs/ceph/debugfs.c |  2 +-
>  fs/ceph/dir.c     | 29 ++++++++-----------
>  fs/ceph/inode.c   | 21 +++++++-------
>  fs/ceph/quota.c   |  4 +--
>  fs/ceph/super.h   | 73 ++++++++++++++++++++++++-----------------------
>  6 files changed, 69 insertions(+), 72 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index b2695919435e..785e0ebe9890 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -869,8 +869,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>  	int have = ci->i_snap_caps;
>  
>  	if ((have & mask) == mask) {
> -		dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
> -		     " (mask %s)\n", ci->vfs_inode.i_ino,
> +		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
> +		     " (mask %s)\n", ceph_ino(&ci->vfs_inode),
>  		     ceph_cap_string(have),
>  		     ceph_cap_string(mask));
>  		return 1;
> @@ -881,8 +881,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>  		if (!__cap_is_valid(cap))
>  			continue;
>  		if ((cap->issued & mask) == mask) {
> -			dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
> -			     " (mask %s)\n", ci->vfs_inode.i_ino, cap,
> +			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
> +			     " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
>  			     ceph_cap_string(cap->issued),
>  			     ceph_cap_string(mask));
>  			if (touch)
> @@ -893,8 +893,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>  		/* does a combination of caps satisfy mask? */
>  		have |= cap->issued;
>  		if ((have & mask) == mask) {
> -			dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
> -			     " (mask %s)\n", ci->vfs_inode.i_ino,
> +			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
> +			     " (mask %s)\n", ceph_ino(&ci->vfs_inode),
>  			     ceph_cap_string(cap->issued),
>  			     ceph_cap_string(mask));
>  			if (touch) {
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index facb387c2735..712b43322547 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -128,7 +128,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
>  {
>  	struct seq_file *s = p;
>  
> -	seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
> +	seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
>  		   ceph_cap_string(cap->issued),
>  		   ceph_cap_string(cap->implemented));
>  	return 0;
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 3367a8194f24..d8c6068429fb 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -254,9 +254,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
>  			     dentry, dentry, d_inode(dentry));
>  			ctx->pos = di->offset;
>  			if (!dir_emit(ctx, dentry->d_name.name,
> -				      dentry->d_name.len,
> -				      ceph_translate_ino(dentry->d_sb,
> -							 d_inode(dentry)->i_ino),
> +				      dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
>  				      d_inode(dentry)->i_mode >> 12)) {
>  				dput(dentry);
>  				err = 0;
> @@ -319,18 +317,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  	/* always start with . and .. */
>  	if (ctx->pos == 0) {
>  		dout("readdir off 0 -> '.'\n");
> -		if (!dir_emit(ctx, ".", 1, 
> -			    ceph_translate_ino(inode->i_sb, inode->i_ino),
> +		if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
>  			    inode->i_mode >> 12))
>  			return 0;
>  		ctx->pos = 1;
>  	}
>  	if (ctx->pos == 1) {
> -		ino_t ino = parent_ino(file->f_path.dentry);
> +		u64 ino;
> +		struct dentry *dentry = file->f_path.dentry;
> +
> +		spin_lock(&dentry->d_lock);
> +		ino = ceph_present_inode(dentry->d_parent->d_inode);
> +		spin_unlock(&dentry->d_lock);
> +
>  		dout("readdir off 1 -> '..'\n");
> -		if (!dir_emit(ctx, "..", 2,
> -			    ceph_translate_ino(inode->i_sb, ino),
> -			    inode->i_mode >> 12))
> +		if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
>  			return 0;
>  		ctx->pos = 2;
>  	}
> @@ -498,9 +499,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  	}
>  	for (; i < rinfo->dir_nr; i++) {
>  		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> -		struct ceph_vino vino;
> -		ino_t ino;
> -		u32 ftype;
>  
>  		BUG_ON(rde->offset < ctx->pos);
>  
> @@ -510,13 +508,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  		     rde->name_len, rde->name, &rde->inode.in);
>  
>  		BUG_ON(!rde->inode.in);
> -		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
> -		vino.ino = le64_to_cpu(rde->inode.in->ino);
> -		vino.snap = le64_to_cpu(rde->inode.in->snapid);
> -		ino = ceph_vino_to_ino(vino);
>  
>  		if (!dir_emit(ctx, rde->name, rde->name_len,
> -			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
> +			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
> +			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
>  			dout("filldir stopping us...\n");
>  			return 0;
>  		}
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index c07407586ce8..7164bc8f9115 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
>   */
>  static int ceph_set_ino_cb(struct inode *inode, void *data)
>  {
> -	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
> -	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
> +	struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +	ci->i_vino = *(struct ceph_vino *)data;
> +	inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
>  	inode_set_iversion_raw(inode, 0);
>  	return 0;
>  }
> @@ -50,19 +52,16 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
>  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
>  {
>  	struct inode *inode;
> -	ino_t t = ceph_vino_to_ino(vino);
>  
> -	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
> +	inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
> +			     ceph_set_ino_cb, &vino);
>  	if (!inode)
>  		return ERR_PTR(-ENOMEM);
> -	if (inode->i_state & I_NEW) {
> -		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
> -		     inode, ceph_vinop(inode), (u64)inode->i_ino);
> +	if (inode->i_state & I_NEW)
>  		unlock_new_inode(inode);
> -	}
>  
> -	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
> -	     vino.snap, inode);
> +	dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
> +	     ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
>  	return inode;
>  }
>  
> @@ -2343,7 +2342,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
>  	}
>  
>  	generic_fillattr(inode, stat);
> -	stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> +	stat->ino = ceph_present_inode(inode);
>  
>  	/*
>  	 * btime on newly-allocated inodes is 0, so if this is still set to
> diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> index 19507e2fdb57..bda26b4f9b97 100644
> --- a/fs/ceph/quota.c
> +++ b/fs/ceph/quota.c
> @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
>  {
>  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	struct super_block *sb = mdsc->fsc->sb;
> +	struct inode *root = d_inode(sb->s_root);
>  
>  	if (atomic64_read(&mdsc->quotarealms_count) > 0)
>  		return true;
>  	/* if root is the real CephFS root, we don't have quota realms */
> -	if (sb->s_root->d_inode &&
> -	    (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> +	if (root && ceph_ino(root) == CEPH_INO_ROOT)
>  		return false;
>  	/* otherwise, we can't know for sure */
>  	return true;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index bb12c9f3a218..fa7cd81cf232 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -427,15 +427,7 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
>  	return ceph_inode(inode)->i_vino;
>  }
>  
> -/*
> - * ino_t is <64 bits on many architectures, blech.
> - *
> - *               i_ino (kernel inode)   st_ino (userspace)
> - * i386          32                     32
> - * x86_64+ino32  64                     32
> - * x86_64        64                     64
> - */
> -static inline u32 ceph_ino_to_ino32(__u64 vino)
> +static inline u32 ceph_ino_to_ino32(u64 vino)
>  {
>  	u32 ino = vino & 0xffffffff;
>  	ino ^= vino >> 32;
> @@ -445,34 +437,17 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
>  }
>  
>  /*
> - * kernel i_ino value
> + * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on
> + * some arches. We generally do not use this value inside the ceph driver, but
> + * we do want to set it to something, so that generic vfs code has an
> + * appropriate value for tracepoints and the like.
>   */
> -static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
> +static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino)
>  {
> -#if BITS_PER_LONG == 32
> -	return ceph_ino_to_ino32(vino.ino);
> -#else
> +	if (sizeof(ino_t) == sizeof(u32))
> +		return ceph_ino_to_ino32(vino.ino);
>  	return (ino_t)vino.ino;
> -#endif
> -}
> -
> -/*
> - * user-visible ino (stat, filldir)
> - */
> -#if BITS_PER_LONG == 32
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -	return ino;
> -}
> -#else
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
> -		ino = ceph_ino_to_ino32(ino);
> -	return ino;
>  }
> -#endif
> -
>  
>  /* for printf-style formatting */
>  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
> @@ -481,11 +456,34 @@ static inline u64 ceph_ino(struct inode *inode)
>  {
>  	return ceph_inode(inode)->i_vino.ino;
>  }
> +
>  static inline u64 ceph_snap(struct inode *inode)
>  {
>  	return ceph_inode(inode)->i_vino.snap;
>  }
>  
> +/**
> + * ceph_present_ino - format an inode number for presentation to userland
> + * @sb: superblock where the inode lives
> + * @ino: inode number to (possibly) convert
> + *
> + * If the user mounted with the ino32 option, then the 64-bit value needs
> + * to be converted to something that can fit inside 32 bits. Note that
> + * internal kernel code never uses this value, so this is entirely for
> + * userland consumption.
> + */
> +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
> +{
> +	if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
> +		return ceph_ino_to_ino32(ino);
> +	return ino;
> +}
> +
> +static inline u64 ceph_present_inode(struct inode *inode)
> +{
> +	return ceph_present_ino(inode->i_sb, ceph_ino(inode));
> +}
> +
>  static inline int ceph_ino_compare(struct inode *inode, void *data)
>  {
>  	struct ceph_vino *pvino = (struct ceph_vino *)data;
> @@ -494,11 +492,16 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
>  		ci->i_vino.snap == pvino->snap;
>  }
>  
> +
>  static inline struct inode *ceph_find_inode(struct super_block *sb,
>  					    struct ceph_vino vino)
>  {
> -	ino_t t = ceph_vino_to_ino(vino);
> -	return ilookup5(sb, t, ceph_ino_compare, &vino);
> +	/*
> +	 * NB: The hashval will be run through the fs/inode.c hash function
> +	 * anyway, so there is no need to squash the inode number down to
> +	 * 32-bits first. Just use low-order bits on arches with 32-bit long.
> +	 */
> +	return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
>  }
>  
>  
> -- 
> 2.25.1
> 
> 
> -- 
> kernel-team mailing list
> kernel-team at lists.ubuntu.com
> https://lists.ubuntu.com/mailman/listinfo/kernel-team



More information about the kernel-team mailing list