[stable] [PATCH 00/16]: Fix writeback regressions in 2.6.32

Jan Kara jack at suse.cz
Mon Aug 16 15:27:24 UTC 2010


On Mon 16-08-10 14:32:55, Stefan Bader wrote:
> On 08/12/2010 05:28 AM, Jens Axboe wrote:
> > On 08/11/2010 04:12 PM, Jens Axboe wrote:
> >> On 08/11/2010 03:59 PM, Greg KH wrote:
> >>> On Tue, Aug 10, 2010 at 11:28:48AM +0200, Stefan Bader wrote:
> >>>> Impact: 2.6.32 is facing some serious performance regressions which are visible
> >>>> plainly when unmounting filesystems as async writeback degrades to sync in
> >>>> that case. But there seem to be other less visible problems as well.
> >>>> I believe this set of patches also would be beneficial to go into upstream
> >>>> stable.
> >>>>
> >>>> Fix: This series of patches was picked and backported when needed to apply
> >>>> to 2.6.32.y (there is another set for 2.6.34.y[1]). It probably does not
> >>>> solve all currently known problems but testing has shown considerable
> >>>> improvements. At the beginning of the set there are some patches that were
> >>>> picked to make the later patches applicable. Only one of them is a bit more
> >>>> intrusive; the others are rather trivial.
> >>>>
> >>>> Testcase: Unmounting a fs that still has larger amounts of data to write back
> >>>> to disk will take noticeably longer time to complete (IIRC might be 30min).
> >>>> Also general responsiveness was reported to improve. Currently there are no
> >>>> known regressions reported when testing this backport.
> >>>
> >>> Jens, what do you think about this series?
> >>
> >> I think it is scary :-)
> >>
> >> But seriously, we need to do something about this umount regression and
> >> there's likely no way around this series. Doing the full backport is
> >> the best option. Stefan, to what extent did you test this backport?
> > 
> > So I discussed this with Jan tonight, and a better solution may be
> > something similar to my first attempt at fixing this patch earlier.
> > Basically just pass in the info on whether this is done from umount or
> > not. The bug here is that umount already holds the sb sem when doing
> > WB_SYNC_NONE writeback, so we skip all dirty inodes for that pass and
> > end up doing everything as WB_SYNC_ALL. If we pass in this lock
> > information, then wb_writeback() knows when to skip pinning the sb.
> > 
> > The below patch is totally untested. Stefan, care to give it a spin with
> > the test case? This would be a lot less intrusive than the full
> > backport, which is primarily backporting cleanups and optimizations that
> > we really need not put in 2.6.32-stable if at all avoidable.
> > 
> While applying the patch for testing I saw that stable 2.6.32 carries
> 
> commit b78a38dca6e04634ddc718e315712b45abcf92fd
> Author: Eric Sandeen <sandeen at redhat.com>
> Date:   Wed Dec 23 07:57:07 2009 -0500
> 
>     fs-writeback: Add helper function to start writeback if idle
> 
>     commit 17bd55d037a02b04d9119511cfd1a4b985d20f63 upstream.
> 
> which needs an additional change in writeback_inodes_sb_if_idle(). Making an
> educated guess I changed that call into the "not locked" variant (patch
> attached). Is that correct?
  Yes, that is correct.

> From e8f179b9dc92fe91d7379a3ac330a57a2ef9fc87 Mon Sep 17 00:00:00 2001
> From: Jens Axboe <axboe at kernel.dk>
> Date: Fri, 13 Aug 2010 14:45:16 +0200
> Subject: [PATCH] writeback: Alternate fix for umount regression
> 
> Signed-off-by: Stefan Bader <stefan.bader at canonical.com>
  The patch looks OK to me. You can add
Acked-by: Jan Kara <jack at suse.cz>

								Honza

> ---
>  fs/fs-writeback.c           |   18 +++++++++++-------
>  fs/sync.c                   |    2 +-
>  fs/ubifs/budget.c           |    2 +-
>  include/linux/backing-dev.h |    2 +-
>  include/linux/writeback.h   |    4 +++-
>  mm/page-writeback.c         |    2 +-
>  6 files changed, 18 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 460e5b7..bc87976 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -45,6 +45,7 @@ struct wb_writeback_args {
>  	int for_kupdate:1;
>  	int range_cyclic:1;
>  	int for_background:1;
> +	int locked:1;
>  };
>  
>  /*
> @@ -252,13 +253,14 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
>   *
>   */
>  void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
> -			 long nr_pages)
> +			 long nr_pages, int locked)
>  {
>  	struct wb_writeback_args args = {
>  		.sb		= sb,
>  		.sync_mode	= WB_SYNC_NONE,
>  		.nr_pages	= nr_pages,
>  		.range_cyclic	= 1,
> +		.locked		= locked,
>  	};
>  
>  	/*
> @@ -585,7 +587,7 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
>  	/*
>  	 * Caller must already hold the ref for this
>  	 */
> -	if (wbc->sync_mode == WB_SYNC_ALL) {
> +	if (wbc->sync_mode == WB_SYNC_ALL || wbc->locked) {
>  		WARN_ON(!rwsem_is_locked(&sb->s_umount));
>  		return 0;
>  	}
> @@ -758,6 +760,7 @@ static long wb_writeback(struct bdi_writeback *wb,
>  		.older_than_this	= NULL,
>  		.for_kupdate		= args->for_kupdate,
>  		.range_cyclic		= args->range_cyclic,
> +		.locked			= args->locked,
>  	};
>  	unsigned long oldest_jif;
>  	long wrote = 0;
> @@ -912,7 +915,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
>  		 * If this isn't a data integrity operation, just notify
>  		 * that we have seen this work and we are now starting it.
>  		 */
> -		if (args.sync_mode == WB_SYNC_NONE)
> +		if (args.sync_mode == WB_SYNC_NONE && !args.locked)
>  			wb_clear_pending(wb, work);
>  
>  		wrote += wb_writeback(wb, &args);
> @@ -921,7 +924,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
>  		 * This is a data integrity writeback, so only do the
>  		 * notification when we have completed the work.
>  		 */
> -		if (args.sync_mode == WB_SYNC_ALL)
> +		if (args.sync_mode == WB_SYNC_ALL || args.locked)
>  			wb_clear_pending(wb, work);
>  	}
>  
> @@ -1206,13 +1209,14 @@ static void wait_sb_inodes(struct super_block *sb)
>  /**
>   * writeback_inodes_sb	-	writeback dirty inodes from given super_block
>   * @sb: the superblock
> + * @locked: sb already pinned
>   *
>   * Start writeback on some inodes on this super_block. No guarantees are made
>   * on how many (if any) will be written, and this function does not wait
>   * for IO completion of submitted IO. The number of pages submitted is
>   * returned.
>   */
> -void writeback_inodes_sb(struct super_block *sb)
> +void writeback_inodes_sb(struct super_block *sb, int locked)
>  {
>  	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
>  	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
> @@ -1221,7 +1225,7 @@ void writeback_inodes_sb(struct super_block *sb)
>  	nr_to_write = nr_dirty + nr_unstable +
>  			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
>  
> -	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
> +	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, locked);
>  }
>  EXPORT_SYMBOL(writeback_inodes_sb);
>  
> @@ -1235,7 +1239,7 @@ EXPORT_SYMBOL(writeback_inodes_sb);
>  int writeback_inodes_sb_if_idle(struct super_block *sb)
>  {
>  	if (!writeback_in_progress(sb->s_bdi)) {
> -		writeback_inodes_sb(sb);
> +		writeback_inodes_sb(sb, 0);
>  		return 1;
>  	} else
>  		return 0;
> diff --git a/fs/sync.c b/fs/sync.c
> index d104591..41d543e 100644
> --- a/fs/sync.c
> +++ b/fs/sync.c
> @@ -37,7 +37,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
>  	/* Avoid doing twice syncing and cache pruning for quota sync */
>  	if (!wait) {
>  		writeout_quota_sb(sb, -1);
> -		writeback_inodes_sb(sb);
> +		writeback_inodes_sb(sb, 1);
>  	} else {
>  		sync_quota_sb(sb, -1);
>  		sync_inodes_sb(sb);
> diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
> index 076ca50..d16a8eb 100644
> --- a/fs/ubifs/budget.c
> +++ b/fs/ubifs/budget.c
> @@ -62,7 +62,7 @@
>   */
>  static void shrink_liability(struct ubifs_info *c, int nr_to_write)
>  {
> -	writeback_inodes_sb(c->vfs_sb);
> +	writeback_inodes_sb(c->vfs_sb, 0);
>  }
>  
>  /**
> diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> index b449e73..ce997ba 100644
> --- a/include/linux/backing-dev.h
> +++ b/include/linux/backing-dev.h
> @@ -102,7 +102,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
>  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
>  void bdi_unregister(struct backing_dev_info *bdi);
>  void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
> -				long nr_pages);
> +				long nr_pages, int locked);
>  int bdi_writeback_task(struct bdi_writeback *wb);
>  int bdi_has_dirty_io(struct backing_dev_info *bdi);
>  
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index dc52482..4578dc1 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -52,6 +52,8 @@ struct writeback_control {
>  	unsigned for_reclaim:1;		/* Invoked from the page allocator */
>  	unsigned range_cyclic:1;	/* range_start is cyclic */
>  	unsigned more_io:1;		/* more io to be dispatched */
> +	unsigned locked:1;		/* sb pinned for WB_SYNC_NONE */
> +
>  	/*
>  	 * write_cache_pages() won't update wbc->nr_to_write and
>  	 * mapping->writeback_index if no_nrwrite_index_update
> @@ -68,7 +70,7 @@ struct writeback_control {
>   */	
>  struct bdi_writeback;
>  int inode_wait(void *);
> -void writeback_inodes_sb(struct super_block *);
> +void writeback_inodes_sb(struct super_block *, int);
>  int writeback_inodes_sb_if_idle(struct super_block *);
>  void sync_inodes_sb(struct super_block *);
>  void writeback_inodes_wbc(struct writeback_control *wbc);
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 2c5d792..e73dbb7 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping,
>  	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
>  			       + global_page_state(NR_UNSTABLE_NFS))
>  					  > background_thresh)))
> -		bdi_start_writeback(bdi, NULL, 0);
> +		bdi_start_writeback(bdi, NULL, 0, 0);
>  }
>  
>  void set_page_dirty_balance(struct page *page, int page_mkwrite)
> -- 
> 1.7.0.4
> 

-- 
Jan Kara <jack at suse.cz>
SUSE Labs, CR




More information about the kernel-team mailing list