ACK: [PATCH][SRU][ARTFUL] Revert "mm, memory_hotplug: do not associate hotadded memory to zones until online"

Khalid Elmously khalid.elmously at canonical.com
Fri Feb 9 17:14:32 UTC 2018


On 2018-02-09 10:08:54, Colin King wrote:
> From: Colin Ian King <colin.king at canonical.com>
> 
> BugLink: http://bugs.launchpad.net/bugs/1747069
> 
> Hotplug removal causes i386 crashes when exercised with the kernel
> selftest mem-on-off-test script.  A fix exists in 4.15, but it requires
> a set of changes that is far too large to be SRU'able, so the least
> risky way forward is to revert the offending commit.
> 
> This fix reverts commit f1dd2cd13c4b ("mm, memory_hotplug: do not
> associate hotadded memory to zones until online"); the revert required
> some manual fix-ups because of fixes that landed on top of that commit.
> 
> Note that running the mem-on-off-test script is not always sufficient
> to trigger the bug.  A good reproducer is to run the script in a 4-CPU
> VM with 2GB of memory, then run sync and re-install the kernel packages
> to trip the issue.  This has been thoroughly tested on i386 and amd64
> and given a solid soak test with the ADT tests as well.
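> 
> As a rough sketch of those steps (the selftest path and the package
> name below are illustrative, not spelled out in this patch):
> 
>     # inside a 4-CPU VM with 2GB of RAM, from a kernel source tree
>     cd tools/testing/selftests/memory-hotplug
>     sudo ./mem-on-off-test.sh          # exercise memory hot-remove/hot-add
>     sync
>     sudo apt-get install --reinstall linux-image-$(uname -r)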
> 
> Signed-off-by: Colin Ian King <colin.king at canonical.com>
> ---
>  arch/ia64/mm/init.c            |  11 +-
>  arch/powerpc/mm/mem.c          |  12 +-
>  arch/s390/mm/init.c            |  32 ++-
>  arch/sh/mm/init.c              |  10 +-
>  arch/x86/mm/init_32.c          |   7 +-
>  arch/x86/mm/init_64.c          |  11 +-
>  drivers/base/memory.c          |  52 +++--
>  include/linux/memory_hotplug.h |  19 +-
>  include/linux/mmzone.h         |  16 --
>  kernel/memremap.c              |   6 +-
>  mm/memory_hotplug.c            | 457 +++++++++++++++++++++++++++--------------
>  mm/sparse.c                    |   3 +-
>  12 files changed, 409 insertions(+), 227 deletions(-)
> 
> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
> index a4e8d6b..39e2aeb 100644
> --- a/arch/ia64/mm/init.c
> +++ b/arch/ia64/mm/init.c
> @@ -646,13 +646,20 @@ mem_init (void)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  {
> +	pg_data_t *pgdat;
> +	struct zone *zone;
>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>  	int ret;
>  
> -	ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
> +	pgdat = NODE_DATA(nid);
> +
> +	zone = pgdat->node_zones +
> +		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
> +	ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
> +
>  	if (ret)
>  		printk("%s: Problem encountered in __add_pages() as ret=%d\n",
>  		       __func__,  ret);
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 46b4e67..331a78b 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -127,14 +127,18 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
>  	return -ENODEV;
>  }
>  
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  {
> +	struct pglist_data *pgdata;
> +	struct zone *zone;
>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>  	int rc;
>  
>  	resize_hpt_for_hotplug(memblock_phys_mem_size());
>  
> +	pgdata = NODE_DATA(nid);
> +
>  	start = (unsigned long)__va(start);
>  	rc = create_section_mapping(start, start + size);
>  	if (rc) {
> @@ -144,7 +148,11 @@ int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
>  		return -EFAULT;
>  	}
>  
> -	return __add_pages(nid, start_pfn, nr_pages, want_memblock);
> +	/* this should work for most non-highmem platforms */
> +	zone = pgdata->node_zones +
> +		zone_for_memory(nid, start, size, 0, for_device);
> +
> +	return __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTREMOVE
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index 8111694..a3d5499 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -166,17 +166,43 @@ unsigned long memory_block_size_bytes(void)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  {
> +	unsigned long zone_start_pfn, zone_end_pfn, nr_pages;
>  	unsigned long start_pfn = PFN_DOWN(start);
>  	unsigned long size_pages = PFN_DOWN(size);
> -	int rc;
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +	struct zone *zone;
> +	int rc, i;
>  
>  	rc = vmem_add_mapping(start, size);
>  	if (rc)
>  		return rc;
>  
> -	rc = __add_pages(nid, start_pfn, size_pages, want_memblock);
> +	for (i = 0; i < MAX_NR_ZONES; i++) {
> +		zone = pgdat->node_zones + i;
> +		if (zone_idx(zone) != ZONE_MOVABLE) {
> +			/* Add range within existing zone limits, if possible */
> +			zone_start_pfn = zone->zone_start_pfn;
> +			zone_end_pfn = zone->zone_start_pfn +
> +				       zone->spanned_pages;
> +		} else {
> +			/* Add remaining range to ZONE_MOVABLE */
> +			zone_start_pfn = start_pfn;
> +			zone_end_pfn = start_pfn + size_pages;
> +		}
> +		if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn)
> +			continue;
> +		nr_pages = (start_pfn + size_pages > zone_end_pfn) ?
> +			   zone_end_pfn - start_pfn : size_pages;
> +		rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
> +		if (rc)
> +			break;
> +		start_pfn += nr_pages;
> +		size_pages -= nr_pages;
> +		if (!size_pages)
> +			break;
> +	}
>  	if (rc)
>  		vmem_remove_mapping(start, size);
>  	return rc;
> diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
> index bf726af..a9d57f7 100644
> --- a/arch/sh/mm/init.c
> +++ b/arch/sh/mm/init.c
> @@ -485,14 +485,20 @@ void free_initrd_mem(unsigned long start, unsigned long end)
>  #endif
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  {
> +	pg_data_t *pgdat;
>  	unsigned long start_pfn = PFN_DOWN(start);
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>  	int ret;
>  
> +	pgdat = NODE_DATA(nid);
> +
>  	/* We only have ZONE_NORMAL, so this is easy.. */
> -	ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
> +	ret = __add_pages(nid, pgdat->node_zones +
> +			zone_for_memory(nid, start, size, ZONE_NORMAL,
> +			for_device),
> +			start_pfn, nr_pages, !for_device);
>  	if (unlikely(ret))
>  		printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
>  
> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
> index 135c9a7..e7bae21 100644
> --- a/arch/x86/mm/init_32.c
> +++ b/arch/x86/mm/init_32.c
> @@ -829,12 +829,15 @@ void __init mem_init(void)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  {
> +	struct pglist_data *pgdata = NODE_DATA(nid);
> +	struct zone *zone = pgdata->node_zones +
> +		zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>  
> -	return __add_pages(nid, start_pfn, nr_pages, want_memblock);
> +	return __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTREMOVE
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 902983c..456b452 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -772,15 +772,22 @@ static void  update_end_of_memory_vars(u64 start, u64 size)
>  	}
>  }
>  
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +/*
> + * Memory is added always to NORMAL zone. This means you will never get
> + * additional DMA/DMA32 memory.
> + */
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  {
> +	struct pglist_data *pgdat = NODE_DATA(nid);
> +	struct zone *zone = pgdat->node_zones +
> +		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>  	int ret;
>  
>  	init_memory_mapping(start, start + size);
>  
> -	ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
> +	ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
>  	WARN_ON_ONCE(ret);
>  
>  	/* update max_pfn, max_low_pfn and high_memory */
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index c7c4e03..1e884d8 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -392,43 +392,39 @@ static ssize_t show_valid_zones(struct device *dev,
>  				struct device_attribute *attr, char *buf)
>  {
>  	struct memory_block *mem = to_memory_block(dev);
> -	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
> +	unsigned long start_pfn, end_pfn;
> +	unsigned long valid_start, valid_end, valid_pages;
>  	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
> -	unsigned long valid_start_pfn, valid_end_pfn;
> -	bool append = false;
> -	int nid;
> +	struct zone *zone;
> +	int zone_shift = 0;
>  
> -	/*
> -	 * The block contains more than one zone can not be offlined.
> -	 * This can happen e.g. for ZONE_DMA and ZONE_DMA32
> -	 */
> -	if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn))
> +	start_pfn = section_nr_to_pfn(mem->start_section_nr);
> +	end_pfn = start_pfn + nr_pages;
> +
> +	/* The block contains more than one zone can not be offlined. */
> +	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
>  		return sprintf(buf, "none\n");
>  
> -	start_pfn = valid_start_pfn;
> -	nr_pages = valid_end_pfn - start_pfn;
> +	zone = page_zone(pfn_to_page(valid_start));
> +	valid_pages = valid_end - valid_start;
>  
> -	/*
> -	 * Check the existing zone. Make sure that we do that only on the
> -	 * online nodes otherwise the page_zone is not reliable
> -	 */
> -	if (mem->state == MEM_ONLINE) {
> -		strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
> -		goto out;
> -	}
> +	/* MMOP_ONLINE_KEEP */
> +	sprintf(buf, "%s", zone->name);
>  
> -	nid = pfn_to_nid(start_pfn);
> -	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) {
> -		strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name);
> -		append = true;
> +	/* MMOP_ONLINE_KERNEL */
> +	zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift);
> +	if (zone_shift) {
> +		strcat(buf, " ");
> +		strcat(buf, (zone + zone_shift)->name);
>  	}
>  
> -	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) {
> -		if (append)
> -			strcat(buf, " ");
> -		strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name);
> +	/* MMOP_ONLINE_MOVABLE */
> +	zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift);
> +	if (zone_shift) {
> +		strcat(buf, " ");
> +		strcat(buf, (zone + zone_shift)->name);
>  	}
> -out:
> +
>  	strcat(buf, "\n");
>  
>  	return strlen(buf);
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index c8a5056..b18a1b8 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -129,8 +129,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
>  	unsigned long nr_pages);
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>  
> -/* reasonably generic interface to expand the physical pages */
> -extern int __add_pages(int nid, unsigned long start_pfn,
> +/* reasonably generic interface to expand the physical pages in a zone  */
> +extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn,
>  	unsigned long nr_pages, bool want_memblock);
>  
>  #ifdef CONFIG_NUMA
> @@ -306,19 +306,18 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
>  		void *arg, int (*func)(struct memory_block *, void *));
>  extern int add_memory(int nid, u64 start, u64 size);
>  extern int add_memory_resource(int nid, struct resource *resource, bool online);
> -extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock);
> -extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
> -		unsigned long nr_pages);
> +extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
> +		bool for_device);
> +extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
>  extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
>  extern bool is_memblock_offlined(struct memory_block *mem);
>  extern void remove_memory(int nid, u64 start, u64 size);
> -extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn);
> +extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn);
>  extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
>  		unsigned long map_offset);
>  extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
>  					  unsigned long pnum);
> -extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
> -		int online_type);
> -extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn,
> -		unsigned long nr_pages);
> +extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
> +			  enum zone_type target, int *zone_shift);
> +
>  #endif /* __LINUX_MEMORY_HOTPLUG_H */
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 9c6c001..ceb86e9 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -533,22 +533,6 @@ static inline bool zone_is_empty(struct zone *zone)
>  }
>  
>  /*
> - * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
> - * intersection with the given zone
> - */
> -static inline bool zone_intersects(struct zone *zone,
> -		unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	if (zone_is_empty(zone))
> -		return false;
> -	if (start_pfn >= zone_end_pfn(zone) ||
> -	    start_pfn + nr_pages <= zone->zone_start_pfn)
> -		return false;
> -
> -	return true;
> -}
> -
> -/*
>   * The "priority" of VM scanning is how much of the queues we will scan in one
>   * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
>   * queues ("queue_length >> 12") during an aging round.
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index 124bed7..23a6483 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -358,11 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
>  		goto err_pfn_remap;
>  
>  	mem_hotplug_begin();
> -	error = arch_add_memory(nid, align_start, align_size, false);
> -	if (!error)
> -		move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
> -					align_start >> PAGE_SHIFT,
> -					align_size >> PAGE_SHIFT);
> +	error = arch_add_memory(nid, align_start, align_size, true);
>  	mem_hotplug_done();
>  	if (error)
>  		goto err_add_memory;
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 8dccc31..a4724e7 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -243,35 +243,216 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
>  }
>  #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
>  
> -static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
> -		bool want_memblock)
> +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
> +				     unsigned long end_pfn)
> +{
> +	unsigned long old_zone_end_pfn;
> +
> +	zone_span_writelock(zone);
> +
> +	old_zone_end_pfn = zone_end_pfn(zone);
> +	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
> +		zone->zone_start_pfn = start_pfn;
> +
> +	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
> +				zone->zone_start_pfn;
> +
> +	zone_span_writeunlock(zone);
> +}
> +
> +static void resize_zone(struct zone *zone, unsigned long start_pfn,
> +		unsigned long end_pfn)
> +{
> +	zone_span_writelock(zone);
> +
> +	if (end_pfn - start_pfn) {
> +		zone->zone_start_pfn = start_pfn;
> +		zone->spanned_pages = end_pfn - start_pfn;
> +	} else {
> +		/*
> +		 * make it consist as free_area_init_core(),
> +		 * if spanned_pages = 0, then keep start_pfn = 0
> +		 */
> +		zone->zone_start_pfn = 0;
> +		zone->spanned_pages = 0;
> +	}
> +
> +	zone_span_writeunlock(zone);
> +}
> +
> +static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
> +		unsigned long end_pfn)
> +{
> +	enum zone_type zid = zone_idx(zone);
> +	int nid = zone->zone_pgdat->node_id;
> +	unsigned long pfn;
> +
> +	for (pfn = start_pfn; pfn < end_pfn; pfn++)
> +		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
> +}
> +
> +static void __ref ensure_zone_is_initialized(struct zone *zone,
> +			unsigned long start_pfn, unsigned long num_pages)
> +{
> +	if (!zone_is_initialized(zone))
> +		init_currently_empty_zone(zone, start_pfn, num_pages);
> +}
> +
> +static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
> +		unsigned long start_pfn, unsigned long end_pfn)
> +{
> +	unsigned long flags;
> +	unsigned long z1_start_pfn;
> +
> +	ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
> +
> +	pgdat_resize_lock(z1->zone_pgdat, &flags);
> +
> +	/* can't move pfns which are higher than @z2 */
> +	if (end_pfn > zone_end_pfn(z2))
> +		goto out_fail;
> +	/* the move out part must be at the left most of @z2 */
> +	if (start_pfn > z2->zone_start_pfn)
> +		goto out_fail;
> +	/* must included/overlap */
> +	if (end_pfn <= z2->zone_start_pfn)
> +		goto out_fail;
> +
> +	/* use start_pfn for z1's start_pfn if z1 is empty */
> +	if (!zone_is_empty(z1))
> +		z1_start_pfn = z1->zone_start_pfn;
> +	else
> +		z1_start_pfn = start_pfn;
> +
> +	resize_zone(z1, z1_start_pfn, end_pfn);
> +	resize_zone(z2, end_pfn, zone_end_pfn(z2));
> +
> +	pgdat_resize_unlock(z1->zone_pgdat, &flags);
> +
> +	fix_zone_id(z1, start_pfn, end_pfn);
> +
> +	return 0;
> +out_fail:
> +	pgdat_resize_unlock(z1->zone_pgdat, &flags);
> +	return -1;
> +}
> +
> +static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
> +		unsigned long start_pfn, unsigned long end_pfn)
> +{
> +	unsigned long flags;
> +	unsigned long z2_end_pfn;
> +
> +	ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
> +
> +	pgdat_resize_lock(z1->zone_pgdat, &flags);
> +
> +	/* can't move pfns which are lower than @z1 */
> +	if (z1->zone_start_pfn > start_pfn)
> +		goto out_fail;
> +	/* the move out part mast at the right most of @z1 */
> +	if (zone_end_pfn(z1) >  end_pfn)
> +		goto out_fail;
> +	/* must included/overlap */
> +	if (start_pfn >= zone_end_pfn(z1))
> +		goto out_fail;
> +
> +	/* use end_pfn for z2's end_pfn if z2 is empty */
> +	if (!zone_is_empty(z2))
> +		z2_end_pfn = zone_end_pfn(z2);
> +	else
> +		z2_end_pfn = end_pfn;
> +
> +	resize_zone(z1, z1->zone_start_pfn, start_pfn);
> +	resize_zone(z2, start_pfn, z2_end_pfn);
> +
> +	pgdat_resize_unlock(z1->zone_pgdat, &flags);
> +
> +	fix_zone_id(z2, start_pfn, end_pfn);
> +
> +	return 0;
> +out_fail:
> +	pgdat_resize_unlock(z1->zone_pgdat, &flags);
> +	return -1;
> +}
> +
> +static struct zone * __meminit move_pfn_range(int zone_shift,
> +		unsigned long start_pfn, unsigned long end_pfn)
> +{
> +	struct zone *zone = page_zone(pfn_to_page(start_pfn));
> +	int ret = 0;
> +
> +	if (zone_shift < 0)
> +		ret = move_pfn_range_left(zone + zone_shift, zone,
> +					  start_pfn, end_pfn);
> +	else if (zone_shift)
> +		ret = move_pfn_range_right(zone, zone + zone_shift,
> +					   start_pfn, end_pfn);
> +
> +	if (ret)
> +		return NULL;
> +
> +	return zone + zone_shift;
> +}
> +
> +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
> +				      unsigned long end_pfn)
> +{
> +	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
> +
> +	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
> +		pgdat->node_start_pfn = start_pfn;
> +
> +	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
> +					pgdat->node_start_pfn;
> +}
> +
> +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
> +{
> +	struct pglist_data *pgdat = zone->zone_pgdat;
> +	int nr_pages = PAGES_PER_SECTION;
> +	int nid = pgdat->node_id;
> +	int zone_type;
> +	unsigned long flags, pfn;
> +
> +	zone_type = zone - pgdat->node_zones;
> +	ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
> +
> +	pgdat_resize_lock(zone->zone_pgdat, &flags);
> +	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
> +	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
> +			phys_start_pfn + nr_pages);
> +	pgdat_resize_unlock(zone->zone_pgdat, &flags);
> +	memmap_init_zone(nr_pages, nid, zone_type,
> +			 phys_start_pfn, MEMMAP_HOTPLUG);
> +
> +	/* online_page_range is called later and expects pages reserved */
> +	for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
> +		if (!pfn_valid(pfn))
> +			continue;
> +
> +		SetPageReserved(pfn_to_page(pfn));
> +	}
> +	return 0;
> +}
> +
> +static int __meminit __add_section(int nid, struct zone *zone,
> +		unsigned long phys_start_pfn, bool want_memblock)
>  {
>  	int ret;
> -	int i;
>  
>  	if (pfn_valid(phys_start_pfn))
>  		return -EEXIST;
>  
> -	ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
> +	ret = sparse_add_one_section(zone, phys_start_pfn);
> +
>  	if (ret < 0)
>  		return ret;
>  
> -	/*
> -	 * Make all the pages reserved so that nobody will stumble over half
> -	 * initialized state.
> -	 * FIXME: We also have to associate it with a node because pfn_to_node
> -	 * relies on having page with the proper node.
> -	 */
> -	for (i = 0; i < PAGES_PER_SECTION; i++) {
> -		unsigned long pfn = phys_start_pfn + i;
> -		struct page *page;
> -		if (!pfn_valid(pfn))
> -			continue;
> +	ret = __add_zone(zone, phys_start_pfn);
>  
> -		page = pfn_to_page(pfn);
> -		set_page_node(page, nid);
> -		SetPageReserved(page);
> -	}
> +	if (ret < 0)
> +		return ret;
>  
>  	if (!want_memblock)
>  		return 0;
> @@ -285,7 +466,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
>   * call this function after deciding the zone to which to
>   * add the new pages.
>   */
> -int __ref __add_pages(int nid, unsigned long phys_start_pfn,
> +int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
>  			unsigned long nr_pages, bool want_memblock)
>  {
>  	unsigned long i;
> @@ -293,6 +474,8 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
>  	int start_sec, end_sec;
>  	struct vmem_altmap *altmap;
>  
> +	clear_zone_contiguous(zone);
> +
>  	/* during initialize mem_map, align hot-added range to section */
>  	start_sec = pfn_to_section_nr(phys_start_pfn);
>  	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
> @@ -312,7 +495,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
>  	}
>  
>  	for (i = start_sec; i <= end_sec; i++) {
> -		err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
> +		err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock);
>  
>  		/*
>  		 * EEXIST is finally dealt with by ioresource collision
> @@ -325,6 +508,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
>  	}
>  	vmemmap_populate_print_last();
>  out:
> +	set_zone_contiguous(zone);
>  	return err;
>  }
>  EXPORT_SYMBOL_GPL(__add_pages);
> @@ -699,6 +883,23 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
>  	return 0;
>  }
>  
> +#ifdef CONFIG_MOVABLE_NODE
> +/*
> + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
> + * normal memory.
> + */
> +static bool can_online_high_movable(int nid)
> +{
> +	return true;
> +}
> +#else /* CONFIG_MOVABLE_NODE */
> +/* ensure every online node has NORMAL memory */
> +static bool can_online_high_movable(int nid)
> +{
> +	return node_state(nid, N_NORMAL_MEMORY);
> +}
> +#endif /* CONFIG_MOVABLE_NODE */
> +
>  /* check which state of node_states will be changed when online memory */
>  static void node_states_check_changes_online(unsigned long nr_pages,
>  	struct zone *zone, struct memory_notify *arg)
> @@ -773,144 +974,39 @@ static void node_states_set_node(int node, struct memory_notify *arg)
>  	node_set_state(node, N_MEMORY);
>  }
>  
> -bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
> +bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
> +		   enum zone_type target, int *zone_shift)
>  {
> -	struct pglist_data *pgdat = NODE_DATA(nid);
> -	struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
> -	struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
> -
> -	/*
> -	 * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
> -	 * physically before ZONE_MOVABLE. All we need is they do not
> -	 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
> -	 * though so let's stick with it for simplicity for now.
> -	 * TODO make sure we do not overlap with ZONE_DEVICE
> -	 */
> -	if (online_type == MMOP_ONLINE_KERNEL) {
> -		if (zone_is_empty(movable_zone))
> -			return true;
> -		return movable_zone->zone_start_pfn >= pfn + nr_pages;
> -	} else if (online_type == MMOP_ONLINE_MOVABLE) {
> -		return zone_end_pfn(default_zone) <= pfn;
> -	}
> -
> -	/* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
> -	return online_type == MMOP_ONLINE_KEEP;
> -}
> -
> -static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
> -		unsigned long nr_pages)
> -{
> -	unsigned long old_end_pfn = zone_end_pfn(zone);
> -
> -	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
> -		zone->zone_start_pfn = start_pfn;
> -
> -	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
> -}
> -
> -static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
> -                                     unsigned long nr_pages)
> -{
> -	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
> -
> -	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
> -		pgdat->node_start_pfn = start_pfn;
> -
> -	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
> -}
> -
> -void __ref move_pfn_range_to_zone(struct zone *zone,
> -		unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	struct pglist_data *pgdat = zone->zone_pgdat;
> -	int nid = pgdat->node_id;
> -	unsigned long flags;
> -
> -	if (zone_is_empty(zone))
> -		init_currently_empty_zone(zone, start_pfn, nr_pages);
> -
> -	clear_zone_contiguous(zone);
> -
> -	/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
> -	pgdat_resize_lock(pgdat, &flags);
> -	zone_span_writelock(zone);
> -	resize_zone_range(zone, start_pfn, nr_pages);
> -	zone_span_writeunlock(zone);
> -	resize_pgdat_range(pgdat, start_pfn, nr_pages);
> -	pgdat_resize_unlock(pgdat, &flags);
> -
> -	/*
> -	 * TODO now we have a visible range of pages which are not associated
> -	 * with their zone properly. Not nice but set_pfnblock_flags_mask
> -	 * expects the zone spans the pfn range. All the pages in the range
> -	 * are reserved so nobody should be touching them so we should be safe
> -	 */
> -	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
> -
> -	set_zone_contiguous(zone);
> -}
> +	struct zone *zone = page_zone(pfn_to_page(pfn));
> +	enum zone_type idx = zone_idx(zone);
> +	int i;
>  
> -/*
> - * Returns a default kernel memory zone for the given pfn range.
> - * If no kernel zone covers this pfn range it will automatically go
> - * to the ZONE_NORMAL.
> - */
> -struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
> -		unsigned long nr_pages)
> -{
> -	struct pglist_data *pgdat = NODE_DATA(nid);
> -	int zid;
> +	*zone_shift = 0;
>  
> -	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
> -		struct zone *zone = &pgdat->node_zones[zid];
> +	if (idx < target) {
> +		/* pages must be at end of current zone */
> +		if (pfn + nr_pages != zone_end_pfn(zone))
> +			return false;
>  
> -		if (zone_intersects(zone, start_pfn, nr_pages))
> -			return zone;
> +		/* no zones in use between current zone and target */
> +		for (i = idx + 1; i < target; i++)
> +			if (zone_is_initialized(zone - idx + i))
> +				return false;
>  	}
>  
> -	return &pgdat->node_zones[ZONE_NORMAL];
> -}
> -
> -static inline bool movable_pfn_range(int nid, struct zone *default_zone,
> -		unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
> -				MMOP_ONLINE_KERNEL))
> -		return true;
> -
> -	if (!movable_node_is_enabled())
> -		return false;
> -
> -	return !zone_intersects(default_zone, start_pfn, nr_pages);
> -}
> -
> -/*
> - * Associates the given pfn range with the given node and the zone appropriate
> - * for the given online type.
> - */
> -static struct zone * __meminit move_pfn_range(int online_type, int nid,
> -		unsigned long start_pfn, unsigned long nr_pages)
> -{
> -	struct pglist_data *pgdat = NODE_DATA(nid);
> -	struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
> +	if (target < idx) {
> +		/* pages must be at beginning of current zone */
> +		if (pfn != zone->zone_start_pfn)
> +			return false;
>  
> -	if (online_type == MMOP_ONLINE_KEEP) {
> -		struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
> -		/*
> -		 * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
> -		 * movable zone if that is not possible (e.g. we are within
> -		 * or past the existing movable zone). movable_node overrides
> -		 * this default and defaults to movable zone
> -		 */
> -		if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
> -			zone = movable_zone;
> -	} else if (online_type == MMOP_ONLINE_MOVABLE) {
> -		zone = &pgdat->node_zones[ZONE_MOVABLE];
> +		/* no zones in use between current zone and target */
> +		for (i = target + 1; i < idx; i++)
> +			if (zone_is_initialized(zone - idx + i))
> +				return false;
>  	}
>  
> -	move_pfn_range_to_zone(zone, start_pfn, nr_pages);
> -	return zone;
> +	*zone_shift = target - idx;
> +	return true;
>  }
>  
>  /* Must be protected by mem_hotplug_begin() */
> @@ -923,18 +1019,38 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
>  	int nid;
>  	int ret;
>  	struct memory_notify arg;
> +	int zone_shift = 0;
>  
> -	nid = pfn_to_nid(pfn);
> -	if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
> +	/*
> +	 * This doesn't need a lock to do pfn_to_page().
> +	 * The section can't be removed here because of the
> +	 * memory_block->state_mutex.
> +	 */
> +	zone = page_zone(pfn_to_page(pfn));
> +
> +	if ((zone_idx(zone) > ZONE_NORMAL ||
> +	    online_type == MMOP_ONLINE_MOVABLE) &&
> +	    !can_online_high_movable(pfn_to_nid(pfn)))
>  		return -EINVAL;
>  
> -	/* associate pfn range with the zone */
> -	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
> +	if (online_type == MMOP_ONLINE_KERNEL) {
> +		if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
> +			return -EINVAL;
> +	} else if (online_type == MMOP_ONLINE_MOVABLE) {
> +		if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
> +			return -EINVAL;
> +	}
> +
> +	zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
> +	if (!zone)
> +		return -EINVAL;
>  
>  	arg.start_pfn = pfn;
>  	arg.nr_pages = nr_pages;
>  	node_states_check_changes_online(nr_pages, zone, &arg);
>  
> +	nid = zone_to_nid(zone);
> +
>  	ret = memory_notify(MEM_GOING_ONLINE, &arg);
>  	ret = notifier_to_errno(ret);
>  	if (ret)
> @@ -1129,6 +1245,39 @@ static int check_hotplug_memory_range(u64 start, u64 size)
>  	return 0;
>  }
>  
> +/*
> + * If movable zone has already been setup, newly added memory should be check.
> + * If its address is higher than movable zone, it should be added as movable.
> + * Without this check, movable zone may overlap with other zone.
> + */
> +static int should_add_memory_movable(int nid, u64 start, u64 size)
> +{
> +	unsigned long start_pfn = start >> PAGE_SHIFT;
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +	struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;
> +
> +	if (zone_is_empty(movable_zone))
> +		return 0;
> +
> +	if (movable_zone->zone_start_pfn <= start_pfn)
> +		return 1;
> +
> +	return 0;
> +}
> +
> +int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
> +		bool for_device)
> +{
> +#ifdef CONFIG_ZONE_DEVICE
> +	if (for_device)
> +		return ZONE_DEVICE;
> +#endif
> +	if (should_add_memory_movable(nid, start, size))
> +		return ZONE_MOVABLE;
> +
> +	return zone_default;
> +}
> +
>  static int online_memory_block(struct memory_block *mem, void *arg)
>  {
>  	return device_online(&mem->dev);
> diff --git a/mm/sparse.c b/mm/sparse.c
> index 9c48e4f..293cb5d 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -776,9 +776,10 @@ static void free_map_bootmem(struct page *memmap)
>   * set.  If this is <=0, then that means that the passed-in
>   * map was not consumed and must be freed.
>   */
> -int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn)
> +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
>  {
>  	unsigned long section_nr = pfn_to_section_nr(start_pfn);
> +	struct pglist_data *pgdat = zone->zone_pgdat;
>  	struct mem_section *ms;
>  	struct page *memmap;
>  	unsigned long *usemap;

I don't really get this code, but it's a revert and has been thoroughly tested and discussed, so:

Acked-by: Khalid Elmously <khalid.elmously at canonical.com>
