ACK: [PATCH][SRU][ARTFUL] Revert "mm, memory_hotplug: do not associate hotadded memory to zones until online"
Khaled Elmously
khalid.elmously at canonical.com
Fri Feb 9 17:14:32 UTC 2018
On 2018-02-09 10:08:54 , Colin King wrote:
> From: Colin Ian King <colin.king at canonical.com>
>
> BugLink: http://bugs.launchpad.net/bugs/1747069
>
> Hotplug removal causes i386 crashes when exercised with the kernel
> selftest mem-on-off-test script. A fix exists in 4.15; however, it
> requires a set of changes that is far too large to be SRU'able, so
> the least risky way forward is to revert the offending commit.
>
> This fix reverts commit f1dd2cd13c4b ("mm, memory_hotplug: do not
> associate hotadded memory to zones until online"), however the
> revert required some manual fix-ups because of subsequent fixes after
> this commit.
>
> Note that running the mem-on-off-test script is not always sufficient
> to trigger the bug. A good reproducer is to run in a 4 CPU VM with 2GB
> of memory and, after running the script, run sync and then re-install
> the kernel packages to trip the issue. This has been thoroughly
> tested on i386 and amd64 and given a solid soak test using the
> ADT tests too.
>
> Signed-off-by: Colin Ian King <colin.king at canonical.com>
> ---
> arch/ia64/mm/init.c | 11 +-
> arch/powerpc/mm/mem.c | 12 +-
> arch/s390/mm/init.c | 32 ++-
> arch/sh/mm/init.c | 10 +-
> arch/x86/mm/init_32.c | 7 +-
> arch/x86/mm/init_64.c | 11 +-
> drivers/base/memory.c | 52 +++--
> include/linux/memory_hotplug.h | 19 +-
> include/linux/mmzone.h | 16 --
> kernel/memremap.c | 6 +-
> mm/memory_hotplug.c | 457 +++++++++++++++++++++++++++--------------
> mm/sparse.c | 3 +-
> 12 files changed, 409 insertions(+), 227 deletions(-)
>
> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
> index a4e8d6b..39e2aeb 100644
> --- a/arch/ia64/mm/init.c
> +++ b/arch/ia64/mm/init.c
> @@ -646,13 +646,20 @@ mem_init (void)
> }
>
> #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
> {
> + pg_data_t *pgdat;
> + struct zone *zone;
> unsigned long start_pfn = start >> PAGE_SHIFT;
> unsigned long nr_pages = size >> PAGE_SHIFT;
> int ret;
>
> - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
> + pgdat = NODE_DATA(nid);
> +
> + zone = pgdat->node_zones +
> + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
> + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
> +
> if (ret)
> printk("%s: Problem encountered in __add_pages() as ret=%d\n",
> __func__, ret);
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 46b4e67..331a78b 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -127,14 +127,18 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
> return -ENODEV;
> }
>
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
> {
> + struct pglist_data *pgdata;
> + struct zone *zone;
> unsigned long start_pfn = start >> PAGE_SHIFT;
> unsigned long nr_pages = size >> PAGE_SHIFT;
> int rc;
>
> resize_hpt_for_hotplug(memblock_phys_mem_size());
>
> + pgdata = NODE_DATA(nid);
> +
> start = (unsigned long)__va(start);
> rc = create_section_mapping(start, start + size);
> if (rc) {
> @@ -144,7 +148,11 @@ int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> return -EFAULT;
> }
>
> - return __add_pages(nid, start_pfn, nr_pages, want_memblock);
> + /* this should work for most non-highmem platforms */
> + zone = pgdata->node_zones +
> + zone_for_memory(nid, start, size, 0, for_device);
> +
> + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
> }
>
> #ifdef CONFIG_MEMORY_HOTREMOVE
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index 8111694..a3d5499 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -166,17 +166,43 @@ unsigned long memory_block_size_bytes(void)
> }
>
> #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
> {
> + unsigned long zone_start_pfn, zone_end_pfn, nr_pages;
> unsigned long start_pfn = PFN_DOWN(start);
> unsigned long size_pages = PFN_DOWN(size);
> - int rc;
> + pg_data_t *pgdat = NODE_DATA(nid);
> + struct zone *zone;
> + int rc, i;
>
> rc = vmem_add_mapping(start, size);
> if (rc)
> return rc;
>
> - rc = __add_pages(nid, start_pfn, size_pages, want_memblock);
> + for (i = 0; i < MAX_NR_ZONES; i++) {
> + zone = pgdat->node_zones + i;
> + if (zone_idx(zone) != ZONE_MOVABLE) {
> + /* Add range within existing zone limits, if possible */
> + zone_start_pfn = zone->zone_start_pfn;
> + zone_end_pfn = zone->zone_start_pfn +
> + zone->spanned_pages;
> + } else {
> + /* Add remaining range to ZONE_MOVABLE */
> + zone_start_pfn = start_pfn;
> + zone_end_pfn = start_pfn + size_pages;
> + }
> + if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn)
> + continue;
> + nr_pages = (start_pfn + size_pages > zone_end_pfn) ?
> + zone_end_pfn - start_pfn : size_pages;
> + rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
> + if (rc)
> + break;
> + start_pfn += nr_pages;
> + size_pages -= nr_pages;
> + if (!size_pages)
> + break;
> + }
> if (rc)
> vmem_remove_mapping(start, size);
> return rc;
> diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
> index bf726af..a9d57f7 100644
> --- a/arch/sh/mm/init.c
> +++ b/arch/sh/mm/init.c
> @@ -485,14 +485,20 @@ void free_initrd_mem(unsigned long start, unsigned long end)
> #endif
>
> #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
> {
> + pg_data_t *pgdat;
> unsigned long start_pfn = PFN_DOWN(start);
> unsigned long nr_pages = size >> PAGE_SHIFT;
> int ret;
>
> + pgdat = NODE_DATA(nid);
> +
> /* We only have ZONE_NORMAL, so this is easy.. */
> - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
> + ret = __add_pages(nid, pgdat->node_zones +
> + zone_for_memory(nid, start, size, ZONE_NORMAL,
> + for_device),
> + start_pfn, nr_pages, !for_device);
> if (unlikely(ret))
> printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
>
> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
> index 135c9a7..e7bae21 100644
> --- a/arch/x86/mm/init_32.c
> +++ b/arch/x86/mm/init_32.c
> @@ -829,12 +829,15 @@ void __init mem_init(void)
> }
>
> #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
> {
> + struct pglist_data *pgdata = NODE_DATA(nid);
> + struct zone *zone = pgdata->node_zones +
> + zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
> unsigned long start_pfn = start >> PAGE_SHIFT;
> unsigned long nr_pages = size >> PAGE_SHIFT;
>
> - return __add_pages(nid, start_pfn, nr_pages, want_memblock);
> + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
> }
>
> #ifdef CONFIG_MEMORY_HOTREMOVE
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 902983c..456b452 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -772,15 +772,22 @@ static void update_end_of_memory_vars(u64 start, u64 size)
> }
> }
>
> -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
> +/*
> + * Memory is added always to NORMAL zone. This means you will never get
> + * additional DMA/DMA32 memory.
> + */
> +int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
> {
> + struct pglist_data *pgdat = NODE_DATA(nid);
> + struct zone *zone = pgdat->node_zones +
> + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
> unsigned long start_pfn = start >> PAGE_SHIFT;
> unsigned long nr_pages = size >> PAGE_SHIFT;
> int ret;
>
> init_memory_mapping(start, start + size);
>
> - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
> + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
> WARN_ON_ONCE(ret);
>
> /* update max_pfn, max_low_pfn and high_memory */
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index c7c4e03..1e884d8 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -392,43 +392,39 @@ static ssize_t show_valid_zones(struct device *dev,
> struct device_attribute *attr, char *buf)
> {
> struct memory_block *mem = to_memory_block(dev);
> - unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
> + unsigned long start_pfn, end_pfn;
> + unsigned long valid_start, valid_end, valid_pages;
> unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
> - unsigned long valid_start_pfn, valid_end_pfn;
> - bool append = false;
> - int nid;
> + struct zone *zone;
> + int zone_shift = 0;
>
> - /*
> - * The block contains more than one zone can not be offlined.
> - * This can happen e.g. for ZONE_DMA and ZONE_DMA32
> - */
> - if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn))
> + start_pfn = section_nr_to_pfn(mem->start_section_nr);
> + end_pfn = start_pfn + nr_pages;
> +
> + /* The block contains more than one zone can not be offlined. */
> + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
> return sprintf(buf, "none\n");
>
> - start_pfn = valid_start_pfn;
> - nr_pages = valid_end_pfn - start_pfn;
> + zone = page_zone(pfn_to_page(valid_start));
> + valid_pages = valid_end - valid_start;
>
> - /*
> - * Check the existing zone. Make sure that we do that only on the
> - * online nodes otherwise the page_zone is not reliable
> - */
> - if (mem->state == MEM_ONLINE) {
> - strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
> - goto out;
> - }
> + /* MMOP_ONLINE_KEEP */
> + sprintf(buf, "%s", zone->name);
>
> - nid = pfn_to_nid(start_pfn);
> - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) {
> - strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name);
> - append = true;
> + /* MMOP_ONLINE_KERNEL */
> + zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift);
> + if (zone_shift) {
> + strcat(buf, " ");
> + strcat(buf, (zone + zone_shift)->name);
> }
>
> - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) {
> - if (append)
> - strcat(buf, " ");
> - strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name);
> + /* MMOP_ONLINE_MOVABLE */
> + zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift);
> + if (zone_shift) {
> + strcat(buf, " ");
> + strcat(buf, (zone + zone_shift)->name);
> }
> -out:
> +
> strcat(buf, "\n");
>
> return strlen(buf);
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index c8a5056..b18a1b8 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -129,8 +129,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
> unsigned long nr_pages);
> #endif /* CONFIG_MEMORY_HOTREMOVE */
>
> -/* reasonably generic interface to expand the physical pages */
> -extern int __add_pages(int nid, unsigned long start_pfn,
> +/* reasonably generic interface to expand the physical pages in a zone */
> +extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn,
> unsigned long nr_pages, bool want_memblock);
>
> #ifdef CONFIG_NUMA
> @@ -306,19 +306,18 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
> void *arg, int (*func)(struct memory_block *, void *));
> extern int add_memory(int nid, u64 start, u64 size);
> extern int add_memory_resource(int nid, struct resource *resource, bool online);
> -extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock);
> -extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
> - unsigned long nr_pages);
> +extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
> + bool for_device);
> +extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
> extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
> extern bool is_memblock_offlined(struct memory_block *mem);
> extern void remove_memory(int nid, u64 start, u64 size);
> -extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn);
> +extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn);
> extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
> unsigned long map_offset);
> extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
> unsigned long pnum);
> -extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
> - int online_type);
> -extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn,
> - unsigned long nr_pages);
> +extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
> + enum zone_type target, int *zone_shift);
> +
> #endif /* __LINUX_MEMORY_HOTPLUG_H */
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 9c6c001..ceb86e9 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -533,22 +533,6 @@ static inline bool zone_is_empty(struct zone *zone)
> }
>
> /*
> - * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
> - * intersection with the given zone
> - */
> -static inline bool zone_intersects(struct zone *zone,
> - unsigned long start_pfn, unsigned long nr_pages)
> -{
> - if (zone_is_empty(zone))
> - return false;
> - if (start_pfn >= zone_end_pfn(zone) ||
> - start_pfn + nr_pages <= zone->zone_start_pfn)
> - return false;
> -
> - return true;
> -}
> -
> -/*
> * The "priority" of VM scanning is how much of the queues we will scan in one
> * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
> * queues ("queue_length >> 12") during an aging round.
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index 124bed7..23a6483 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -358,11 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
> goto err_pfn_remap;
>
> mem_hotplug_begin();
> - error = arch_add_memory(nid, align_start, align_size, false);
> - if (!error)
> - move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
> - align_start >> PAGE_SHIFT,
> - align_size >> PAGE_SHIFT);
> + error = arch_add_memory(nid, align_start, align_size, true);
> mem_hotplug_done();
> if (error)
> goto err_add_memory;
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 8dccc31..a4724e7 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -243,35 +243,216 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
> }
> #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
>
> -static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
> - bool want_memblock)
> +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
> + unsigned long end_pfn)
> +{
> + unsigned long old_zone_end_pfn;
> +
> + zone_span_writelock(zone);
> +
> + old_zone_end_pfn = zone_end_pfn(zone);
> + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
> + zone->zone_start_pfn = start_pfn;
> +
> + zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
> + zone->zone_start_pfn;
> +
> + zone_span_writeunlock(zone);
> +}
> +
> +static void resize_zone(struct zone *zone, unsigned long start_pfn,
> + unsigned long end_pfn)
> +{
> + zone_span_writelock(zone);
> +
> + if (end_pfn - start_pfn) {
> + zone->zone_start_pfn = start_pfn;
> + zone->spanned_pages = end_pfn - start_pfn;
> + } else {
> + /*
> + * make it consist as free_area_init_core(),
> + * if spanned_pages = 0, then keep start_pfn = 0
> + */
> + zone->zone_start_pfn = 0;
> + zone->spanned_pages = 0;
> + }
> +
> + zone_span_writeunlock(zone);
> +}
> +
> +static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
> + unsigned long end_pfn)
> +{
> + enum zone_type zid = zone_idx(zone);
> + int nid = zone->zone_pgdat->node_id;
> + unsigned long pfn;
> +
> + for (pfn = start_pfn; pfn < end_pfn; pfn++)
> + set_page_links(pfn_to_page(pfn), zid, nid, pfn);
> +}
> +
> +static void __ref ensure_zone_is_initialized(struct zone *zone,
> + unsigned long start_pfn, unsigned long num_pages)
> +{
> + if (!zone_is_initialized(zone))
> + init_currently_empty_zone(zone, start_pfn, num_pages);
> +}
> +
> +static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
> + unsigned long start_pfn, unsigned long end_pfn)
> +{
> + unsigned long flags;
> + unsigned long z1_start_pfn;
> +
> + ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
> +
> + pgdat_resize_lock(z1->zone_pgdat, &flags);
> +
> + /* can't move pfns which are higher than @z2 */
> + if (end_pfn > zone_end_pfn(z2))
> + goto out_fail;
> + /* the move out part must be at the left most of @z2 */
> + if (start_pfn > z2->zone_start_pfn)
> + goto out_fail;
> + /* must included/overlap */
> + if (end_pfn <= z2->zone_start_pfn)
> + goto out_fail;
> +
> + /* use start_pfn for z1's start_pfn if z1 is empty */
> + if (!zone_is_empty(z1))
> + z1_start_pfn = z1->zone_start_pfn;
> + else
> + z1_start_pfn = start_pfn;
> +
> + resize_zone(z1, z1_start_pfn, end_pfn);
> + resize_zone(z2, end_pfn, zone_end_pfn(z2));
> +
> + pgdat_resize_unlock(z1->zone_pgdat, &flags);
> +
> + fix_zone_id(z1, start_pfn, end_pfn);
> +
> + return 0;
> +out_fail:
> + pgdat_resize_unlock(z1->zone_pgdat, &flags);
> + return -1;
> +}
> +
> +static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
> + unsigned long start_pfn, unsigned long end_pfn)
> +{
> + unsigned long flags;
> + unsigned long z2_end_pfn;
> +
> + ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
> +
> + pgdat_resize_lock(z1->zone_pgdat, &flags);
> +
> + /* can't move pfns which are lower than @z1 */
> + if (z1->zone_start_pfn > start_pfn)
> + goto out_fail;
> + /* the move out part mast at the right most of @z1 */
> + if (zone_end_pfn(z1) > end_pfn)
> + goto out_fail;
> + /* must included/overlap */
> + if (start_pfn >= zone_end_pfn(z1))
> + goto out_fail;
> +
> + /* use end_pfn for z2's end_pfn if z2 is empty */
> + if (!zone_is_empty(z2))
> + z2_end_pfn = zone_end_pfn(z2);
> + else
> + z2_end_pfn = end_pfn;
> +
> + resize_zone(z1, z1->zone_start_pfn, start_pfn);
> + resize_zone(z2, start_pfn, z2_end_pfn);
> +
> + pgdat_resize_unlock(z1->zone_pgdat, &flags);
> +
> + fix_zone_id(z2, start_pfn, end_pfn);
> +
> + return 0;
> +out_fail:
> + pgdat_resize_unlock(z1->zone_pgdat, &flags);
> + return -1;
> +}
> +
> +static struct zone * __meminit move_pfn_range(int zone_shift,
> + unsigned long start_pfn, unsigned long end_pfn)
> +{
> + struct zone *zone = page_zone(pfn_to_page(start_pfn));
> + int ret = 0;
> +
> + if (zone_shift < 0)
> + ret = move_pfn_range_left(zone + zone_shift, zone,
> + start_pfn, end_pfn);
> + else if (zone_shift)
> + ret = move_pfn_range_right(zone, zone + zone_shift,
> + start_pfn, end_pfn);
> +
> + if (ret)
> + return NULL;
> +
> + return zone + zone_shift;
> +}
> +
> +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
> + unsigned long end_pfn)
> +{
> + unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
> +
> + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
> + pgdat->node_start_pfn = start_pfn;
> +
> + pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
> + pgdat->node_start_pfn;
> +}
> +
> +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
> +{
> + struct pglist_data *pgdat = zone->zone_pgdat;
> + int nr_pages = PAGES_PER_SECTION;
> + int nid = pgdat->node_id;
> + int zone_type;
> + unsigned long flags, pfn;
> +
> + zone_type = zone - pgdat->node_zones;
> + ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
> +
> + pgdat_resize_lock(zone->zone_pgdat, &flags);
> + grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
> + grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
> + phys_start_pfn + nr_pages);
> + pgdat_resize_unlock(zone->zone_pgdat, &flags);
> + memmap_init_zone(nr_pages, nid, zone_type,
> + phys_start_pfn, MEMMAP_HOTPLUG);
> +
> + /* online_page_range is called later and expects pages reserved */
> + for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
> + if (!pfn_valid(pfn))
> + continue;
> +
> + SetPageReserved(pfn_to_page(pfn));
> + }
> + return 0;
> +}
> +
> +static int __meminit __add_section(int nid, struct zone *zone,
> + unsigned long phys_start_pfn, bool want_memblock)
> {
> int ret;
> - int i;
>
> if (pfn_valid(phys_start_pfn))
> return -EEXIST;
>
> - ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
> + ret = sparse_add_one_section(zone, phys_start_pfn);
> +
> if (ret < 0)
> return ret;
>
> - /*
> - * Make all the pages reserved so that nobody will stumble over half
> - * initialized state.
> - * FIXME: We also have to associate it with a node because pfn_to_node
> - * relies on having page with the proper node.
> - */
> - for (i = 0; i < PAGES_PER_SECTION; i++) {
> - unsigned long pfn = phys_start_pfn + i;
> - struct page *page;
> - if (!pfn_valid(pfn))
> - continue;
> + ret = __add_zone(zone, phys_start_pfn);
>
> - page = pfn_to_page(pfn);
> - set_page_node(page, nid);
> - SetPageReserved(page);
> - }
> + if (ret < 0)
> + return ret;
>
> if (!want_memblock)
> return 0;
> @@ -285,7 +466,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
> * call this function after deciding the zone to which to
> * add the new pages.
> */
> -int __ref __add_pages(int nid, unsigned long phys_start_pfn,
> +int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
> unsigned long nr_pages, bool want_memblock)
> {
> unsigned long i;
> @@ -293,6 +474,8 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
> int start_sec, end_sec;
> struct vmem_altmap *altmap;
>
> + clear_zone_contiguous(zone);
> +
> /* during initialize mem_map, align hot-added range to section */
> start_sec = pfn_to_section_nr(phys_start_pfn);
> end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
> @@ -312,7 +495,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
> }
>
> for (i = start_sec; i <= end_sec; i++) {
> - err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
> + err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock);
>
> /*
> * EEXIST is finally dealt with by ioresource collision
> @@ -325,6 +508,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
> }
> vmemmap_populate_print_last();
> out:
> + set_zone_contiguous(zone);
> return err;
> }
> EXPORT_SYMBOL_GPL(__add_pages);
> @@ -699,6 +883,23 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
> return 0;
> }
>
> +#ifdef CONFIG_MOVABLE_NODE
> +/*
> + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
> + * normal memory.
> + */
> +static bool can_online_high_movable(int nid)
> +{
> + return true;
> +}
> +#else /* CONFIG_MOVABLE_NODE */
> +/* ensure every online node has NORMAL memory */
> +static bool can_online_high_movable(int nid)
> +{
> + return node_state(nid, N_NORMAL_MEMORY);
> +}
> +#endif /* CONFIG_MOVABLE_NODE */
> +
> /* check which state of node_states will be changed when online memory */
> static void node_states_check_changes_online(unsigned long nr_pages,
> struct zone *zone, struct memory_notify *arg)
> @@ -773,144 +974,39 @@ static void node_states_set_node(int node, struct memory_notify *arg)
> node_set_state(node, N_MEMORY);
> }
>
> -bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
> +bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
> + enum zone_type target, int *zone_shift)
> {
> - struct pglist_data *pgdat = NODE_DATA(nid);
> - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
> - struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
> -
> - /*
> - * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
> - * physically before ZONE_MOVABLE. All we need is they do not
> - * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
> - * though so let's stick with it for simplicity for now.
> - * TODO make sure we do not overlap with ZONE_DEVICE
> - */
> - if (online_type == MMOP_ONLINE_KERNEL) {
> - if (zone_is_empty(movable_zone))
> - return true;
> - return movable_zone->zone_start_pfn >= pfn + nr_pages;
> - } else if (online_type == MMOP_ONLINE_MOVABLE) {
> - return zone_end_pfn(default_zone) <= pfn;
> - }
> -
> - /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
> - return online_type == MMOP_ONLINE_KEEP;
> -}
> -
> -static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
> - unsigned long nr_pages)
> -{
> - unsigned long old_end_pfn = zone_end_pfn(zone);
> -
> - if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
> - zone->zone_start_pfn = start_pfn;
> -
> - zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
> -}
> -
> -static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
> - unsigned long nr_pages)
> -{
> - unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
> -
> - if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
> - pgdat->node_start_pfn = start_pfn;
> -
> - pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
> -}
> -
> -void __ref move_pfn_range_to_zone(struct zone *zone,
> - unsigned long start_pfn, unsigned long nr_pages)
> -{
> - struct pglist_data *pgdat = zone->zone_pgdat;
> - int nid = pgdat->node_id;
> - unsigned long flags;
> -
> - if (zone_is_empty(zone))
> - init_currently_empty_zone(zone, start_pfn, nr_pages);
> -
> - clear_zone_contiguous(zone);
> -
> - /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
> - pgdat_resize_lock(pgdat, &flags);
> - zone_span_writelock(zone);
> - resize_zone_range(zone, start_pfn, nr_pages);
> - zone_span_writeunlock(zone);
> - resize_pgdat_range(pgdat, start_pfn, nr_pages);
> - pgdat_resize_unlock(pgdat, &flags);
> -
> - /*
> - * TODO now we have a visible range of pages which are not associated
> - * with their zone properly. Not nice but set_pfnblock_flags_mask
> - * expects the zone spans the pfn range. All the pages in the range
> - * are reserved so nobody should be touching them so we should be safe
> - */
> - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
> -
> - set_zone_contiguous(zone);
> -}
> + struct zone *zone = page_zone(pfn_to_page(pfn));
> + enum zone_type idx = zone_idx(zone);
> + int i;
>
> -/*
> - * Returns a default kernel memory zone for the given pfn range.
> - * If no kernel zone covers this pfn range it will automatically go
> - * to the ZONE_NORMAL.
> - */
> -struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
> - unsigned long nr_pages)
> -{
> - struct pglist_data *pgdat = NODE_DATA(nid);
> - int zid;
> + *zone_shift = 0;
>
> - for (zid = 0; zid <= ZONE_NORMAL; zid++) {
> - struct zone *zone = &pgdat->node_zones[zid];
> + if (idx < target) {
> + /* pages must be at end of current zone */
> + if (pfn + nr_pages != zone_end_pfn(zone))
> + return false;
>
> - if (zone_intersects(zone, start_pfn, nr_pages))
> - return zone;
> + /* no zones in use between current zone and target */
> + for (i = idx + 1; i < target; i++)
> + if (zone_is_initialized(zone - idx + i))
> + return false;
> }
>
> - return &pgdat->node_zones[ZONE_NORMAL];
> -}
> -
> -static inline bool movable_pfn_range(int nid, struct zone *default_zone,
> - unsigned long start_pfn, unsigned long nr_pages)
> -{
> - if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
> - MMOP_ONLINE_KERNEL))
> - return true;
> -
> - if (!movable_node_is_enabled())
> - return false;
> -
> - return !zone_intersects(default_zone, start_pfn, nr_pages);
> -}
> -
> -/*
> - * Associates the given pfn range with the given node and the zone appropriate
> - * for the given online type.
> - */
> -static struct zone * __meminit move_pfn_range(int online_type, int nid,
> - unsigned long start_pfn, unsigned long nr_pages)
> -{
> - struct pglist_data *pgdat = NODE_DATA(nid);
> - struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
> + if (target < idx) {
> + /* pages must be at beginning of current zone */
> + if (pfn != zone->zone_start_pfn)
> + return false;
>
> - if (online_type == MMOP_ONLINE_KEEP) {
> - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
> - /*
> - * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
> - * movable zone if that is not possible (e.g. we are within
> - * or past the existing movable zone). movable_node overrides
> - * this default and defaults to movable zone
> - */
> - if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
> - zone = movable_zone;
> - } else if (online_type == MMOP_ONLINE_MOVABLE) {
> - zone = &pgdat->node_zones[ZONE_MOVABLE];
> + /* no zones in use between current zone and target */
> + for (i = target + 1; i < idx; i++)
> + if (zone_is_initialized(zone - idx + i))
> + return false;
> }
>
> - move_pfn_range_to_zone(zone, start_pfn, nr_pages);
> - return zone;
> + *zone_shift = target - idx;
> + return true;
> }
>
> /* Must be protected by mem_hotplug_begin() */
> @@ -923,18 +1019,38 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
> int nid;
> int ret;
> struct memory_notify arg;
> + int zone_shift = 0;
>
> - nid = pfn_to_nid(pfn);
> - if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
> + /*
> + * This doesn't need a lock to do pfn_to_page().
> + * The section can't be removed here because of the
> + * memory_block->state_mutex.
> + */
> + zone = page_zone(pfn_to_page(pfn));
> +
> + if ((zone_idx(zone) > ZONE_NORMAL ||
> + online_type == MMOP_ONLINE_MOVABLE) &&
> + !can_online_high_movable(pfn_to_nid(pfn)))
> return -EINVAL;
>
> - /* associate pfn range with the zone */
> - zone = move_pfn_range(online_type, nid, pfn, nr_pages);
> + if (online_type == MMOP_ONLINE_KERNEL) {
> + if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
> + return -EINVAL;
> + } else if (online_type == MMOP_ONLINE_MOVABLE) {
> + if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
> + return -EINVAL;
> + }
> +
> + zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
> + if (!zone)
> + return -EINVAL;
>
> arg.start_pfn = pfn;
> arg.nr_pages = nr_pages;
> node_states_check_changes_online(nr_pages, zone, &arg);
>
> + nid = zone_to_nid(zone);
> +
> ret = memory_notify(MEM_GOING_ONLINE, &arg);
> ret = notifier_to_errno(ret);
> if (ret)
> @@ -1129,6 +1245,39 @@ static int check_hotplug_memory_range(u64 start, u64 size)
> return 0;
> }
>
> +/*
> + * If movable zone has already been setup, newly added memory should be check.
> + * If its address is higher than movable zone, it should be added as movable.
> + * Without this check, movable zone may overlap with other zone.
> + */
> +static int should_add_memory_movable(int nid, u64 start, u64 size)
> +{
> + unsigned long start_pfn = start >> PAGE_SHIFT;
> + pg_data_t *pgdat = NODE_DATA(nid);
> + struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;
> +
> + if (zone_is_empty(movable_zone))
> + return 0;
> +
> + if (movable_zone->zone_start_pfn <= start_pfn)
> + return 1;
> +
> + return 0;
> +}
> +
> +int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
> + bool for_device)
> +{
> +#ifdef CONFIG_ZONE_DEVICE
> + if (for_device)
> + return ZONE_DEVICE;
> +#endif
> + if (should_add_memory_movable(nid, start, size))
> + return ZONE_MOVABLE;
> +
> + return zone_default;
> +}
> +
> static int online_memory_block(struct memory_block *mem, void *arg)
> {
> return device_online(&mem->dev);
> diff --git a/mm/sparse.c b/mm/sparse.c
> index 9c48e4f..293cb5d 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -776,9 +776,10 @@ static void free_map_bootmem(struct page *memmap)
> * set. If this is <=0, then that means that the passed-in
> * map was not consumed and must be freed.
> */
> -int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn)
> +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
> {
> unsigned long section_nr = pfn_to_section_nr(start_pfn);
> + struct pglist_data *pgdat = zone->zone_pgdat;
> struct mem_section *ms;
> struct page *memmap;
> unsigned long *usemap;
I don't really get this code, but it's a revert and has been thoroughly tested and discussed, so:
Acked-by: Khalid Elmously <khalid.elmously at canonical.com>
More information about the kernel-team
mailing list