[Acked] [PATCH Xenial SRU] s390/mm: fix asce_bits handling with dynamic pagetable levels
Andy Whitcroft
apw at canonical.com
Wed May 25 07:11:04 UTC 2016
On Mon, May 23, 2016 at 10:56:09AM -0600, Tim Gardner wrote:
> From: Gerald Schaefer <gerald.schaefer at de.ibm.com>
>
> BugLink: http://bugs.launchpad.net/bugs/1584827
>
> commit 723cacbd9dc79582e562c123a0bacf8bfc69e72a upstream.
>
> There is a race with multi-threaded applications between context switch and
> pagetable upgrade. In switch_mm() a new user_asce is built from mm->pgd and
> mm->context.asce_bits, w/o holding any locks. A concurrent mmap with a
> pagetable upgrade on another thread in crst_table_upgrade() could already
> have set new asce_bits, but not yet the new mm->pgd. This would result in a
> corrupt user_asce in switch_mm(), and eventually in a kernel panic from a
> translation exception.
>
> Fix this by storing the complete asce instead of just the asce_bits, which
> can then be read atomically from switch_mm(), so that it either sees the
> old value or the new value, but no mixture. Both cases are OK. Having the
> old value would result in a page fault on access to the higher level memory,
> but the fault handler would see the new mm->pgd, if it was a valid access
> after the mmap on the other thread has completed. So as a worst-case scenario
> we would have a page fault loop for the racing thread until the next time
> slice.
>
> Also remove dead code and simplify the upgrade/downgrade path, there are no
> upgrades from 2 levels, and only downgrades from 3 levels for compat tasks.
> There are also no concurrent upgrades, because the mmap_sem is held with
> down_write() in do_mmap, so the flush and table checks during upgrade can
> be removed.
>
> Reported-by: Michael Munday <munday at ca.ibm.com>
> Reviewed-by: Martin Schwidefsky <schwidefsky at de.ibm.com>
> Signed-off-by: Gerald Schaefer <gerald.schaefer at de.ibm.com>
> Signed-off-by: Martin Schwidefsky <schwidefsky at de.ibm.com>
> Signed-off-by: Greg Kroah-Hartman <gregkh at linuxfoundation.org>
>
> Signed-off-by: Tim Gardner <tim.gardner at canonical.com>
> ---
>
> This patch is coming in via v4.4.11, but I added a bug link so it will close out the Launchpad bug.
>
> arch/s390/include/asm/mmu.h | 2 +-
> arch/s390/include/asm/mmu_context.h | 28 +++++++++---
> arch/s390/include/asm/pgalloc.h | 4 +-
> arch/s390/include/asm/processor.h | 2 +-
> arch/s390/include/asm/tlbflush.h | 9 ++--
> arch/s390/mm/init.c | 3 +-
> arch/s390/mm/mmap.c | 6 +--
> arch/s390/mm/pgtable.c | 85 ++++++++++++-------------------------
> 8 files changed, 62 insertions(+), 77 deletions(-)
>
> diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
> index d29ad95..081b2ad 100644
> --- a/arch/s390/include/asm/mmu.h
> +++ b/arch/s390/include/asm/mmu.h
> @@ -11,7 +11,7 @@ typedef struct {
> spinlock_t list_lock;
> struct list_head pgtable_list;
> struct list_head gmap_list;
> - unsigned long asce_bits;
> + unsigned long asce;
> unsigned long asce_limit;
> unsigned long vdso_base;
> /* The mmu context allocates 4K page tables. */
> diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
> index e485817..22877c9 100644
> --- a/arch/s390/include/asm/mmu_context.h
> +++ b/arch/s390/include/asm/mmu_context.h
> @@ -26,12 +26,28 @@ static inline int init_new_context(struct task_struct *tsk,
> mm->context.has_pgste = 0;
> mm->context.use_skey = 0;
> #endif
> - if (mm->context.asce_limit == 0) {
> + switch (mm->context.asce_limit) {
> + case 1UL << 42:
> + /*
> + * forked 3-level task, fall through to set new asce with new
> + * mm->pgd
> + */
> + case 0:
> /* context created by exec, set asce limit to 4TB */
> - mm->context.asce_bits = _ASCE_TABLE_LENGTH |
> - _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
> mm->context.asce_limit = STACK_TOP_MAX;
> - } else if (mm->context.asce_limit == (1UL << 31)) {
> + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
> + _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
> + break;
> + case 1UL << 53:
> + /* forked 4-level task, set new asce with new mm->pgd */
> + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
> + _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
> + break;
> + case 1UL << 31:
> + /* forked 2-level compat task, set new asce with new mm->pgd */
> + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
> + _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
> + /* pgd_alloc() did not increase mm->nr_pmds */
> mm_inc_nr_pmds(mm);
> }
> crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
> @@ -42,7 +58,7 @@ static inline int init_new_context(struct task_struct *tsk,
>
> static inline void set_user_asce(struct mm_struct *mm)
> {
> - S390_lowcore.user_asce = mm->context.asce_bits | __pa(mm->pgd);
> + S390_lowcore.user_asce = mm->context.asce;
> if (current->thread.mm_segment.ar4)
> __ctl_load(S390_lowcore.user_asce, 7, 7);
> set_cpu_flag(CIF_ASCE);
> @@ -71,7 +87,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> {
> int cpu = smp_processor_id();
>
> - S390_lowcore.user_asce = next->context.asce_bits | __pa(next->pgd);
> + S390_lowcore.user_asce = next->context.asce;
> if (prev == next)
> return;
> if (MACHINE_HAS_TLB_LC)
> diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
> index d7cc79f..5991cdcb5 100644
> --- a/arch/s390/include/asm/pgalloc.h
> +++ b/arch/s390/include/asm/pgalloc.h
> @@ -56,8 +56,8 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm)
> return _REGION2_ENTRY_EMPTY;
> }
>
> -int crst_table_upgrade(struct mm_struct *, unsigned long limit);
> -void crst_table_downgrade(struct mm_struct *, unsigned long limit);
> +int crst_table_upgrade(struct mm_struct *);
> +void crst_table_downgrade(struct mm_struct *);
>
> static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
> {
> diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
> index b16c3d0..c1ea67d 100644
> --- a/arch/s390/include/asm/processor.h
> +++ b/arch/s390/include/asm/processor.h
> @@ -163,7 +163,7 @@ extern __vector128 init_task_fpu_regs[__NUM_VXRS];
> regs->psw.mask = PSW_USER_BITS | PSW_MASK_BA; \
> regs->psw.addr = new_psw | PSW_ADDR_AMODE; \
> regs->gprs[15] = new_stackp; \
> - crst_table_downgrade(current->mm, 1UL << 31); \
> + crst_table_downgrade(current->mm); \
> execve_tail(); \
> } while (0)
>
> diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h
> index ca148f7..a2e6ef3 100644
> --- a/arch/s390/include/asm/tlbflush.h
> +++ b/arch/s390/include/asm/tlbflush.h
> @@ -110,8 +110,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
> static inline void __tlb_flush_kernel(void)
> {
> if (MACHINE_HAS_IDTE)
> - __tlb_flush_idte((unsigned long) init_mm.pgd |
> - init_mm.context.asce_bits);
> + __tlb_flush_idte(init_mm.context.asce);
> else
> __tlb_flush_global();
> }
> @@ -133,8 +132,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
> static inline void __tlb_flush_kernel(void)
> {
> if (MACHINE_HAS_TLB_LC)
> - __tlb_flush_idte_local((unsigned long) init_mm.pgd |
> - init_mm.context.asce_bits);
> + __tlb_flush_idte_local(init_mm.context.asce);
> else
> __tlb_flush_local();
> }
> @@ -148,8 +146,7 @@ static inline void __tlb_flush_mm(struct mm_struct * mm)
> * only ran on the local cpu.
> */
> if (MACHINE_HAS_IDTE && list_empty(&mm->context.gmap_list))
> - __tlb_flush_asce(mm, (unsigned long) mm->pgd |
> - mm->context.asce_bits);
> + __tlb_flush_asce(mm, mm->context.asce);
> else
> __tlb_flush_full(mm);
> }
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index c722400..feff9ca 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -89,7 +89,8 @@ void __init paging_init(void)
> asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
> pgd_type = _REGION3_ENTRY_EMPTY;
> }
> - S390_lowcore.kernel_asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
> + init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
> + S390_lowcore.kernel_asce = init_mm.context.asce;
> clear_table((unsigned long *) init_mm.pgd, pgd_type,
> sizeof(unsigned long)*2048);
> vmem_map_init();
> diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
> index ea01477..f2b6b1d 100644
> --- a/arch/s390/mm/mmap.c
> +++ b/arch/s390/mm/mmap.c
> @@ -174,7 +174,7 @@ int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
> if (!(flags & MAP_FIXED))
> addr = 0;
> if ((addr + len) >= TASK_SIZE)
> - return crst_table_upgrade(current->mm, 1UL << 53);
> + return crst_table_upgrade(current->mm);
> return 0;
> }
>
> @@ -191,7 +191,7 @@ s390_get_unmapped_area(struct file *filp, unsigned long addr,
> return area;
> if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) {
> /* Upgrade the page table to 4 levels and retry. */
> - rc = crst_table_upgrade(mm, 1UL << 53);
> + rc = crst_table_upgrade(mm);
> if (rc)
> return (unsigned long) rc;
> area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
> @@ -213,7 +213,7 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
> return area;
> if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < (1UL << 53)) {
> /* Upgrade the page table to 4 levels and retry. */
> - rc = crst_table_upgrade(mm, 1UL << 53);
> + rc = crst_table_upgrade(mm);
> if (rc)
> return (unsigned long) rc;
> area = arch_get_unmapped_area_topdown(filp, addr, len,
> diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
> index 54ef3bc..471a370 100644
> --- a/arch/s390/mm/pgtable.c
> +++ b/arch/s390/mm/pgtable.c
> @@ -49,81 +49,52 @@ static void __crst_table_upgrade(void *arg)
> __tlb_flush_local();
> }
>
> -int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
> +int crst_table_upgrade(struct mm_struct *mm)
> {
> unsigned long *table, *pgd;
> - unsigned long entry;
> - int flush;
>
> - BUG_ON(limit > (1UL << 53));
> - flush = 0;
> -repeat:
> + /* upgrade should only happen from 3 to 4 levels */
> + BUG_ON(mm->context.asce_limit != (1UL << 42));
> +
> table = crst_table_alloc(mm);
> if (!table)
> return -ENOMEM;
> +
> spin_lock_bh(&mm->page_table_lock);
> - if (mm->context.asce_limit < limit) {
> - pgd = (unsigned long *) mm->pgd;
> - if (mm->context.asce_limit <= (1UL << 31)) {
> - entry = _REGION3_ENTRY_EMPTY;
> - mm->context.asce_limit = 1UL << 42;
> - mm->context.asce_bits = _ASCE_TABLE_LENGTH |
> - _ASCE_USER_BITS |
> - _ASCE_TYPE_REGION3;
> - } else {
> - entry = _REGION2_ENTRY_EMPTY;
> - mm->context.asce_limit = 1UL << 53;
> - mm->context.asce_bits = _ASCE_TABLE_LENGTH |
> - _ASCE_USER_BITS |
> - _ASCE_TYPE_REGION2;
> - }
> - crst_table_init(table, entry);
> - pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
> - mm->pgd = (pgd_t *) table;
> - mm->task_size = mm->context.asce_limit;
> - table = NULL;
> - flush = 1;
> - }
> + pgd = (unsigned long *) mm->pgd;
> + crst_table_init(table, _REGION2_ENTRY_EMPTY);
> + pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
> + mm->pgd = (pgd_t *) table;
> + mm->context.asce_limit = 1UL << 53;
> + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
> + _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
> + mm->task_size = mm->context.asce_limit;
> spin_unlock_bh(&mm->page_table_lock);
> - if (table)
> - crst_table_free(mm, table);
> - if (mm->context.asce_limit < limit)
> - goto repeat;
> - if (flush)
> - on_each_cpu(__crst_table_upgrade, mm, 0);
> +
> + on_each_cpu(__crst_table_upgrade, mm, 0);
> return 0;
> }
>
> -void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
> +void crst_table_downgrade(struct mm_struct *mm)
> {
> pgd_t *pgd;
>
> + /* downgrade should only happen from 3 to 2 levels (compat only) */
> + BUG_ON(mm->context.asce_limit != (1UL << 42));
> +
> if (current->active_mm == mm) {
> clear_user_asce();
> __tlb_flush_mm(mm);
> }
> - while (mm->context.asce_limit > limit) {
> - pgd = mm->pgd;
> - switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
> - case _REGION_ENTRY_TYPE_R2:
> - mm->context.asce_limit = 1UL << 42;
> - mm->context.asce_bits = _ASCE_TABLE_LENGTH |
> - _ASCE_USER_BITS |
> - _ASCE_TYPE_REGION3;
> - break;
> - case _REGION_ENTRY_TYPE_R3:
> - mm->context.asce_limit = 1UL << 31;
> - mm->context.asce_bits = _ASCE_TABLE_LENGTH |
> - _ASCE_USER_BITS |
> - _ASCE_TYPE_SEGMENT;
> - break;
> - default:
> - BUG();
> - }
> - mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
> - mm->task_size = mm->context.asce_limit;
> - crst_table_free(mm, (unsigned long *) pgd);
> - }
> +
> + pgd = mm->pgd;
> + mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
> + mm->context.asce_limit = 1UL << 31;
> + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
> + _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
> + mm->task_size = mm->context.asce_limit;
> + crst_table_free(mm, (unsigned long *) pgd);
> +
> if (current->active_mm == mm)
> set_user_asce(mm);
> }
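
The key change is that switch_mm() now loads mm->context.asce as a single
word instead of combining mm->context.asce_bits with __pa(mm->pgd). As a
minimal sketch of the torn read this avoids (simplified, hypothetical types
and made-up values, not the kernel structures):

#include <stdint.h>
#include <stdio.h>

struct old_ctx {
	uint64_t pgd;		/* updated second by the upgrade path */
	uint64_t asce_bits;	/* updated first by the upgrade path  */
};

struct new_ctx {
	uint64_t asce;		/* pgd and bits combined in one store */
};

/* Old scheme: two loads, so a concurrent upgrade can be seen half-done. */
static uint64_t old_user_asce(const struct old_ctx *c)
{
	return c->asce_bits | c->pgd;	/* may mix new bits with old pgd */
}

/* New scheme: one load, so the reader sees either the old or new asce. */
static uint64_t new_user_asce(const struct new_ctx *c)
{
	return c->asce;
}

int main(void)
{
	/* Illustrative values only. */
	struct old_ctx o = { .pgd = 0x2000, .asce_bits = 0x0b };
	struct new_ctx n = { .asce = 0x2000 | 0x0b };

	/* Simulate the window where asce_bits is new but pgd is still old. */
	o.asce_bits = 0x0f;			/* upgrade step 1 */
	printf("old scheme, mid-upgrade: %#llx (corrupt mix)\n",
	       (unsigned long long)old_user_asce(&o));
	o.pgd = 0x3000;				/* upgrade step 2 */

	printf("new scheme: %#llx\n", (unsigned long long)new_user_asce(&n));
	return 0;
}
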
This is already in stable, so I guess so:
Acked-by: Andy Whitcroft <apw at canonical.com>
-apw