diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2ccccbfcd532..36f4695aa604 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -503,6 +503,10 @@ struct mm_struct { * PROT_NONE or PROT_NUMA mapped page. */ bool tlb_flush_pending; +#endif +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + /* See flush_tlb_batched_pending() */ + bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_X86_INTEL_MPX diff --git a/mm/internal.h b/mm/internal.h index 6979b2bd3227..f63f4393d633 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -453,6 +453,7 @@ struct tlbflush_unmap_batch; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); +void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { @@ -460,6 +461,8 @@ static inline void try_to_unmap_flush(void) static inline void try_to_unmap_flush_dirty(void) { } - +static inline void flush_tlb_batched_pending(struct mm_struct *mm) +{ +} #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index e6fa13484447..9ac55172aa7b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1127,6 +1127,7 @@ again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte = start_pte; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; diff --git a/mm/mprotect.c b/mm/mprotect.c index ef5be8eaab00..c0b4b2a49462 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -72,6 +72,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!pte) return 0; + flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { oldpte = *pte; diff --git a/mm/mremap.c b/mm/mremap.c index c25bc6268e46..fe7b7f65f4f4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -135,6 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, diff --git a/mm/rmap.c b/mm/rmap.c index b577fbb98d4b..ede183c32f45 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -648,6 +648,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); tlb_ubc->flush_required = true; + /* + * Ensure compiler does not re-order the setting of tlb_flush_batched + * before the PTE is cleared. + */ + barrier(); + mm->tlb_flush_batched = true; + /* * If the PTE was dirty then it's best to assume it's writable. The * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() @@ -675,6 +682,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) return should_defer; } + +/* + * Reclaim unmaps pages under the PTL but do not flush the TLB prior to + * releasing the PTL if TLB flushes are batched. It's possible for a parallel + * operation such as mprotect or munmap to race between reclaim unmapping + * the page and flushing the page. If this race occurs, it potentially allows + * access to data via a stale TLB entry. Tracking all mm's that have TLB + * batching in flight would be expensive during reclaim so instead track + * whether TLB batching occurred in the past and if so then do a flush here + * if required. This will cost one additional flush per reclaim cycle paid + * by the first operation at risk such as mprotect and mumap. + * + * This must be called under the PTL so that an access to tlb_flush_batched + * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise + * via the PTL. + */ +void flush_tlb_batched_pending(struct mm_struct *mm) +{ + if (mm->tlb_flush_batched) { + flush_tlb_mm(mm); + + /* + * Do not allow the compiler to re-order the clearing of + * tlb_flush_batched before the tlb is flushed. + */ + barrier(); + mm->tlb_flush_batched = false; + } +} #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, struct page *page, bool writable)