author      Linus Torvalds <torvalds@linux-foundation.org>   2022-08-05 16:32:45 -0700
committer   Linus Torvalds <torvalds@linux-foundation.org>   2022-08-05 16:32:45 -0700
commit      6614a3c3164a5df2b54abb0b3559f51041cf705b (patch)
tree        1c25c23d9efed988705287fc2ccb78e0e76e311d /mm/huge_memory.c
parent      74cae210a335d159f2eb822e261adee905b6951a (diff)
parent      360614c01f81f48a89d8b13f8fa69c3ae0a1f5c7 (diff)
download    linux-6614a3c3164a5df2b54abb0b3559f51041cf705b.tar.gz
Merge tag 'mm-stable-2022-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Most of the MM queue. A few things are still pending.
Liam's maple tree rework didn't make it. This has resulted in a few
other minor patch series being held over for next time.
Multi-gen LRU still isn't merged as we were waiting for mapletree to
stabilize. The current plan is to merge MGLRU into -mm soon and to
later reintroduce mapletree, with a view to hopefully getting both
into 6.1-rc1.
Summary:
- The usual batches of cleanups from Baoquan He, Muchun Song, Miaohe
Lin, Yang Shi, Anshuman Khandual and Mike Rapoport
- Some kmemleak fixes from Patrick Wang and Waiman Long
- DAMON updates from SeongJae Park
- memcg debug/visibility work from Roman Gushchin
- vmalloc speedup from Uladzislau Rezki
- more folio conversion work from Matthew Wilcox
- enhancements for coherent device memory mapping from Alex Sierra
- addition of shared pages tracking and CoW support for fsdax, from
Shiyang Ruan
- hugetlb optimizations from Mike Kravetz
- Mel Gorman has contributed some pagealloc changes to improve
latency and realtime behaviour.
- mprotect soft-dirty checking has been improved by Peter Xu
- Many other singleton patches all over the place"
[ XFS merge from hell as per Darrick Wong in
https://lore.kernel.org/all/YshKnxb4VwXycPO8@magnolia/ ]
* tag 'mm-stable-2022-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (282 commits)
tools/testing/selftests/vm/hmm-tests.c: fix build
mm: Kconfig: fix typo
mm: memory-failure: convert to pr_fmt()
mm: use is_zone_movable_page() helper
hugetlbfs: fix inaccurate comment in hugetlbfs_statfs()
hugetlbfs: cleanup some comments in inode.c
hugetlbfs: remove unneeded header file
hugetlbfs: remove unneeded hugetlbfs_ops forward declaration
hugetlbfs: use helper macro SZ_1{K,M}
mm: cleanup is_highmem()
mm/hmm: add a test for cross device private faults
selftests: add soft-dirty into run_vmtests.sh
selftests: soft-dirty: add test for mprotect
mm/mprotect: fix soft-dirty check in can_change_pte_writable()
mm: memcontrol: fix potential oom_lock recursion deadlock
mm/gup.c: fix formatting in check_and_migrate_movable_page()
xfs: fail dax mount if reflink is enabled on a partition
mm/memcontrol.c: remove the redundant updating of stats_flush_threshold
userfaultfd: don't fail on unrecognized features
hugetlb_cgroup: fix wrong hugetlb cgroup numa stat
...
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--   mm/huge_memory.c   186
1 file changed, 117 insertions(+), 69 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 15965084816d..8a7c1b344abe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -70,21 +70,85 @@ static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
-bool transparent_hugepage_active(struct vm_area_struct *vma)
+bool hugepage_vma_check(struct vm_area_struct *vma,
+                        unsigned long vm_flags,
+                        bool smaps, bool in_pf)
 {
-        /* The addr is used to check if the vma size fits */
-        unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+        if (!vma->vm_mm)                /* vdso */
+                return false;
+
+        /*
+         * Explicitly disabled through madvise or prctl, or some
+         * architectures may disable THP for some mappings, for
+         * example, s390 kvm.
+         * */
+        if ((vm_flags & VM_NOHUGEPAGE) ||
+            test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+                return false;
+        /*
+         * If the hardware/firmware marked hugepage support disabled.
+         */
+        if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+                return false;
 
-        if (!transhuge_vma_suitable(vma, addr))
+        /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
+        if (vma_is_dax(vma))
+                return in_pf;
+
+        /*
+         * Special VMA and hugetlb VMA.
+         * Must be checked after dax since some dax mappings may have
+         * VM_MIXEDMAP set.
+         */
+        if (vm_flags & VM_NO_KHUGEPAGED)
                 return false;
-        if (vma_is_anonymous(vma))
-                return __transparent_hugepage_enabled(vma);
-        if (vma_is_shmem(vma))
+
+        /*
+         * Check alignment for file vma and size for both file and anon vma.
+         *
+         * Skip the check for page fault. Huge fault does the check in fault
+         * handlers. And this check is not suitable for huge PUD fault.
+         */
+        if (!in_pf &&
+            !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
+                return false;
+
+        /*
+         * Enabled via shmem mount options or sysfs settings.
+         * Must be done before hugepage flags check since shmem has its
+         * own flags.
+         */
+        if (!in_pf && shmem_file(vma->vm_file))
                 return shmem_huge_enabled(vma);
-        if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma))
+
+        if (!hugepage_flags_enabled())
+                return false;
+
+        /* THP settings require madvise. */
+        if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
+                return false;
+
+        /* Only regular file is valid */
+        if (!in_pf && file_thp_enabled(vma))
                 return true;
 
-        return false;
+        if (!vma_is_anonymous(vma))
+                return false;
+
+        if (vma_is_temporary_stack(vma))
+                return false;
+
+        /*
+         * THPeligible bit of smaps should show 1 for proper VMAs even
+         * though anon_vma is not initialized yet.
+         *
+         * Allow page fault since anon_vma may be not initialized until
+         * the first page fault.
+         */
+        if (!vma->anon_vma)
+                return (smaps || in_pf);
+
+        return true;
 }
 
 static bool get_huge_zero_page(void)
@@ -213,8 +277,8 @@ static ssize_t enabled_store(struct kobject *kobj,
         }
         return ret;
 }
-static struct kobj_attribute enabled_attr =
-        __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
 
 ssize_t single_hugepage_flag_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf,
@@ -303,8 +367,7 @@ static ssize_t defrag_store(struct kobject *kobj,
 
         return count;
 }
-static struct kobj_attribute defrag_attr =
-        __ATTR(defrag, 0644, defrag_show, defrag_store);
+static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
 
 static ssize_t use_zero_page_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
@@ -318,8 +381,7 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
         return single_hugepage_flag_store(kobj, attr, buf, count,
                                  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 }
-static struct kobj_attribute use_zero_page_attr =
-        __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
+static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
 
 static ssize_t hpage_pmd_size_show(struct kobject *kobj,
                                    struct kobj_attribute *attr, char *buf)
@@ -424,10 +486,10 @@ static int __init hugepage_init(void)
         if (err)
                 goto err_slab;
 
-        err = register_shrinker(&huge_zero_page_shrinker);
+        err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
         if (err)
                 goto err_hzp_shrinker;
-        err = register_shrinker(&deferred_split_shrinker);
+        err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
         if (err)
                 goto err_split_shrinker;
 
@@ -520,7 +582,7 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page)
 void prep_transhuge_page(struct page *page)
 {
         /*
-         * we use page->mapping and page->indexlru in second tail page
+         * we use page->mapping and page->index in second tail page
          * as list_head: assuming THP order >= 2
          */
 
@@ -727,7 +789,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                 return VM_FAULT_FALLBACK;
         if (unlikely(anon_vma_prepare(vma)))
                 return VM_FAULT_OOM;
-        khugepaged_enter(vma, vma->vm_flags);
+        khugepaged_enter_vma(vma, vma->vm_flags);
 
         if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                         !mm_forbids_zeropage(vma->vm_mm) &&
@@ -957,15 +1019,15 @@ EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
-                pmd_t *pmd, int flags)
+                pmd_t *pmd, bool write)
 {
         pmd_t _pmd;
 
         _pmd = pmd_mkyoung(*pmd);
-        if (flags & FOLL_WRITE)
+        if (write)
                 _pmd = pmd_mkdirty(_pmd);
         if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-                                pmd, _pmd, flags & FOLL_WRITE))
+                                pmd, _pmd, write))
                 update_mmu_cache_pmd(vma, addr, pmd);
 }
 
@@ -998,7 +1060,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                 return NULL;
 
         if (flags & FOLL_TOUCH)
-                touch_pmd(vma, addr, pmd, flags);
+                touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
 
         /*
          * device mapped pages can only be returned if the
@@ -1121,15 +1183,15 @@ out:
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
-                pud_t *pud, int flags)
+                pud_t *pud, bool write)
 {
         pud_t _pud;
 
         _pud = pud_mkyoung(*pud);
-        if (flags & FOLL_WRITE)
+        if (write)
                 _pud = pud_mkdirty(_pud);
         if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
-                                pud, _pud, flags & FOLL_WRITE))
+                                pud, _pud, write))
                 update_mmu_cache_pud(vma, addr, pud);
 }
 
@@ -1156,7 +1218,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
                 return NULL;
 
         if (flags & FOLL_TOUCH)
-                touch_pud(vma, addr, pud, flags);
+                touch_pud(vma, addr, pud, flags & FOLL_WRITE);
 
         /*
          * device mapped pages can only be returned if the
@@ -1221,21 +1283,13 @@ out_unlock:
 
 void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 {
-        pud_t entry;
-        unsigned long haddr;
         bool write = vmf->flags & FAULT_FLAG_WRITE;
 
         vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
         if (unlikely(!pud_same(*vmf->pud, orig_pud)))
                 goto unlock;
 
-        entry = pud_mkyoung(orig_pud);
-        if (write)
-                entry = pud_mkdirty(entry);
-        haddr = vmf->address & HPAGE_PUD_MASK;
-        if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
-                update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
-
+        touch_pud(vmf->vma, vmf->address, vmf->pud, write);
 unlock:
         spin_unlock(vmf->ptl);
 }
@@ -1243,21 +1297,13 @@ unlock:
 
 void huge_pmd_set_accessed(struct vm_fault *vmf)
 {
-        pmd_t entry;
-        unsigned long haddr;
         bool write = vmf->flags & FAULT_FLAG_WRITE;
-        pmd_t orig_pmd = vmf->orig_pmd;
 
         vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
-        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+        if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
                 goto unlock;
 
-        entry = pmd_mkyoung(orig_pmd);
-        if (write)
-                entry = pmd_mkdirty(entry);
-        haddr = vmf->address & HPAGE_PMD_MASK;
-        if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
-                update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
+        touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
 
 unlock:
         spin_unlock(vmf->ptl);
@@ -1393,7 +1439,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                 return ERR_PTR(-ENOMEM);
 
         if (flags & FOLL_TOUCH)
-                touch_pmd(vma, addr, pmd, flags);
+                touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
 
         page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
         VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
@@ -1686,7 +1732,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                 pmd = move_soft_dirty_pmd(pmd);
                 set_pmd_at(mm, new_addr, new_pmd, pmd);
                 if (force_flush)
-                        flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+                        flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
                 if (new_ptl != old_ptl)
                         spin_unlock(new_ptl);
                 spin_unlock(old_ptl);
@@ -1843,10 +1889,10 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 }
 
 /*
- * Returns true if a given pud maps a thp, false otherwise.
+ * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
  *
- * Note that if it returns true, this routine returns without unlocking page
- * table lock. So callers must unlock it.
+ * Note that if it returns page table lock pointer, this routine returns without
+ * unlocking page table lock. So callers must unlock it.
  */
 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
 {
@@ -1868,12 +1914,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
         ptl = __pud_trans_huge_lock(pud, vma);
         if (!ptl)
                 return 0;
-        /*
-         * For architectures like ppc64 we look at deposited pgtable
-         * when calling pudp_huge_get_and_clear. So do the
-         * pgtable_trans_huge_withdraw after finishing pudp related
-         * operations.
-         */
+
         pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
         tlb_remove_pud_tlb_entry(tlb, pud, addr);
         if (vma_is_special_huge(vma)) {
@@ -1938,7 +1979,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
          * replacing a zero pmd write protected page with a zero pte write
          * protected page.
          *
-         * See Documentation/vm/mmu_notifier.rst
+         * See Documentation/mm/mmu_notifier.rst
          */
         pmdp_huge_clear_flush(vma, haddr, pmd);
 
@@ -2195,6 +2236,10 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
         if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
             is_pmd_migration_entry(*pmd)) {
+                /*
+                 * It's safe to call pmd_page when folio is set because it's
+                 * guaranteed that pmd is present.
+                 */
                 if (folio && folio != page_folio(pmd_page(*pmd)))
                         goto out;
                 __split_huge_pmd_locked(vma, pmd, range.start, freeze);
@@ -2502,7 +2547,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
                  * requires taking the lru_lock so we do the put_page
                  * of the tail pages after the split is complete.
                  */
-                put_page(subpage);
+                free_page_and_swap_cache(subpage);
         }
 }
 
@@ -2821,9 +2866,12 @@ static void split_huge_pages_all(void)
         unsigned long total = 0, split = 0;
 
         pr_debug("Split all THPs\n");
-        for_each_populated_zone(zone) {
+        for_each_zone(zone) {
+                if (!managed_zone(zone))
+                        continue;
                 max_zone_pfn = zone_end_pfn(zone);
                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+                        int nr_pages;
 
                         if (!pfn_valid(pfn))
                                 continue;
@@ -2839,8 +2887,10 @@ static void split_huge_pages_all(void)
 
                         total++;
                         lock_page(page);
+                        nr_pages = thp_nr_pages(page);
                         if (!split_huge_page(page))
                                 split++;
+                        pfn += nr_pages - 1;
                         unlock_page(page);
next:
                         put_page(page);
@@ -2898,10 +2948,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
          * table filled with PTE-mapped THPs, each of which is distinct.
          */
         for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
-                struct vm_area_struct *vma = find_vma(mm, addr);
+                struct vm_area_struct *vma = vma_lookup(mm, addr);
                 struct page *page;
 
-                if (!vma || addr < vma->vm_start)
+                if (!vma)
                         break;
 
                 /* skip special VMA and hugetlb VMA */
@@ -2913,9 +2963,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
                 /* FOLL_DUMP to ignore special (like zero) pages */
                 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
 
-                if (IS_ERR(page))
-                        continue;
-                if (!page)
+                if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
                         continue;
 
                 if (!is_transparent_hugepage(page))
@@ -3137,7 +3185,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
         struct vm_area_struct *vma = pvmw->vma;
         struct mm_struct *mm = vma->vm_mm;
         unsigned long address = pvmw->address;
-        unsigned long mmun_start = address & HPAGE_PMD_MASK;
+        unsigned long haddr = address & HPAGE_PMD_MASK;
         pmd_t pmde;
         swp_entry_t entry;
 
@@ -3146,7 +3194,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 
         entry = pmd_to_swp_entry(*pvmw->pmd);
         get_page(new);
-        pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
+        pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)));
         if (pmd_swp_soft_dirty(*pvmw->pmd))
                 pmde = pmd_mksoft_dirty(pmde);
         if (is_writable_migration_entry(entry))
@@ -3160,12 +3208,12 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
                 if (!is_readable_migration_entry(entry))
                         rmap_flags |= RMAP_EXCLUSIVE;
 
-                page_add_anon_rmap(new, vma, mmun_start, rmap_flags);
+                page_add_anon_rmap(new, vma, haddr, rmap_flags);
         } else {
                 page_add_file_rmap(new, vma, true);
         }
         VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
-        set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
+        set_pmd_at(mm, haddr, pvmw->pmd, pmde);
 
         /* No need to invalidate - it was non-present before */
         update_mmu_cache_pmd(vma, address, pvmw->pmd);
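
As a reading aid for the consolidated hugepage_vma_check() in the hunk above, the following stand-alone sketch models the order of its checks in plain userspace C. It is not kernel code and makes no claim about the actual implementation beyond what the diff shows: struct vma_model, its boolean fields and hugepage_vma_check_model() are invented for illustration, each field standing in for one of the VMA/flag tests the real function performs (explicit disables first, DAX only from the fault path, alignment skipped for faults, shmem policy before the global flags, and anon VMAs without an anon_vma reported eligible only to smaps or the fault path).

/* Illustrative userspace model only -- not the kernel implementation. */
#include <stdbool.h>
#include <stdio.h>

struct vma_model {
	bool has_mm;          /* false for vdso-like mappings */
	bool nohugepage;      /* VM_NOHUGEPAGE or MMF_DISABLE_THP set */
	bool thp_never_dax;   /* hardware/firmware disabled THP */
	bool is_dax;
	bool special;         /* VM_NO_KHUGEPAGED-style special/hugetlb VMA */
	bool suitable;        /* alignment/size check result */
	bool is_shmem;
	bool shmem_huge;      /* shmem mount/sysfs policy allows THP */
	bool flags_enabled;   /* global THP policy is "always" or "madvise" */
	bool madvised;        /* VM_HUGEPAGE set via madvise */
	bool flags_always;    /* global policy is "always" */
	bool file_thp;        /* regular file eligible for file THP */
	bool is_anon;
	bool temp_stack;
	bool has_anon_vma;
};

static bool hugepage_vma_check_model(const struct vma_model *v,
				     bool smaps, bool in_pf)
{
	if (!v->has_mm)
		return false;
	if (v->nohugepage || v->thp_never_dax)
		return false;
	if (v->is_dax)
		return in_pf;            /* khugepaged skips DAX, faults do not */
	if (v->special)
		return false;
	if (!in_pf && !v->suitable)
		return false;            /* fault handlers re-check alignment */
	if (!in_pf && v->is_shmem)
		return v->shmem_huge;    /* shmem has its own policy */
	if (!v->flags_enabled)
		return false;
	if (!v->madvised && !v->flags_always)
		return false;            /* "madvise" policy needs VM_HUGEPAGE */
	if (!in_pf && v->file_thp)
		return true;
	if (!v->is_anon || v->temp_stack)
		return false;
	if (!v->has_anon_vma)
		return smaps || in_pf;   /* anon_vma appears at first fault */
	return true;
}

int main(void)
{
	/* A fresh anonymous VMA under the "always" policy, before any fault. */
	struct vma_model anon = { .has_mm = true, .suitable = true,
				  .flags_enabled = true, .flags_always = true,
				  .is_anon = true };
	printf("page fault: %d, khugepaged: %d\n",
	       hugepage_vma_check_model(&anon, false, true),
	       hugepage_vma_check_model(&anon, false, false));
	return 0;
}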