From 04a42e72d77a93a166b79c34b7bc862f55a53967 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Dec 2022 22:17:57 -0800 Subject: mm: move folio_set_compound_order() to mm/internal.h folio_set_compound_order() is moved to an mm-internal location so external folio users cannot misuse this function. Change the name of the function to folio_set_order() and use WARN_ON_ONCE() rather than BUG_ON. Also, handle the case if a non-large folio is passed and add clarifying comments to the function. Link: https://lore.kernel.org/lkml/20221207223731.32784-1-sidhartha.kumar@oracle.com/T/ Link: https://lkml.kernel.org/r/20221215061757.223440-1-sidhartha.kumar@oracle.com Fixes: 9fd330582b2f ("mm: add folio dtor and order setter functions") Signed-off-by: Sidhartha Kumar Suggested-by: Mike Kravetz Suggested-by: Muchun Song Suggested-by: Matthew Wilcox Suggested-by: John Hubbard Reviewed-by: John Hubbard Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/internal.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index bcf75a8b032d..1d6f4e168510 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -378,6 +378,25 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, int split_free_page(struct page *free_page, unsigned int order, unsigned long split_pfn_offset); +/* + * This will have no effect, other than possibly generating a warning, if the + * caller passes in a non-large folio. + */ +static inline void folio_set_order(struct folio *folio, unsigned int order) +{ + if (WARN_ON_ONCE(!folio_test_large(folio))) + return; + + folio->_folio_order = order; +#ifdef CONFIG_64BIT + /* + * When hugetlb dissolves a folio, we need to clear the tail + * page, rather than setting nr_pages to 1. + */ + folio->_folio_nr_pages = order ? 1U << order : 0; +#endif +} + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* -- cgit v1.2.1 From eec20426d48bd7b63c69969a793943ed1a99b731 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:48 +0000 Subject: mm: convert head_subpages_mapcount() into folio_nr_pages_mapped() Calling this 'mapcount' is confusing since mapcount is usually the number of times something is mapped; instead this is the number of mapped pages. It's also better to enforce that this is a folio rather than a head page. Move folio_nr_pages_mapped() into mm/internal.h since this is not something we want device drivers or filesystems poking at. Get rid of folio_subpages_mapcount_ptr() and use folio->_nr_pages_mapped directly. Link: https://lkml.kernel.org/r/20230111142915.1001531-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/internal.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 1d6f4e168510..583e15357e09 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -52,6 +52,24 @@ struct folio_batch; void page_writeback_init(void); +/* + * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, + * its nr_pages_mapped would be 0x400000: choose the COMPOUND_MAPPED bit + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently + * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. + */ +#define COMPOUND_MAPPED 0x800000 +#define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1) + +/* + * How many individual pages have an elevated _mapcount. Excludes + * the folio's entire_mapcount. 
+ */ +static inline int folio_nr_pages_mapped(struct folio *folio) +{ + return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; +} + static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; -- cgit v1.2.1 From 96f97c438f61ddba94117dcd1a1eb0aaafa22309 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:31 +0000 Subject: mm: mlock: update the interface to use folios Update the mlock interface to accept folios rather than pages, bringing the interface in line with the internal implementation. munlock_vma_page() still requires a page_folio() conversion, however this is consistent with the existent mlock_vma_page() implementation and a product of rmap still dealing in pages rather than folios. Link: https://lkml.kernel.org/r/cba12777c5544305014bc0cbec56bb4cc71477d8.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/internal.h | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 583e15357e09..973b48e8b1af 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -533,10 +533,9 @@ extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * - * mlock is usually called at the end of page_add_*_rmap(), - * munlock at the end of page_remove_rmap(); but new anon - * pages are managed by lru_cache_add_inactive_or_unevictable() - * calling mlock_new_page(). + * mlock is usually called at the end of page_add_*_rmap(), munlock at + * the end of page_remove_rmap(); but new anon folios are managed by + * folio_add_lru_vma() calling mlock_new_folio(). 
* * @compound is used to include pmd mappings of THPs, but filter out * pte mappings of THPs, which cannot be consistently counted: a pte @@ -565,18 +564,25 @@ static inline void mlock_vma_page(struct page *page, mlock_vma_folio(page_folio(page), vma, compound); } -void munlock_page(struct page *page); -static inline void munlock_vma_page(struct page *page, +void munlock_folio(struct folio *folio); + +static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma, bool compound) { if (unlikely(vma->vm_flags & VM_LOCKED) && - (compound || !PageTransCompound(page))) - munlock_page(page); + (compound || !folio_test_large(folio))) + munlock_folio(folio); +} + +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + munlock_vma_folio(page_folio(page), vma, compound); } -void mlock_new_page(struct page *page); -bool need_mlock_page_drain(int cpu); -void mlock_page_drain_local(void); -void mlock_page_drain_remote(int cpu); +void mlock_new_folio(struct folio *folio); +bool need_mlock_drain(int cpu); +void mlock_drain_local(void); +void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -665,10 +671,10 @@ static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } -static inline void mlock_new_page(struct page *page) { } -static inline bool need_mlock_page_drain(int cpu) { return false; } -static inline void mlock_page_drain_local(void) { } -static inline void mlock_page_drain_remote(int cpu) { } +static inline void mlock_new_folio(struct folio *folio) { } +static inline bool need_mlock_drain(int cpu) { return false; } +static inline void mlock_drain_local(void) { } +static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } -- cgit v1.2.1 From 524c48072e5673f4511f1ad81493e2485863fd65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:12 +0000 Subject: mm/page_alloc: rename ALLOC_HIGH to ALLOC_MIN_RESERVE Patch series "Discard __GFP_ATOMIC", v3. Neil's patch has been residing in mm-unstable as commit 2fafb4fe8f7a ("mm: discard __GFP_ATOMIC") for a long time and recently brought up again. Most recently, I was worried that __GFP_HIGH allocations could use high-order atomic reserves which is unintentional but there was no response so lets revisit -- this series reworks how min reserves are used, protects highorder reserves and then finishes with Neil's patch with very minor modifications so it fits on top. There was a review discussion on renaming __GFP_DIRECT_RECLAIM to __GFP_ALLOW_BLOCKING but I didn't think it was that big an issue and is orthogonal to the removal of __GFP_ATOMIC. There were some concerns about how the gfp flags affect the min reserves but it never reached a solid conclusion so I made my own attempt. The series tries to iron out some of the details on how reserves are used. ALLOC_HIGH becomes ALLOC_MIN_RESERVE and ALLOC_HARDER becomes ALLOC_NON_BLOCK and documents how the reserves are affected. For example, ALLOC_NON_BLOCK (no direct reclaim) on its own allows 25% of the min reserve. ALLOC_MIN_RESERVE (__GFP_HIGH) allows 50% and both combined allows deeper access again. ALLOC_OOM allows access to 75%. 
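As a rough picture of those access levels, the effect can be modelled with a small sketch. This is only an illustration of the documented percentages, not the real __zone_watermark_ok() code, and sketch_scaled_min() is a hypothetical name:

static unsigned long sketch_scaled_min(unsigned long min, unsigned int alloc_flags)
{
	if (alloc_flags & ALLOC_OOM) {
		/* an OOM victim may use up to 75% of the min reserve */
		min -= 3 * min / 4;
	} else {
		if (alloc_flags & ALLOC_MIN_RESERVE)	/* __GFP_HIGH: 50% */
			min -= min / 2;
		if (alloc_flags & ALLOC_NON_BLOCK)	/* cannot block: a further 25% of what is left */
			min -= min / 4;
	}
	return min;
}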
High-order atomic allocations are explicitly handled with the caveat that no __GFP_ATOMIC flag means that any high-order allocation that specifies GFP_HIGH and cannot enter direct reclaim will be treated as if it was GFP_ATOMIC. This patch (of 6): __GFP_HIGH aliases to ALLOC_HIGH but the name does not really hint what it means. As ALLOC_HIGH is internal to the allocator, rename it to ALLOC_MIN_RESERVE to document that the min reserves can be depleted. Link: https://lkml.kernel.org/r/20230113111217.14134-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20230113111217.14134-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 973b48e8b1af..99eb544fbded 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -779,7 +779,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #endif #define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% + * of the min watermark. + */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 -- cgit v1.2.1 From eb2e2b425c6984ca8034448a3f2c680622bd3d4d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:14 +0000 Subject: mm/page_alloc: explicitly record high-order atomic allocations in alloc_flags A high-order ALLOC_HARDER allocation is assumed to be atomic. While that is accurate, it changes later in the series. In preparation, explicitly record high-order atomic allocations in gfp_to_alloc_flags(). Link: https://lkml.kernel.org/r/20230113111217.14134-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 99eb544fbded..62428467d7cf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -789,6 +789,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #else #define ALLOC_NOFRAGMENT 0x0 #endif +#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ enum ttu_flags; -- cgit v1.2.1 From ab3508854353793cd35e348fde89a5c09b2fd8b5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:15 +0000 Subject: mm/page_alloc: explicitly define what alloc flags deplete min reserves As there are more ALLOC_ flags that affect reserves, define what flags affect reserves and clarify the effect of each flag. 
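As a hedged sketch of the intent (not the literal allocator code), gathering the reserve-affecting flags into one ALLOC_RESERVES mask lets the fast path skip all below-watermark handling with a single test; sketch_watermark_ok() and sketch_scaled_min() are hypothetical names:

static bool sketch_watermark_ok(unsigned long free_pages, unsigned long min,
				unsigned int alloc_flags)
{
	if (unlikely(alloc_flags & ALLOC_RESERVES))
		min = sketch_scaled_min(min, alloc_flags);	/* per-flag adjustments */

	return free_pages > min;
}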
Link: https://lkml.kernel.org/r/20230113111217.14134-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 62428467d7cf..f7c3bc80c475 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -792,6 +792,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ +/* Flags that allow allocations below the min watermark. */ +#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) + enum ttu_flags; struct tlbflush_unmap_batch; -- cgit v1.2.1 From 1ebbb21811b76c3b932959787f37985af36f62fa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:16 +0000 Subject: mm/page_alloc: explicitly define how __GFP_HIGH non-blocking allocations accesses reserves GFP_ATOMIC allocations get flagged ALLOC_HARDER which is a vague description. In preparation for the removal of GFP_ATOMIC redefine __GFP_ATOMIC to simply mean non-blocking and renaming ALLOC_HARDER to ALLOC_NON_BLOCK accordingly. __GFP_HIGH is required for access to reserves but non-blocking is granted more access. For example, GFP_NOWAIT is non-blocking but has no special access to reserves. A __GFP_NOFAIL blocking allocation is granted access similar to __GFP_HIGH if the only alternative is an OOM kill. Link: https://lkml.kernel.org/r/20230113111217.14134-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index f7c3bc80c475..b0b88a95347f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -778,7 +778,10 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif -#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access + * to 25% of the min watermark or + * 62.5% if __GFP_HIGH is set. + */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. */ @@ -793,7 +796,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ -#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) +#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; -- cgit v1.2.1 From 2973d8229b78d3f148e0c45916a1e8b237dc6167 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 13 Jan 2023 11:12:17 +0000 Subject: mm: discard __GFP_ATOMIC __GFP_ATOMIC serves little purpose. Its main effect is to set ALLOC_HARDER which adds a few little boosts to increase the chance of an allocation succeeding, one of which is to lower the water-mark at which it will succeed. It is *always* paired with __GFP_HIGH which sets ALLOC_HIGH which also adjusts this watermark. 
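For context from the wider series (this hunk is outside the mm/internal.h changes shown here, so treat the exact text as approximate rather than a quote), the GFP_ATOMIC definition in include/linux/gfp_types.h simply drops the flag while keeping __GFP_HIGH:

/*
 * Before:	#define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
 * After __GFP_ATOMIC is discarded, __GFP_HIGH plus the kswapd wakeup is enough:
 */
#define GFP_ATOMIC	(__GFP_HIGH|__GFP_KSWAPD_RECLAIM)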
It is probable that other users of __GFP_HIGH should benefit from the other little bonuses that __GFP_ATOMIC gets. __GFP_ATOMIC also gives a warning if used with __GFP_DIRECT_RECLAIM. There is little point to this. We already get a might_sleep() warning if __GFP_DIRECT_RECLAIM is set. __GFP_ATOMIC allows the "watermark_boost" to be side-stepped. It is probable that testing ALLOC_HARDER is a better fit here. __GFP_ATOMIC is used by tegra-smmu.c to check if the allocation might sleep. This should test __GFP_DIRECT_RECLAIM instead. This patch: - removes __GFP_ATOMIC - allows __GFP_HIGH allocations to ignore watermark boosting as well as GFP_ATOMIC requests. - makes other adjustments as suggested by the above. The net result is not change to GFP_ATOMIC allocations. Other allocations that use __GFP_HIGH will benefit from a few different extra privileges. This affects: xen, dm, md, ntfs3 the vermillion frame buffer hibernation ksm swap all of which likely produce more benefit than cost if these selected allocation are more likely to succeed quickly. [mgorman: Minor adjustments to rework on top of a series] Link: https://lkml.kernel.org/r/163712397076.13692.4727608274002939094@noble.neil.brown.name Link: https://lkml.kernel.org/r/20230113111217.14134-7-mgorman@techsingularity.net Signed-off-by: NeilBrown Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index b0b88a95347f..2d09a7a0600a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) -- cgit v1.2.1 From 90c9d13a47d45f2f16530c4d62af2fa4d74dfd16 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:24 +0000 Subject: mm: remove page_evictable() Patch series "Remove leftover mlock/munlock page wrappers". We no longer need the various mlock page functions as all callers have folios. This patch (of 4): This function now has no users. Also update the unevictable-lru documentation to discuss folios instead of pages (mostly). 
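The replacement pattern for any remaining caller is the folio helper that stays behind; sketch_page_evictable() below is a hypothetical wrapper used for illustration, not something added by this patch:

static bool sketch_page_evictable(struct page *page)
{
	/* convert once, then use the folio_evictable() helper kept in mm/internal.h */
	return folio_evictable(page_folio(page));
}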
[akpm@linux-foundation.org: fix Documentation/mm/unevictable-lru.rst underlining] Link: https://lkml.kernel.org/r/20230117145106.585b277b@canb.auug.org.au Link: https://lkml.kernel.org/r/20230116192827.2146732-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230116192827.2146732-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/internal.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 2d09a7a0600a..74bc1fe45711 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -159,17 +159,6 @@ static inline bool folio_evictable(struct folio *folio) return ret; } -static inline bool page_evictable(struct page *page) -{ - bool ret; - - /* Prevent address_space of inode and swap cache from being freed */ - rcu_read_lock(); - ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); - rcu_read_unlock(); - return ret; -} - /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. -- cgit v1.2.1 From 7efecffb8e7968c4a6c53177b0053ca4765fe233 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:25 +0000 Subject: mm: remove mlock_vma_page() All callers now have a folio and can call mlock_vma_folio(). Update the documentation to refer to mlock_vma_folio(). Link: https://lkml.kernel.org/r/20230116192827.2146732-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/internal.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 74bc1fe45711..0b74105ea363 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -518,7 +518,7 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); /* - * mlock_vma_page() and munlock_vma_page(): + * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * @@ -547,12 +547,6 @@ static inline void mlock_vma_folio(struct folio *folio, mlock_folio(folio); } -static inline void mlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - mlock_vma_folio(page_folio(page), vma, compound); -} - void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, @@ -656,8 +650,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void mlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void mlock_new_folio(struct folio *folio) { } -- cgit v1.2.1 From 672aa27d0bd241759376e62b78abb8aae1792479 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Jan 2023 19:28:26 +0000 Subject: mm: remove munlock_vma_page() All callers now have a folio and can call munlock_vma_folio(). Update the documentation to refer to munlock_vma_folio(). 
Link: https://lkml.kernel.org/r/20230116192827.2146732-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/internal.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 0b74105ea363..ce462bf145b4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -548,7 +548,6 @@ static inline void mlock_vma_folio(struct folio *folio, } void munlock_folio(struct folio *folio); - static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma, bool compound) { @@ -557,11 +556,6 @@ static inline void munlock_vma_folio(struct folio *folio, munlock_folio(folio); } -static inline void munlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - munlock_vma_folio(page_folio(page), vma, compound); -} void mlock_new_folio(struct folio *folio); bool need_mlock_drain(int cpu); void mlock_drain_local(void); @@ -650,8 +644,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void munlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) { } static inline void mlock_new_folio(struct folio *folio) { } static inline bool need_mlock_drain(int cpu) { return false; } static inline void mlock_drain_local(void) { } -- cgit v1.2.1 From 48731c8436c68ce5597dfe72f3836bd6808bedde Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Jan 2023 13:44:31 +0000 Subject: mm, compaction: rename compact_control->rescan to finish_pageblock Patch series "Fix excessive CPU usage during compaction". Commit 7efc3b726103 ("mm/compaction: fix set skip in fast_find_migrateblock") fixed a problem where pageblocks found by fast_find_migrateblock() were ignored. Unfortunately there were numerous bug reports complaining about high CPU usage and massive stalls once 6.1 was released. Due to the severity, the patch was reverted by Vlastimil as a short-term fix[1] to -stable. The underlying problem for each of the bugs is suspected to be the repeated scanning of the same pageblocks. This series should guarantee forward progress even with commit 7efc3b726103. More information is in the changelog for patch 4. [1] http://lore.kernel.org/r/20230113173345.9692-1-vbabka@suse.cz This patch (of 4): The rescan field was not well named albeit accurate at the time. Rename the field to finish_pageblock to indicate that the remainder of the pageblock should be scanned regardless of COMPACT_CLUSTER_MAX. The intent is that pageblocks with transient failures get marked for skipping to avoid revisiting the same pageblock. Link: https://lkml.kernel.org/r/20230125134434.18017-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Chuyi Zhou Cc: Jiri Slaby Cc: Maxim Levitsky Cc: Michal Hocko Cc: Paolo Bonzini Cc: Pedro Falcato Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index ce462bf145b4..2d1b9fa8083e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -448,7 +448,11 @@ struct compact_control { bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ - bool rescan; /* Rescanning the same pageblock */ + bool finish_pageblock; /* Scan the remainder of a pageblock. 
Used + * when there are potentially transient + * isolation or migration failures to + * ensure forward progress. + */ bool alloc_contig; /* alloc_contig_range allocation */ }; -- cgit v1.2.1 From b62b633e048bbddef90b2e55d2e33823187b425f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:08 -0500 Subject: mm: expand vma iterator interface Add wrappers for the maple tree to the vma iterator. This will provide type safety at compile time. Link: https://lkml.kernel.org/r/20230120162650.984577-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 2d1b9fa8083e..ffd65248f266 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -877,4 +877,68 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) return !(vma->vm_flags & VM_SOFTDIRTY); } +/* + * VMA Iterator functions shared between nommu and mmap + */ +static inline int vma_iter_prealloc(struct vma_iterator *vmi) +{ + return mas_preallocate(&vmi->mas, GFP_KERNEL); +} + +static inline void vma_iter_clear(struct vma_iterator *vmi, + unsigned long start, unsigned long end) +{ + mas_set_range(&vmi->mas, start, end - 1); + mas_store_prealloc(&vmi->mas, NULL); +} + +static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) +{ + return mas_walk(&vmi->mas); +} + +/* Store a VMA with preallocated memory */ +static inline void vma_iter_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) { + printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) { + printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } +#endif + + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_prealloc(&vmi->mas, vma); +} + +static inline int vma_iter_store_gfp(struct vma_iterator *vmi, + struct vm_area_struct *vma, gfp_t gfp) +{ + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_gfp(&vmi->mas, vma, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} #endif /* __MM_INTERNAL_H */ -- cgit v1.2.1 From 440703e082b9c79c3d4fffcca8c2dffd621e6dc5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 20 Jan 2023 11:26:41 -0500 Subject: mm/mmap: refactor locking out of __vma_adjust() Move the locking into vma_prepare() and vma_complete() for use elsewhere Link: https://lkml.kernel.org/r/20230120162650.984577-41-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index ffd65248f266..90bb2078444c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -941,4 +941,18 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } + +/* + * VMA lock generalization + */ +struct vma_prepare { + struct vm_area_struct *vma; + struct vm_area_struct *adj_next; + struct file *file; + struct address_space *mapping; + struct anon_vma *anon_vma; + struct vm_area_struct *insert; + struct vm_area_struct *remove; + struct vm_area_struct *remove2; +}; #endif /* __MM_INTERNAL_H */ -- cgit v1.2.1 From 7ce154fe6917e7db94d63bc4d6c73b678ad1c581 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:25 -0400 Subject: mm/gup: move try_grab_page() to mm/internal.h This is part of the internal function of gup.c and is only non-static so that the parts of gup.c in the huge_memory.c and hugetlb.c can call it. Put it in internal.h beside the similarly purposed try_grab_folio() Link: https://lkml.kernel.org/r/4-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 90bb2078444c..3f75763b0fc2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -856,6 +856,7 @@ int migrate_device_coherent_page(struct page *page); * mm/gup.c */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); +int __must_check try_grab_page(struct page *page, unsigned int flags); extern bool mirrored_kernelcore; -- cgit v1.2.1 From 63b605128655f2e3968d99e30b293c7e7eaa2fc2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:33 -0400 Subject: mm/gup: move gup_must_unshare() to mm/internal.h This function is only used in gup.c and closely related. It touches FOLL_PIN so it must be moved before the next patch. Link: https://lkml.kernel.org/r/12-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Howells Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/internal.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 3f75763b0fc2..4f5ca3401b05 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -858,6 +858,71 @@ int migrate_device_coherent_page(struct page *page); struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); int __must_check try_grab_page(struct page *page, unsigned int flags); +/* + * Indicates for which pages that are write-protected in the page table, + * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the + * GUP pin will remain consistent with the pages mapped into the page tables + * of the MM. 
+ * + * Temporary unmapping of PageAnonExclusive() pages or clearing of + * PageAnonExclusive() has to protect against concurrent GUP: + * * Ordinary GUP: Using the PT lock + * * GUP-fast and fork(): mm->write_protect_seq + * * GUP-fast and KSM or temporary unmapping (swap, migration): see + * page_try_share_anon_rmap() + * + * Must be called with the (sub)page that's actually referenced via the + * page table entry, which might not necessarily be the head page for a + * PTE-mapped THP. + * + * If the vma is NULL, we're coming from the GUP-fast path and might have + * to fallback to the slow path just to lookup the vma. + */ +static inline bool gup_must_unshare(struct vm_area_struct *vma, + unsigned int flags, struct page *page) +{ + /* + * FOLL_WRITE is implicitly handled correctly as the page table entry + * has to be writable -- and if it references (part of) an anonymous + * folio, that part is required to be marked exclusive. + */ + if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) + return false; + /* + * Note: PageAnon(page) is stable until the page is actually getting + * freed. + */ + if (!PageAnon(page)) { + /* + * We only care about R/O long-term pining: R/O short-term + * pinning does not have the semantics to observe successive + * changes through the process page tables. + */ + if (!(flags & FOLL_LONGTERM)) + return false; + + /* We really need the vma ... */ + if (!vma) + return true; + + /* + * ... because we only care about writable private ("COW") + * mappings where we have to break COW early. + */ + return is_cow_mapping(vma->vm_flags); + } + + /* Paired with a memory barrier in page_try_share_anon_rmap(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_rmb(); + + /* + * Note that PageKsm() pages cannot be exclusive, and consequently, + * cannot get pinned. + */ + return !PageAnonExclusive(page); +} + extern bool mirrored_kernelcore; static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) -- cgit v1.2.1 From 2c2241081f7dec878331fdc3a3f2361e99556bca Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 24 Jan 2023 16:34:34 -0400 Subject: mm/gup: move private gup FOLL_ flags to internal.h Move the flags that should not/are not used outside gup.c and related into mm/internal.h to discourage driver abuse. To make this more maintainable going forward compact the two FOLL ranges with new bit numbers from 0 to 11 and 16 to 21, using shifts so it is explicit. Switch to an enum so the whole thing is easier to read. 
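The split can be pictured with a small sketch; the masks below illustrate the idea and are not a quote of the final header. Public FOLL_* flags occupy bits 0..11 and the gup-internal ones added below occupy bits 16..21, so the two sets can be OR-ed into one unsigned int without colliding:

#define SKETCH_PUBLIC_FOLL_MASK		GENMASK(11, 0)
#define SKETCH_INTERNAL_FOLL_MASK	GENMASK(21, 16)

static_assert((SKETCH_PUBLIC_FOLL_MASK & SKETCH_INTERNAL_FOLL_MASK) == 0);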
Link: https://lkml.kernel.org/r/13-v2-987e91b59705+36b-gup_tidy_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Acked-by: David Hildenbrand Cc: David Howells Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: Alistair Popple Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/internal.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 4f5ca3401b05..dfb37e94e140 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -858,6 +858,21 @@ int migrate_device_coherent_page(struct page *page); struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); int __must_check try_grab_page(struct page *page, unsigned int flags); +enum { + /* mark page accessed */ + FOLL_TOUCH = 1 << 16, + /* a retry, previous pass started an IO */ + FOLL_TRIED = 1 << 17, + /* we are working on non-current tsk/mm */ + FOLL_REMOTE = 1 << 18, + /* pages must be released via unpin_user_page */ + FOLL_PIN = 1 << 19, + /* gup_fast: prevent fall-back to slow gup */ + FOLL_FAST_ONLY = 1 << 20, + /* allow unlocking the mmap lock */ + FOLL_UNLOCKABLE = 1 << 21, +}; + /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the -- cgit v1.2.1 From be2d57563822b7e00b2b16d9354637c4b6d6d5cc Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:34 +0800 Subject: mm: change to return bool for folio_isolate_lru() Patch series "Change the return value for page isolation functions", v3. Now the page isolation functions did not return a boolean to indicate success or not, instead it will return a negative error when failed to isolate a page. So below code used in most places seem a boolean success/failure thing, which can confuse people whether the isolation is successful. if (folio_isolate_lru(folio)) continue; Moreover the page isolation functions only return 0 or -EBUSY, and most users did not care about the negative error except for few users, thus we can convert all page isolation functions to return a boolean value, which can remove the confusion to make code more clear. No functional changes intended in this patch series. This patch (of 4): Now the folio_isolate_lru() did not return a boolean value to indicate isolation success or not, however below code checking the return value can make people think that it was a boolean success/failure thing, which makes people easy to make mistakes (see the fix patch[1]). if (folio_isolate_lru(folio)) continue; Thus it's better to check the negative error value expilictly returned by folio_isolate_lru(), which makes code more clear per Linus's suggestion[2]. Moreover Matthew suggested we can convert the isolation functions to return a boolean[3], since most users did not care about the negative error value, and can also remove the confusing of checking return value. So this patch converts the folio_isolate_lru() to return a boolean value, which means return 'true' to indicate the folio isolation is successful, and 'false' means a failure to isolation. Meanwhile changing all users' logic of checking the isolation state. No functional changes intended. 
[1] https://lore.kernel.org/all/20230131063206.28820-1-Kuan-Ying.Lee@mediatek.com/T/#u [2] https://lore.kernel.org/all/CAHk-=wiBrY+O-4=2mrbVyxR+hOqfdJ=Do6xoucfJ9_5az01L4Q@mail.gmail.com/ [3] https://lore.kernel.org/all/Y+sTFqwMNAjDvxw3@casper.infradead.org/ Link: https://lkml.kernel.org/r/cover.1676424378.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/8a4e3679ed4196168efadf7ea36c038f2f7d5aa9.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Cc: Johannes Weiner Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index dfb37e94e140..8645e8496537 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -188,7 +188,7 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, * in mm/vmscan.c: */ int isolate_lru_page(struct page *page); -int folio_isolate_lru(struct folio *folio); +bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); -- cgit v1.2.1 From f7f9c00dfafffd7a5a1a5685e2d874c64913e2ed Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 15 Feb 2023 18:39:35 +0800 Subject: mm: change to return bool for isolate_lru_page() The isolate_lru_page() can only return 0 or -EBUSY, and most users did not care about the negative error of isolate_lru_page(), except one user in add_page_for_migration(). So we can convert the isolate_lru_page() to return a boolean value, which can help to make the code more clear when checking the return value of isolate_lru_page(). Also convert all users' logic of checking the isolation state. No functional changes intended. Link: https://lkml.kernel.org/r/3074c1ab628d9dbf139b33f248a8bc253a3f95f0.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 8645e8496537..fc01fd092ea5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -187,7 +187,7 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, /* * in mm/vmscan.c: */ -int isolate_lru_page(struct page *page); +bool isolate_lru_page(struct page *page); bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); -- cgit v1.2.1
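A typical caller after these two bool conversions reads as below; this is an illustrative sketch rather than a hunk from either patch, and sketch_collect_folio() is a hypothetical name:

static bool sketch_collect_folio(struct folio *folio, struct list_head *pagelist)
{
	if (!folio_isolate_lru(folio))
		return false;	/* not on an LRU list, or already isolated by someone else */

	list_add_tail(&folio->lru, pagelist);
	return true;
}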