diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 4 | ||||
-rw-r--r-- | mm/compaction.c | 20 | ||||
-rw-r--r-- | mm/filemap.c | 4 | ||||
-rw-r--r-- | mm/fremap.c | 28 | ||||
-rw-r--r-- | mm/huge_memory.c | 20 | ||||
-rw-r--r-- | mm/ksm.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 20 | ||||
-rw-r--r-- | mm/memory-failure.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 15 | ||||
-rw-r--r-- | mm/mempolicy.c | 92 | ||||
-rw-r--r-- | mm/migrate.c | 43 | ||||
-rw-r--r-- | mm/mmu_context.c | 3 | ||||
-rw-r--r-- | mm/mprotect.c | 25 | ||||
-rw-r--r-- | mm/page-writeback.c | 5 | ||||
-rw-r--r-- | mm/page_alloc.c | 30 | ||||
-rw-r--r-- | mm/percpu.c | 208 | ||||
-rw-r--r-- | mm/process_vm_access.c | 26 | ||||
-rw-r--r-- | mm/rmap.c | 15 | ||||
-rw-r--r-- | mm/slab.c | 2 | ||||
-rw-r--r-- | mm/slub.c | 74 | ||||
-rw-r--r-- | mm/swap.c | 4 | ||||
-rw-r--r-- | mm/swap_state.c | 63 | ||||
-rw-r--r-- | mm/swapfile.c | 11 | ||||
-rw-r--r-- | mm/vmpressure.c | 1 | ||||
-rw-r--r-- | mm/vmstat.c | 4 |
25 files changed, 419 insertions, 308 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 2d9f1504d75e..2888024e0b0a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -575,5 +575,5 @@ config PGTABLE_MAPPING then you should select this. This causes zsmalloc to use page table mapping rather than copying for object mapping. - You can check speed with zsmalloc benchmark[1]. - [1] https://github.com/spartacus06/zsmalloc + You can check speed with zsmalloc benchmark: + https://github.com/spartacus06/zsmapbench diff --git a/mm/compaction.c b/mm/compaction.c index b48c5259ea33..918577595ea8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -251,7 +251,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, { int nr_scanned = 0, total_isolated = 0; struct page *cursor, *valid_page = NULL; - unsigned long nr_strict_required = end_pfn - blockpfn; unsigned long flags; bool locked = false; @@ -264,11 +263,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, nr_scanned++; if (!pfn_valid_within(blockpfn)) - continue; + goto isolate_fail; + if (!valid_page) valid_page = page; if (!PageBuddy(page)) - continue; + goto isolate_fail; /* * The zone lock must be held to isolate freepages. @@ -289,12 +289,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) - continue; + goto isolate_fail; /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); - if (!isolated && strict) - break; total_isolated += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); @@ -305,7 +303,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (isolated) { blockpfn += isolated - 1; cursor += isolated - 1; + continue; } + +isolate_fail: + if (strict) + break; + else + continue; + } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); @@ -315,7 +321,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * pages requested were isolated. If there were any failures, 0 is * returned and CMA will fail. */ - if (strict && nr_strict_required > total_isolated) + if (strict && blockpfn < end_pfn) total_isolated = 0; if (locked) diff --git a/mm/filemap.c b/mm/filemap.c index d56d3c145b9f..7a13f6ac5421 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2553,8 +2553,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ret > 0) { ssize_t err; - err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) ret = err; } return ret; diff --git a/mm/fremap.c b/mm/fremap.c index bbc4d660221a..34feba60a17e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -23,28 +23,44 @@ #include "internal.h" +static int mm_counter(struct page *page) +{ + return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; +} + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; + struct page *page; + swp_entry_t entry; if (pte_present(pte)) { - struct page *page; - flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); page = vm_normal_page(vma, addr, pte); if (page) { if (pte_dirty(pte)) set_page_dirty(page); + update_hiwater_rss(mm); + dec_mm_counter(mm, mm_counter(page)); page_remove_rmap(page); page_cache_release(page); + } + } else { /* zap_pte() is not called when pte_none() */ + if (!pte_file(pte)) { update_hiwater_rss(mm); - dec_mm_counter(mm, MM_FILEPAGES); + entry = pte_to_swp_entry(pte); + if (non_swap_entry(entry)) { + if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + dec_mm_counter(mm, mm_counter(page)); + } + } else { + free_swap_and_cache(entry); + dec_mm_counter(mm, MM_SWAPENTS); + } } - } else { - if (!pte_file(pte)) - free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear_not_present_full(mm, addr, ptep, 0); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 82166bf974e1..1546655a2d78 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1166,8 +1166,10 @@ alloc: } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); - if (ret & VM_FAULT_OOM) + if (ret & VM_FAULT_OOM) { split_huge_page(page); + ret |= VM_FAULT_FALLBACK; + } put_page(page); } count_vm_event(THP_FAULT_FALLBACK); @@ -1179,9 +1181,10 @@ alloc: if (page) { split_huge_page(page); put_page(page); - } + } else + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); - ret |= VM_FAULT_OOM; goto out; } @@ -1545,6 +1548,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); BUG_ON(pmd_write(entry)); } else { struct page *page = pmd_page(*pmd); @@ -1557,16 +1561,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ if (!is_huge_zero_page(page) && !pmd_numa(*pmd)) { - entry = *pmd; - entry = pmd_mknuma(entry); + pmdp_set_numa(mm, addr, pmd); ret = HPAGE_PMD_NR; } } - - /* Set PMD if cleared earlier */ - if (ret == HPAGE_PMD_NR) - set_pmd_at(mm, addr, pmd, entry); - spin_unlock(ptl); } @@ -1963,7 +1961,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) +#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) @@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item) static struct page *page_trans_compound_anon(struct page *page) { if (PageTransCompound(page)) { - struct page *head = compound_trans_head(page); + struct page *head = compound_head(page); /* * head may actually be splitted and freed from under * us but it's ok here. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53385cd4e6f0..5b6b0039f725 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1127,8 +1127,8 @@ skip_node: * skipping css reference should be safe. */ if (next_css) { - if ((next_css->flags & CSS_ONLINE) && - (next_css == &root->css || css_tryget(next_css))) + if ((next_css == &root->css) || + ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) return mem_cgroup_from_css(next_css); prev_css = next_css; @@ -1687,7 +1687,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) * protects memcg_name and makes sure that parallel ooms do not * interleave */ - static DEFINE_SPINLOCK(oom_info_lock); + static DEFINE_MUTEX(oom_info_lock); struct cgroup *task_cgrp; struct cgroup *mem_cgrp; static char memcg_name[PATH_MAX]; @@ -1698,7 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) if (!p) return; - spin_lock(&oom_info_lock); + mutex_lock(&oom_info_lock); rcu_read_lock(); mem_cgrp = memcg->css.cgroup; @@ -1767,7 +1767,7 @@ done: pr_cont("\n"); } - spin_unlock(&oom_info_lock); + mutex_unlock(&oom_info_lock); } /* @@ -6595,6 +6595,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; + struct cgroup_subsys_state *iter; /* * Unregister events and notify userspace. @@ -6611,7 +6612,14 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) kmem_cgroup_css_offline(memcg); mem_cgroup_invalidate_reclaim_iterators(memcg); - mem_cgroup_reparent_charges(memcg); + + /* + * This requires that offlining is serialized. Right now that is + * guaranteed because css_killed_work_fn() holds the cgroup_mutex. + */ + css_for_each_descendant_post(iter, css) + mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); + mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4f08a2d61487..90002ea43638 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -945,8 +945,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * to it. Similarly, page lock is shifted. */ if (hpage != p) { - put_page(hpage); - get_page(p); + if (!(flags & MF_COUNT_INCREASED)) { + put_page(hpage); + get_page(p); + } lock_page(p); unlock_page(hpage); *hpagep = p; @@ -1649,7 +1651,7 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_trans_head(page); + struct page *hpage = compound_head(page); if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); diff --git a/mm/memory.c b/mm/memory.c index be6a0c0d4ae0..22dfa617bddb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3348,6 +3348,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); ret = VM_FAULT_HWPOISON; + page_cache_release(vmf.page); goto uncharge_out; } @@ -3703,7 +3704,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); -retry: pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) @@ -3741,20 +3741,13 @@ retry: if (dirty && !pmd_write(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); - /* - * If COW results in an oom, the huge pmd will - * have been split, so retry the fault on the - * pte for a smaller charge. - */ - if (unlikely(ret & VM_FAULT_OOM)) - goto retry; - return ret; + if (!(ret & VM_FAULT_FALLBACK)) + return ret; } else { huge_pmd_set_accessed(mm, vma, address, pmd, orig_pmd, dirty); + return 0; } - - return 0; } } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ae3c8f3595d4..4755c8576942 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1556,10 +1556,10 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, + compat_ulong_t, addr, compat_ulong_t, flags) { long err; unsigned long __user *nm = NULL; @@ -1586,8 +1586,8 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, return err; } -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode) +COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode) { long err = 0; unsigned long __user *nm = NULL; @@ -1609,9 +1609,9 @@ asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, return sys_set_mempolicy(mode, nm, nr_bits+1); } -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, + compat_ulong_t, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, compat_ulong_t, flags) { long err = 0; unsigned long __user *nm = NULL; @@ -2301,35 +2301,6 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } -#ifdef CONFIG_NUMA_BALANCING -static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - /* Never defer a private fault */ - if (cpupid_match_pid(p, last_cpupid)) - return false; - - if (p->numa_migrate_deferred) { - p->numa_migrate_deferred--; - return true; - } - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ - p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; -} -#else -static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2403,52 +2374,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_cpupid; - int this_cpupid; - polnid = thisnid; - this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); - - /* - * Multi-stage node selection is used in conjunction - * with a periodic migration fault to build a temporal - * task<->page relation. By using a two-stage filter we - * remove short/unlikely relations. - * - * Using P(p) ~ n_p / n_t as per frequentist - * probability, we can equate a task's usage of a - * particular page (n_p) per total usage of this - * page (n_t) (in a given time-span) to a probability. - * - * Our periodic faults will sample this probability and - * getting the same result twice in a row, given these - * samples are fully independent, is then given by - * P(n)^2, provided our sample period is sufficiently - * short compared to the usage pattern. - * - * This quadric squishes small probabilities, making - * it less likely we act on an unlikely task<->page - * relation. - */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); - if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { - /* See sysctl_numa_balancing_migrate_deferred comment */ - if (!cpupid_match_pid(current, last_cpupid)) - defer_numa_migrate(current); - - goto out; - } - - /* - * The quadratic filter above reduces extraneous migration - * of shared pages somewhat. This code reduces it even more, - * reducing the overhead of page migrations of shared pages. - * This makes workloads with shared pages rely more on - * "move task near its memory", and less on "move memory - * towards its task", which is exactly what we want. - */ - if (numa_migrate_deferred(current, last_cpupid)) + if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index 482a33d89134..bed48809e5d0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -178,6 +178,37 @@ out: } /* + * Congratulations to trinity for discovering this bug. + * mm/fremap.c's remap_file_pages() accepts any range within a single vma to + * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then + * replace the specified range by file ptes throughout (maybe populated after). + * If page migration finds a page within that range, while it's still located + * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: + * zap_pte() clears the temporary migration entry before mmap_sem is dropped. + * But if the migrating page is in a part of the vma outside the range to be + * remapped, then it will not be cleared, and remove_migration_ptes() needs to + * deal with it. Fortunately, this part of the vma is of course still linear, + * so we just need to use linear location on the nonlinear list. + */ +static int remove_linear_migration_ptes_from_nonlinear(struct page *page, + struct address_space *mapping, void *arg) +{ + struct vm_area_struct *vma; + /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long addr; + + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (addr >= vma->vm_start && addr < vma->vm_end) + remove_migration_pte(page, vma, addr, arg); + } + return SWAP_AGAIN; +} + +/* * Get rid of all migration entries and replace them by * references to the indicated page. */ @@ -186,6 +217,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, + .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); @@ -1158,7 +1190,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, pm->node); else return alloc_pages_exact_node(pm->node, - GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); + GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } /* @@ -1544,9 +1576,9 @@ static struct page *alloc_misplaced_dst_page(struct page *page, struct page *newpage; newpage = alloc_pages_exact_node(nid, - (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN) & + (GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE | __GFP_NOMEMALLOC | + __GFP_NORETRY | __GFP_NOWARN) & ~GFP_IOFS, 0); return newpage; @@ -1747,7 +1779,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + HPAGE_PMD_ORDER); if (!new_page) goto out_fail; diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 8a8cd0265e52..f802c2d216a7 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -31,6 +31,9 @@ void use_mm(struct mm_struct *mm) tsk->mm = mm; switch_mm(active_mm, mm, tsk); task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif if (active_mm != mm) mmdrop(active_mm); diff --git a/mm/mprotect.c b/mm/mprotect.c index 7332c1785744..769a67a15803 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,36 +58,27 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (pte_numa(ptent)) ptent = pte_mknonnuma(ptent); ptent = pte_modify(ptent, newprot); + /* + * Avoid taking write faults for pages we + * know to be dirty. + */ + if (dirty_accountable && pte_dirty(ptent)) + ptent = pte_mkwrite(ptent); + ptep_modify_prot_commit(mm, addr, pte, ptent); updated = true; } else { struct page *page; - ptent = *pte; page = vm_normal_page(vma, addr, oldpte); if (page && !PageKsm(page)) { if (!pte_numa(oldpte)) { - ptent = pte_mknuma(ptent); - set_pte_at(mm, addr, pte, ptent); + ptep_set_numa(mm, addr, pte); updated = true; } } } - - /* - * Avoid taking write faults for pages we know to be - * dirty. - */ - if (dirty_accountable && pte_dirty(ptent)) { - ptent = pte_mkwrite(ptent); - updated = true; - } - if (updated) pages++; - - /* Only !prot_numa always clears the pte */ - if (!prot_numa) - ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2d30e2cfe804..7106cb1aca8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2173,11 +2173,12 @@ int __set_page_dirty_nobuffers(struct page *page) if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; + unsigned long flags; if (!mapping) return 1; - spin_lock_irq(&mapping->tree_lock); + spin_lock_irqsave(&mapping->tree_lock, flags); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); @@ -2186,7 +2187,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3758a09a009..3bac76ae4b30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); set_page_count(p, 0); p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); } } @@ -1236,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) } local_irq_restore(flags); } +static bool gfp_thisnode_allocation(gfp_t gfp_mask) +{ + return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; +} +#else +static bool gfp_thisnode_allocation(gfp_t gfp_mask) +{ + return false; +} #endif /* @@ -1572,7 +1583,13 @@ again: get_pageblock_migratetype(page)); } - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + /* + * NOTE: GFP_THISNODE allocations do not partake in the kswapd + * aging protocol, so they can't be fair. + */ + if (!gfp_thisnode_allocation(gfp_flags)) + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -1944,8 +1961,12 @@ zonelist_scan: * ultimately fall back to remote zones that do not * partake in the fairness round-robin cycle of this * zonelist. + * + * NOTE: GFP_THISNODE allocations do not partake in + * the kswapd aging protocol, so they can't be fair. */ - if (alloc_flags & ALLOC_WMARK_LOW) { + if ((alloc_flags & ALLOC_WMARK_LOW) && + !gfp_thisnode_allocation(gfp_mask)) { if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) continue; if (!zone_local(preferred_zone, zone)) @@ -2501,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. */ - if (IS_ENABLED(CONFIG_NUMA) && - (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (gfp_thisnode_allocation(gfp_mask)) goto nopage; restart: diff --git a/mm/percpu.c b/mm/percpu.c index 036cfe07050f..63e24fb4387b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -102,10 +102,11 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ - int map_used; /* # of map entries used */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ void *data; /* chunk data */ + int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ unsigned long populated[]; /* populated bitmap */ }; @@ -356,11 +357,11 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk) { int new_alloc; - if (chunk->map_alloc >= chunk->map_used + 2) + if (chunk->map_alloc >= chunk->map_used + 3) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 2) + while (new_alloc < chunk->map_used + 3) new_alloc *= 2; return new_alloc; @@ -418,48 +419,6 @@ out_unlock: } /** - * pcpu_split_block - split a map block - * @chunk: chunk of interest - * @i: index of map block to split - * @head: head size in bytes (can be 0) - * @tail: tail size in bytes (can be 0) - * - * Split the @i'th map block into two or three blocks. If @head is - * non-zero, @head bytes block is inserted before block @i moving it - * to @i+1 and reducing its size by @head bytes. - * - * If @tail is non-zero, the target block, which can be @i or @i+1 - * depending on @head, is reduced by @tail bytes and @tail byte block - * is inserted after the target block. - * - * @chunk->map must have enough free slots to accommodate the split. - * - * CONTEXT: - * pcpu_lock. - */ -static void pcpu_split_block(struct pcpu_chunk *chunk, int i, - int head, int tail) -{ - int nr_extra = !!head + !!tail; - - BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); - - /* insert new subblocks */ - memmove(&chunk->map[i + nr_extra], &chunk->map[i], - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - chunk->map[i + 1] = chunk->map[i] - head; - chunk->map[i++] = head; - } - if (tail) { - chunk->map[i++] -= tail; - chunk->map[i] = tail; - } -} - -/** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes @@ -483,19 +442,27 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; int i, off; + bool seen_free = false; + int *p; - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { - bool is_last = i + 1 == chunk->map_used; + for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) { int head, tail; + int this_size; + + off = *p; + if (off & 1) + continue; /* extra for alignment requirement */ head = ALIGN(off, align) - off; - BUG_ON(i == 0 && head != 0); - if (chunk->map[i] < 0) - continue; - if (chunk->map[i] < head + size) { - max_contig = max(chunk->map[i], max_contig); + this_size = (p[1] & ~1) - off; + if (this_size < head + size) { + if (!seen_free) { + chunk->first_free = i; + seen_free = true; + } + max_contig = max(this_size, max_contig); continue; } @@ -505,44 +472,59 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * than sizeof(int), which is very small but isn't too * uncommon for percpu allocations. */ - if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { - if (chunk->map[i - 1] > 0) - chunk->map[i - 1] += head; - else { - chunk->map[i - 1] -= head; + if (head && (head < sizeof(int) || !(p[-1] & 1))) { + *p = off += head; + if (p[-1] & 1) chunk->free_size -= head; - } - chunk->map[i] -= head; - off += head; + else + max_contig = max(*p - p[-1], max_contig); + this_size -= head; head = 0; } /* if tail is small, just keep it around */ - tail = chunk->map[i] - head - size; - if (tail < sizeof(int)) + tail = this_size - head - size; + if (tail < sizeof(int)) { tail = 0; + size = this_size - head; + } /* split if warranted */ if (head || tail) { - pcpu_split_block(chunk, i, head, tail); + int nr_extra = !!head + !!tail; + + /* insert new subblocks */ + memmove(p + nr_extra + 1, p + 1, + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + if (head) { - i++; - off += head; - max_contig = max(chunk->map[i - 1], max_contig); + if (!seen_free) { + chunk->first_free = i; + seen_free = true; + } + *++p = off += head; + ++i; + max_contig = max(head, max_contig); + } + if (tail) { + p[1] = off + size; + max_contig = max(tail, max_contig); } - if (tail) - max_contig = max(chunk->map[i + 1], max_contig); } + if (!seen_free) + chunk->first_free = i + 1; + /* update hint and mark allocated */ - if (is_last) + if (i + 1 == chunk->map_used) chunk->contig_hint = max_contig; /* fully scanned */ else chunk->contig_hint = max(chunk->contig_hint, max_contig); - chunk->free_size -= chunk->map[i]; - chunk->map[i] = -chunk->map[i]; + chunk->free_size -= size; + *p |= 1; pcpu_chunk_relocate(chunk, oslot); return off; @@ -570,34 +552,50 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) { int oslot = pcpu_chunk_slot(chunk); - int i, off; - - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) - if (off == freeme) - break; + int off = 0; + unsigned i, j; + int to_free = 0; + int *p; + + freeme |= 1; /* we are searching for <given offset, in use> pair */ + + i = 0; + j = chunk->map_used; + while (i != j) { + unsigned k = (i + j) / 2; + off = chunk->map[k]; + if (off < freeme) + i = k + 1; + else if (off > freeme) + j = k; + else + i = j = k; + } BUG_ON(off != freeme); - BUG_ON(chunk->map[i] > 0); - chunk->map[i] = -chunk->map[i]; - chunk->free_size += chunk->map[i]; + if (i < chunk->first_free) + chunk->first_free = i; + p = chunk->map + i; + *p = off &= ~1; + chunk->free_size += (p[1] & ~1) - off; + + /* merge with next? */ + if (!(p[1] & 1)) + to_free++; /* merge with previous? */ - if (i > 0 && chunk->map[i - 1] >= 0) { - chunk->map[i - 1] += chunk->map[i]; - chunk->map_used--; - memmove(&chunk->map[i], &chunk->map[i + 1], - (chunk->map_used - i) * sizeof(chunk->map[0])); + if (i > 0 && !(p[-1] & 1)) { + to_free++; i--; + p--; } - /* merge with next? */ - if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { - chunk->map[i] += chunk->map[i + 1]; - chunk->map_used--; - memmove(&chunk->map[i + 1], &chunk->map[i + 2], - (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + if (to_free) { + chunk->map_used -= to_free; + memmove(p + 1, p + 1 + to_free, + (chunk->map_used - i) * sizeof(chunk->map[0])); } - chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); pcpu_chunk_relocate(chunk, oslot); } @@ -617,7 +615,9 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) } chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = pcpu_unit_size; + chunk->map[0] = 0; + chunk->map[1] = pcpu_unit_size | 1; + chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; @@ -713,6 +713,16 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) unsigned long flags; void __percpu *ptr; + /* + * We want the lowest bit of offset available for in-use/free + * indicator, so force >= 16bit alignment and make size even. + */ + if (unlikely(align < 2)) + align = 2; + + if (unlikely(size & 1)) + size++; + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); @@ -1343,9 +1353,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -ai->static_size; + schunk->map[0] = 1; + schunk->map[1] = ai->static_size; + schunk->map_used = 1; if (schunk->free_size) - schunk->map[schunk->map_used++] = schunk->free_size; + schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size); + else + schunk->map[1] |= 1; /* init dynamic chunk if necessary */ if (dyn_size) { @@ -1358,8 +1372,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; - dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; - dchunk->map[dchunk->map_used++] = dchunk->free_size; + dchunk->map[0] = 1; + dchunk->map[1] = pcpu_reserved_chunk_limit; + dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; + dchunk->map_used = 2; } /* link the first chunk in */ diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index fd26d0433509..3c5cf68566ec 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -456,25 +456,23 @@ free_iovecs: return rc; } -asmlinkage ssize_t -compat_sys_process_vm_readv(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); } -asmlinkage ssize_t -compat_sys_process_vm_writev(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); diff --git a/mm/rmap.c b/mm/rmap.c index d9d42316a99a..11cf322f8133 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1165,6 +1165,16 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (pte_unused(pteval)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. + */ + if (PageAnon(page)) + dec_mm_counter(mm, MM_ANONPAGES); + else + dec_mm_counter(mm, MM_FILEPAGES); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; @@ -1360,8 +1370,9 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, } static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, struct vm_area_struct *vma) + struct address_space *mapping, void *arg) { + struct vm_area_struct *vma; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1663,7 +1674,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) if (list_empty(&mapping->i_mmap_nonlinear)) goto done; - ret = rwc->file_nonlinear(page, mapping, vma); + ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/slab.c b/mm/slab.c index eb043bf05f4c..b264214c77ea 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1946,7 +1946,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, /** * slab_destroy - destroy and release all objects in a slab * @cachep: cache pointer being destroyed - * @slabp: slab pointer being destroyed + * @page: page pointer being destroyed * * Destroy all the objs in a slab, and release the mem back to the system. * Before calling the slab must have been unlinked from the cache. The diff --git a/mm/slub.c b/mm/slub.c index 2b1a6970e46f..25f14ad8f817 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1000,8 +1000,6 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) /* * Tracking of fully allocated slabs for debugging purposes. - * - * list_lock must be held. */ static void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) @@ -1009,17 +1007,16 @@ static void add_full(struct kmem_cache *s, if (!(s->flags & SLAB_STORE_USER)) return; + lockdep_assert_held(&n->list_lock); list_add(&page->lru, &n->full); } -/* - * list_lock must be held. - */ -static void remove_full(struct kmem_cache *s, struct page *page) +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { if (!(s->flags & SLAB_STORE_USER)) return; + lockdep_assert_held(&n->list_lock); list_del(&page->lru); } @@ -1265,7 +1262,8 @@ static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -static inline void remove_full(struct kmem_cache *s, struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) @@ -1519,11 +1517,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page) /* * Management of partially allocated slabs. - * - * list_lock must be held. */ -static inline void add_partial(struct kmem_cache_node *n, - struct page *page, int tail) +static inline void +__add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; if (tail == DEACTIVATE_TO_TAIL) @@ -1532,23 +1528,32 @@ static inline void add_partial(struct kmem_cache_node *n, list_add(&page->lru, &n->partial); } -/* - * list_lock must be held. - */ -static inline void remove_partial(struct kmem_cache_node *n, - struct page *page) +static inline void add_partial(struct kmem_cache_node *n, + struct page *page, int tail) +{ + lockdep_assert_held(&n->list_lock); + __add_partial(n, page, tail); +} + +static inline void +__remove_partial(struct kmem_cache_node *n, struct page *page) { list_del(&page->lru); n->nr_partial--; } +static inline void remove_partial(struct kmem_cache_node *n, + struct page *page) +{ + lockdep_assert_held(&n->list_lock); + __remove_partial(n, page); +} + /* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. * * Returns a list of objects or NULL if it fails. - * - * Must hold list_lock since we modify the partial list. */ static inline void *acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page, @@ -1558,6 +1563,8 @@ static inline void *acquire_slab(struct kmem_cache *s, unsigned long counters; struct page new; + lockdep_assert_held(&n->list_lock); + /* * Zap the freelist and set the frozen bit. * The old freelist is the list of objects for the @@ -1902,7 +1909,7 @@ redo: else if (l == M_FULL) - remove_full(s, page); + remove_full(s, n, page); if (m == M_PARTIAL) { @@ -2556,7 +2563,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, new.inuse--; if ((!new.inuse || !prior) && !was_frozen) { - if (kmem_cache_has_cpu_partial(s) && !prior) + if (kmem_cache_has_cpu_partial(s) && !prior) { /* * Slab was on no list before and will be @@ -2566,7 +2573,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ new.frozen = 1; - else { /* Needs to be taken off a list */ + } else { /* Needs to be taken off a list */ n = get_node(s, page_to_nid(page)); /* @@ -2615,7 +2622,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { if (kmem_cache_debug(s)) - remove_full(s, page); + remove_full(s, n, page); add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -2629,9 +2636,10 @@ slab_empty: */ remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); - } else + } else { /* Slab must be on the full list */ - remove_full(s, page); + remove_full(s, n, page); + } spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); @@ -2905,7 +2913,11 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); - add_partial(n, page, DEACTIVATE_TO_HEAD); + /* + * No locks need to be taken here as it has just been + * initialized and there is no concurrent access. + */ + __add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -3191,7 +3203,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - remove_partial(n, page); + __remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, @@ -4314,7 +4326,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s, page = ACCESS_ONCE(c->partial); if (page) { - x = page->pobjects; + node = page_to_nid(page); + if (flags & SO_TOTAL) + WARN_ON_ONCE(1); + else if (flags & SO_OBJECTS) + WARN_ON_ONCE(1); + else + x = page->pages; total += x; nodes[node] += x; } @@ -5178,7 +5196,7 @@ static int sysfs_slab_add(struct kmem_cache *s) } s->kobj.kset = slab_kset; - err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) { kobject_put(&s->kobj); return err; diff --git a/mm/swap.c b/mm/swap.c index b31ba67d440a..0092097b3f4c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -98,7 +98,7 @@ static void put_compound_page(struct page *page) } /* __split_huge_page_refcount can run under us */ - page_head = compound_trans_head(page); + page_head = compound_head(page); /* * THP can not break up slab pages so avoid taking @@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page) */ unsigned long flags; bool got; - struct page *page_head = compound_trans_head(page); + struct page *page_head = compound_head(page); /* Ref to put_compound_page() comment. */ if (!__compound_tail_refcounted(page_head)) { diff --git a/mm/swap_state.c b/mm/swap_state.c index 98e85e9c2b2d..e76ace30d436 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) return ret; } +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); + void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); @@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), entry.val); - if (page) + if (page) { INC_CACHE_INFO(find_success); + if (TestClearPageReadahead(page)) + atomic_inc(&swapin_readahead_hits); + } INC_CACHE_INFO(find_total); return page; @@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int pages, max_pages, last_ra; + static atomic_t last_readahead_pages; + + max_pages = 1 << ACCESS_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + /* + * This heuristic has been found to work well on both sequential and + * random loads, swapping to hard disk or to SSD: please don't ask + * what the "+ 2" means, it just happens to work well, that's all. + */ + pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + if (pages == 2) { + /* + * We can have no readahead hits to judge by: but must not get + * stuck here forever, so check for an adjacent offset instead + * (and don't even bother to check whether swap type is same). + */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + prev_offset = offset; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = atomic_read(&last_readahead_pages) / 2; + if (pages < last_ra) + pages = last_ra; + atomic_set(&last_readahead_pages, pages); + + return pages; +} + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory @@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long offset = swp_offset(entry); + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask; struct blk_plug plug; + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; + /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; + if (offset != entry_offset) + SetPageReadahead(page); page_cache_release(page); } blk_finish_plug(&plug); lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } diff --git a/mm/swapfile.c b/mm/swapfile.c index c6c13b050a58..4a7f7e6992b6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1923,7 +1923,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->swap_map = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; - p->flags = 0; frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); @@ -1949,6 +1948,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&inode->i_mutex); } filp_close(swap_file, NULL); + + /* + * Clear the SWP_USED flag after all resources are freed so that swapon + * can reuse this swap_info in alloc_swap_info() safely. It is ok to + * not hold p->lock after we cleared its SWP_WRITEOK. + */ + spin_lock(&swap_lock); + p->flags = 0; + spin_unlock(&swap_lock); + err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 196970a4541f..d4042e75f7c7 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/eventfd.h> +#include <linux/slab.h> #include <linux/swap.h> #include <linux/printk.h> #include <linux/vmpressure.h> diff --git a/mm/vmstat.c b/mm/vmstat.c index 72496140ac08..def5dd2fbe61 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -851,12 +851,14 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#endif +#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", +#endif /* CONFIG_DEBUG_TLBFLUSH */ #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; |