diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/huge_memory.c | 11 | ||||
-rw-r--r-- | mm/hugetlb.c | 37 | ||||
-rw-r--r-- | mm/kmemleak.c | 61 | ||||
-rw-r--r-- | mm/kmsan/instrumentation.c | 1 | ||||
-rw-r--r-- | mm/kmsan/shadow.c | 1 | ||||
-rw-r--r-- | mm/madvise.c | 12 | ||||
-rw-r--r-- | mm/memory-tiers.c | 8 | ||||
-rw-r--r-- | mm/mempolicy.c | 17 | ||||
-rw-r--r-- | mm/migrate.c | 7 | ||||
-rw-r--r-- | mm/mmap.c | 23 | ||||
-rw-r--r-- | mm/page_alloc.c | 21 | ||||
-rw-r--r-- | mm/page_isolation.c | 2 | ||||
-rw-r--r-- | mm/shmem.c | 17 | ||||
-rw-r--r-- | mm/userfaultfd.c | 25 | ||||
-rw-r--r-- | mm/zsmalloc.c | 3 |
15 files changed, 183 insertions, 63 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1cc4a5f4791e..561a42567477 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2455,7 +2455,16 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail); page_tail->mapping = head->mapping; page_tail->index = head->index + tail; - page_tail->private = 0; + + /* + * page->private should not be set in tail pages with the exception + * of swap cache pages that store the swp_entry_t in tail pages. + * Fix up and warn once if private is unexpectedly set. + */ + if (!folio_test_swapcache(page_folio(head))) { + VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); + page_tail->private = 0; + } /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b586cdd75930..546df97c31e4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1014,15 +1014,23 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); /* * Clear vm_private_data + * - For shared mappings this is a per-vma semaphore that may be + * allocated in a subsequent call to hugetlb_vm_op_open. + * Before clearing, make sure pointer is not associated with vma + * as this will leak the structure. This is the case when called + * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already + * been called to allocate a new structure. * - For MAP_PRIVATE mappings, this is the reserve map which does * not apply to children. Faults generated by the children are * not guaranteed to succeed, even if read-only. - * - For shared mappings this is a per-vma semaphore that may be - * allocated in a subsequent call to hugetlb_vm_op_open. */ - vma->vm_private_data = (void *)0; - if (!(vma->vm_flags & VM_MAYSHARE)) - return; + if (vma->vm_flags & VM_MAYSHARE) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock && vma_lock->vma != vma) + vma->vm_private_data = NULL; + } else + vma->vm_private_data = NULL; } /* @@ -2924,11 +2932,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; + spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); /* Fall through */ @@ -4601,6 +4609,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) struct resv_map *resv = vma_resv_map(vma); /* + * HPAGE_RESV_OWNER indicates a private mapping. * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA @@ -4615,11 +4624,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) /* * vma_lock structure for sharable mappings is vma specific. - * Clear old pointer (if copied via vm_area_dup) and create new. + * Clear old pointer (if copied via vm_area_dup) and allocate + * new structure. Before clearing, make sure vma_lock is not + * for this vma. */ if (vma->vm_flags & VM_MAYSHARE) { - vma->vm_private_data = NULL; - hugetlb_vma_lock_alloc(vma); + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock) { + if (vma_lock->vma != vma) { + vma->vm_private_data = NULL; + hugetlb_vma_lock_alloc(vma); + } else + pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); + } else + hugetlb_vma_lock_alloc(vma); } } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 37af2dc8dac9..646e2979641f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1461,6 +1461,27 @@ static void scan_gray_list(void) } /* + * Conditionally call resched() in a object iteration loop while making sure + * that the given object won't go away without RCU read lock by performing a + * get_object() if !pinned. + * + * Return: false if can't do a cond_resched() due to get_object() failure + * true otherwise + */ +static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +{ + if (!pinned && !get_object(object)) + return false; + + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + if (!pinned) + put_object(object); + return true; +} + +/* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the * scan_mutex held. @@ -1471,7 +1492,7 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop1_cnt = 0; + int loop_cnt = 0; jiffies_last_scan = jiffies; @@ -1480,7 +1501,6 @@ static void kmemleak_scan(void) list_for_each_entry_rcu(object, &object_list, object_list) { bool obj_pinned = false; - loop1_cnt++; raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1514,24 +1534,11 @@ static void kmemleak_scan(void) raw_spin_unlock_irq(&object->lock); /* - * Do a cond_resched() to avoid soft lockup every 64k objects. - * Make sure a reference has been taken so that the object - * won't go away without RCU read lock. + * Do a cond_resched() every 64k objects to avoid soft lockup. */ - if (!(loop1_cnt & 0xffff)) { - if (!obj_pinned && !get_object(object)) { - /* Try the next object instead */ - loop1_cnt--; - continue; - } - - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); - - if (!obj_pinned) - put_object(object); - } + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, obj_pinned)) + loop_cnt--; /* Try again on next object */ } rcu_read_unlock(); @@ -1598,8 +1605,16 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in * the next scan. @@ -1632,8 +1647,16 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in * the next scan. diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 280d15413268..271f135f97a1 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -14,6 +14,7 @@ #include "kmsan.h" #include <linux/gfp.h> +#include <linux/kmsan_string.h> #include <linux/mm.h> #include <linux/uaccess.h> diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 21e3e196ec3c..a787c04e9583 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -167,6 +167,7 @@ void kmsan_copy_page_meta(struct page *dst, struct page *src) __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); kmsan_leave_runtime(); } +EXPORT_SYMBOL(kmsan_copy_page_meta); void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { diff --git a/mm/madvise.c b/mm/madvise.c index 2baa93ca2310..c7105ec6d08c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -813,7 +813,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, if (start & ~huge_page_mask(hstate_vma(vma))) return false; - *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + /* + * Madvise callers expect the length to be rounded up to PAGE_SIZE + * boundaries, and may be unaware that this VMA uses huge pages. + * Avoid unexpected data loss by rounding down the number of + * huge pages freed. + */ + *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + return true; } @@ -828,6 +835,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; + if (start == end) + return 0; + if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index f116b7b6333e..fa8c9d07f9ce 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -131,8 +131,8 @@ static void memory_tier_device_release(struct device *dev) kfree(tier); } -static ssize_t nodes_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t nodelist_show(struct device *dev, + struct device_attribute *attr, char *buf) { int ret; nodemask_t nmask; @@ -143,10 +143,10 @@ static ssize_t nodes_show(struct device *dev, mutex_unlock(&memory_tier_lock); return ret; } -static DEVICE_ATTR_RO(nodes); +static DEVICE_ATTR_RO(nodelist); static struct attribute *memtier_dev_attrs[] = { - &dev_attr_nodes.attr, + &dev_attr_nodelist.attr, NULL }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a937eaec5b68..61aa9aedb728 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -787,17 +787,22 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - MA_STATE(mas, &mm->mm_mt, start - 1, start - 1); + MA_STATE(mas, &mm->mm_mt, start, start); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - prev = mas_find_rev(&mas, 0); - if (prev && (start < prev->vm_end)) - vma = prev; - else - vma = mas_next(&mas, end - 1); + prev = mas_prev(&mas, 0); + if (unlikely(!prev)) + mas_set(&mas, start); + + vma = mas_find(&mas, end - 1); + if (WARN_ON(!vma)) + return 0; + + if (start > vma->vm_start) + prev = vma; for (; vma; vma = mas_next(&mas, end - 1)) { unsigned long vmstart = max(start, vma->vm_start); diff --git a/mm/migrate.c b/mm/migrate.c index 1379e1912772..dff333593a8a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1582,6 +1582,13 @@ out: */ list_splice(&ret_pages, from); + /* + * Return 0 in case all subpages of fail-to-migrate THPs are + * migrated successfully. + */ + if (list_empty(from)) + rc = 0; + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); diff --git a/mm/mmap.c b/mm/mmap.c index bf2122af94e7..2def55555e05 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -618,7 +618,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end); + struct vm_area_struct *next_next = NULL; /* uninit var warning */ + struct vm_area_struct *next = find_vma(mm, vma->vm_end); struct vm_area_struct *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root_cached *root = NULL; @@ -2625,14 +2626,14 @@ cannot_expand: if (error) goto unmap_and_free_vma; - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM + /* + * Expansion is handled above, merging is handled below. + * Drivers should not alter the address of the VMA. */ - WARN_ON_ONCE(addr != vma->vm_start); - - addr = vma->vm_start; + if (WARN_ON((addr != vma->vm_start))) { + error = -EINVAL; + goto close_and_free_vma; + } mas_reset(&mas); /* @@ -2654,7 +2655,6 @@ cannot_expand: vm_area_free(vma); vma = merge; /* Update vm_flags to pick up the change. */ - addr = vma->vm_start; vm_flags = vma->vm_flags; goto unmap_writable; } @@ -2681,7 +2681,7 @@ cannot_expand: if (mas_preallocate(&mas, vma, GFP_KERNEL)) { error = -ENOMEM; if (file) - goto unmap_and_free_vma; + goto close_and_free_vma; else goto free_vma; } @@ -2852,6 +2852,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; + if (start + size <= next->vm_end) + break; + prev = next; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e20ade858e71..218b28ee49ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -807,6 +807,7 @@ static void prep_compound_tail(struct page *head, int tail_idx) p->mapping = TAIL_MAPPING; set_compound_head(p, head); + set_page_private(p, 0); } void prep_compound_page(struct page *page, unsigned int order) @@ -5784,14 +5785,18 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { if (addr) { - unsigned long alloc_end = addr + (PAGE_SIZE << order); - unsigned long used = addr + PAGE_ALIGN(size); - - split_page(virt_to_page((void *)addr), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } + unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE); + struct page *page = virt_to_page((void *)addr); + struct page *last = page + nr; + + split_page_owner(page, 1 << order); + split_page_memcg(page, 1 << order); + while (page < --last) + set_page_refcounted(last); + + last = page + (1UL << order); + for (page += nr; page < last; page++) + __free_pages_ok(page, 0, FPI_TO_TAIL); } return (void *)addr; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 04141a9bea70..47fbc1696466 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -330,7 +330,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); if (skip_isolation) { - int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); + int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); VM_BUG_ON(!is_migrate_isolate(mt)); } else { diff --git a/mm/shmem.c b/mm/shmem.c index 8280a5cb48df..c1d8b8a1aa3b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2424,9 +2424,26 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!zeropage) { /* COPY */ page_kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); + pagefault_enable(); kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e24e8a47ce8a..3d0fef3980b3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -157,11 +157,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out; - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_page(page); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + pagefault_enable(); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { @@ -646,11 +663,11 @@ retry: mmap_read_unlock(dst_mm); BUG_ON(!page); - page_kaddr = kmap(page); + page_kaddr = kmap_local_page(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap(page); + kunmap_local(page_kaddr); if (unlikely(err)) { err = -EFAULT; goto out; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 525758713a55..d03941cace2c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2311,6 +2311,9 @@ void zs_destroy_pool(struct zs_pool *pool) int fg; struct size_class *class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) continue; |