From 80b1d8fdfad1f3084450afa6e2efcdcce867d4af Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 19 Dec 2022 12:36:59 +0000 Subject: mm: vmalloc: correct use of __GFP_NOWARN mask in __vmalloc_area_node() This function sets __GFP_NOWARN in the gfp_mask, rendering the warn_alloc() invocations no-ops. Remove this and instead rely on this flag being set only for the vm_area_alloc_pages() function, ensuring it is cleared for each of the warn_alloc() calls. Link: https://lkml.kernel.org/r/20221219123659.90614-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ca71de7c9d77..10fe83c24436 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3031,7 +3031,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); - gfp_mask |= __GFP_NOWARN; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; -- cgit v1.2.1 From edd898181e2f6f0969c08e1dfe2b7cdf902b9b33 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 22 Dec 2022 20:00:20 +0100 Subject: mm: vmalloc: avoid calling __find_vmap_area() twice in __vunmap() Currently the __vunmap() path calls __find_vmap_area() twice: once on entry to check that the area exists, and again inside remove_vm_area(), which performs its own search for the VA. In order to improve this from a performance point of view, we split remove_vm_area() into two new parts: - find_unlink_vmap_area(), which searches for the VA and unlinks it from the tree; - __remove_vm_area(), which removes the area without searching. In this case there is no functional change for remove_vm_area(), whereas vm_remove_mappings(), where the second search happens, switches to the __remove_vm_area() variant, to which the already detached VA is passed as a parameter, so there is no need to find it again. Performance-wise, I used test_vmalloc.sh with 32 threads doing alloc/free on a 64-CPUs-x86_64-box: perf without this patch: - 31.41% 0.50% vmalloc_test/10 [kernel.vmlinux] [k] __vunmap - 30.92% __vunmap - 17.67% _raw_spin_lock native_queued_spin_lock_slowpath - 12.33% remove_vm_area - 11.79% free_vmap_area_noflush - 11.18% _raw_spin_lock native_queued_spin_lock_slowpath 0.76% free_unref_page perf with this patch: - 11.35% 0.13% vmalloc_test/14 [kernel.vmlinux] [k] __vunmap - 11.23% __vunmap - 8.28% find_unlink_vmap_area - 7.95% _raw_spin_lock 7.44% native_queued_spin_lock_slowpath - 1.93% free_vmap_area_noflush - 0.56% _raw_spin_lock 0.53% native_queued_spin_lock_slowpath 0.60% __vunmap_range_noflush __vunmap() consumes around 20% fewer CPU cycles on this test. Also, switch from find_vmap_area() to find_unlink_vmap_area() to prevent double access to the vmap_area_lock: once for finding the area and a second time for unlinking it from the tree. 
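The heart of the patch is collapsing the lookup and the unlink into a single critical section. Below is a minimal userspace sketch of that pattern, assuming a plain singly linked list and a pthread mutex in place of the kernel's augmented rbtree and spinlock; all names here are illustrative, not kernel API.

    #include <pthread.h>
    #include <stddef.h>

    struct area {
        unsigned long start;
        struct area *next;
    };

    static pthread_mutex_t area_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct area *area_list;

    /* Search and detach under one lock acquisition, as
     * find_unlink_vmap_area() does with vmap_area_lock; the caller then
     * frees the area without re-searching. */
    static struct area *find_unlink_area(unsigned long addr)
    {
        struct area **pp, *a = NULL;

        pthread_mutex_lock(&area_lock);
        for (pp = &area_list; (a = *pp) != NULL; pp = &a->next) {
            if (a->start == addr) {
                *pp = a->next;  /* unlink while the lock is held */
                break;
            }
        }
        pthread_mutex_unlock(&area_lock);
        return a;
    }

With one lock acquisition instead of two, the time attributed to native_queued_spin_lock_slowpath drops sharply, which is exactly what the perf traces above show.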
[urezki@gmail.com: switch to find_unlink_vmap_area() in vm_unmap_ram()] Link: https://lkml.kernel.org/r/20221222190022.134380-2-urezki@gmail.com Link: https://lkml.kernel.org/r/20221222190022.134380-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reported-by: Roman Gushchin Reviewed-by: Lorenzo Stoakes Cc: Baoquan He Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 79 ++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 31 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 10fe83c24436..476ccbffb208 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1815,9 +1815,9 @@ static void drain_vmap_area_work(struct work_struct *work) } /* - * Free a vmap area, caller ensuring that the area has been unmapped - * and flush_cache_vunmap had been called for the correct range - * previously. + * Free a vmap area, caller ensuring that the area has been unmapped, + * unlinked and flush_cache_vunmap had been called for the correct + * range previously. */ static void free_vmap_area_noflush(struct vmap_area *va) { @@ -1825,9 +1825,8 @@ static void free_vmap_area_noflush(struct vmap_area *va) unsigned long va_start = va->va_start; unsigned long nr_lazy; - spin_lock(&vmap_area_lock); - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + if (WARN_ON_ONCE(!list_empty(&va->list))) + return; nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -1871,6 +1870,19 @@ struct vmap_area *find_vmap_area(unsigned long addr) return va; } +static struct vmap_area *find_unlink_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr, &vmap_area_root); + if (va) + unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + + return va; +} + /*** Per cpu kva allocator ***/ /* @@ -2015,6 +2027,10 @@ static void free_vmap_block(struct vmap_block *vb) tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); + spin_lock(&vmap_area_lock); + unlink_va(vb->va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); } @@ -2236,7 +2252,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) return; } - va = find_vmap_area(addr); + va = find_unlink_vmap_area(addr); BUG_ON(!va); debug_check_no_locks_freed((void *)va->va_start, (va->va_end - va->va_start)); @@ -2591,6 +2607,20 @@ struct vm_struct *find_vm_area(const void *addr) return va->vm; } +static struct vm_struct *__remove_vm_area(struct vmap_area *va) +{ + struct vm_struct *vm; + + if (!va || !va->vm) + return NULL; + + vm = va->vm; + kasan_free_module_shadow(vm); + free_unmap_vmap_area(va); + + return vm; +} + /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address @@ -2603,26 +2633,10 @@ struct vm_struct *find_vm_area(const void *addr) */ struct vm_struct *remove_vm_area(const void *addr) { - struct vmap_area *va; - might_sleep(); - spin_lock(&vmap_area_lock); - va = __find_vmap_area((unsigned long)addr, &vmap_area_root); - if (va && va->vm) { - struct vm_struct *vm = va->vm; - - va->vm = NULL; - spin_unlock(&vmap_area_lock); - - kasan_free_module_shadow(vm); - free_unmap_vmap_area(va); - - return vm; - } - - spin_unlock(&vmap_area_lock); - return NULL; + return __remove_vm_area( + find_unlink_vmap_area((unsigned long) addr)); } static inline 
void set_area_direct_map(const struct vm_struct *area, @@ -2636,16 +2650,17 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the vm_struct. */ -static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +/* Handle removing and resetting vm mappings related to the VA's vm_struct. */ +static void va_remove_mappings(struct vmap_area *va, int deallocate_pages) { + struct vm_struct *area = va->vm; unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - remove_vm_area(area->addr); + __remove_vm_area(va); /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ if (!flush_reset) @@ -2690,6 +2705,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; + struct vmap_area *va; if (!addr) return; @@ -2698,19 +2714,20 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = find_vm_area(addr); - if (unlikely(!area)) { + va = find_unlink_vmap_area((unsigned long)addr); + if (unlikely(!va)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } + area = va->vm; debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - vm_remove_mappings(area, deallocate_pages); + va_remove_mappings(va, deallocate_pages); if (deallocate_pages) { int i; -- cgit v1.2.1 From 14687619e1122d71b2ed70e1afa6bc352e629e85 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 22 Dec 2022 20:00:22 +0100 Subject: mm: vmalloc: replace BUG_ON() by WARN_ON_ONCE() Currently the vm_unmap_ram() function triggers a BUG() if an area is not found. Replace it with a WARN_ON_ONCE() error message and keep the machine alive instead of stopping it. The worst case is a memory leak. Link: https://lkml.kernel.org/r/20221222190022.134380-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Reviewed-by: Christoph Hellwig Cc: Baoquan He Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/vmalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 476ccbffb208..428e0bee5c9c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2253,7 +2253,9 @@ void vm_unmap_ram(const void *mem, unsigned int count) } va = find_unlink_vmap_area(addr); - BUG_ON(!va); + if (WARN_ON_ONCE(!va)) + return; + debug_check_no_locks_freed((void *)va->va_start, (va->va_end - va->va_start)); free_unmap_vmap_area(va); -- cgit v1.2.1 From 37f3605e5e7af7de12aeb670c5b94e5a3c8dbf74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:42 +0100 Subject: mm: reject vmap with VM_FLUSH_RESET_PERMS Patch series "cleanup vfree and vunmap". This little series untangles the vfree and vunmap code path a bit. This patch (of 10): VM_FLUSH_RESET_PERMS is just for use with vmalloc as it is tied to freeing the underlying pages. 
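Both patches above lean on the same defensive style: warn once, then bail out, rather than halting the machine. A rough userspace model of the WARN_ON_ONCE()-guard idiom follows; the real macro lives in include/asm-generic/bug.h, so this is a sketch of the behaviour, not the kernel implementation, and the flag value is a stand-in.

    #include <stdbool.h>
    #include <stdio.h>

    /* Userspace stand-in for WARN_ON_ONCE(): report the first occurrence
     * only, and evaluate to the condition so it can sit inside an if (). */
    #define warn_on_once(cond) ({                           \
        static bool warned;                                 \
        bool _c = (cond);                                   \
        if (_c && !warned) {                                \
            warned = true;                                  \
            fprintf(stderr, "warning: %s\n", #cond);        \
        }                                                   \
        _c;                                                 \
    })

    void *vmap_like(unsigned long flags)
    {
        /* Reject the vmalloc-only flag up front, as the vmap() patch does. */
        if (warn_on_once(flags & 0x1 /* stand-in for VM_FLUSH_RESET_PERMS */))
            return NULL;
        /* ... establish the mapping (elided) ... */
        return (void *)0x1000;  /* placeholder for the mapped address */
    }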
Link: https://lkml.kernel.org/r/20230121071051.1143058-1-hch@lst.de Link: https://lkml.kernel.org/r/20230121071051.1143058-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 428e0bee5c9c..3f3cb4875966 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2868,6 +2868,9 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); + if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) + return NULL; + /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. -- cgit v1.2.1 From f41f036b804d0d920f9b6fd3fca9489dd7afd358 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:43 +0100 Subject: mm: remove __vfree __vfree is a subset of vfree that just skips a few checks, and which is only used by vfree and an error cleanup path. Fold __vfree into vfree and switch the only other caller to call vfree() instead. Link: https://lkml.kernel.org/r/20230121071051.1143058-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3f3cb4875966..67fc9d7e4024 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2786,14 +2786,6 @@ void vfree_atomic(const void *addr) __vfree_deferred(addr); } -static void __vfree(const void *addr) -{ - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else - __vunmap(addr, 1); -} - /** * vfree - Release memory allocated by vmalloc() * @addr: Memory base address @@ -2821,8 +2813,10 @@ void vfree(const void *addr) if (!addr) return; - - __vfree(addr); + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else + __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -3089,7 +3083,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* * If not enough pages were obtained to accomplish an - * allocation request, free them via __vfree() if any. + * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { warn_alloc(gfp_mask, NULL, @@ -3129,7 +3123,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - __vfree(area->addr); + vfree(area->addr); return NULL; } -- cgit v1.2.1 From 01e2e8394a527644de5192f92f64e1c883a3e493 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:44 +0100 Subject: mm: remove __vfree_deferred Fold __vfree_deferred into vfree_atomic, and call vfree_atomic early on from vfree if called from interrupt context so that the extra low-level helper can be avoided. 
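The fold means vfree_atomic() itself now carries the deferral logic. A hedged userspace model of that scheme follows, with C11 atomics standing in for the kernel's llist primitives; as in the kernel, the freed block's first bytes are reused as the list node.

    #include <stdatomic.h>
    #include <stdlib.h>

    struct deferred_node {
        struct deferred_node *next;
    };

    static _Atomic(struct deferred_node *) deferred_head;

    /* Callable from any context in the kernel analogue: a single lock-free
     * push, mirroring the llist_add() in vfree_atomic(). */
    static void free_deferred(void *addr)
    {
        struct deferred_node *n = addr; /* reuse the block as the node */

        n->next = atomic_load(&deferred_head);
        while (!atomic_compare_exchange_weak(&deferred_head, &n->next, n))
            ;
    }

    /* The worker drains the whole list at once, like llist_del_all()
     * followed by the per-node free in the deferred-free work item. */
    static void drain_deferred(void)
    {
        struct deferred_node *n = atomic_exchange(&deferred_head, NULL);

        while (n) {
            struct deferred_node *next = n->next;

            free(n);
            n = next;
        }
    }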
Link: https://lkml.kernel.org/r/20230121071051.1143058-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 67fc9d7e4024..cfd796570e61 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2754,20 +2754,6 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); } -static inline void __vfree_deferred(const void *addr) -{ - /* - * Use raw_cpu_ptr() because this can be called from preemptible - * context. Preemption is absolutely fine here, because the llist_add() - * implementation is lockless, so it works even if we are adding to - * another cpu's list. schedule_work() should be fine with this too. - */ - struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); - - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); -} - /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address @@ -2777,13 +2763,19 @@ static inline void __vfree_deferred(const void *addr) */ void vfree_atomic(const void *addr) { - BUG_ON(in_nmi()); + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + BUG_ON(in_nmi()); kmemleak_free(addr); - if (!addr) - return; - __vfree_deferred(addr); + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * another cpu's list. schedule_work() should be fine with this too. + */ + if (addr && llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); } /** @@ -2805,17 +2797,16 @@ void vfree_atomic(const void *addr) */ void vfree(const void *addr) { - BUG_ON(in_nmi()); + if (unlikely(in_interrupt())) { + vfree_atomic(addr); + return; + } + BUG_ON(in_nmi()); kmemleak_free(addr); + might_sleep(); - might_sleep_if(!in_interrupt()); - - if (!addr) - return; - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else + if (addr) __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); -- cgit v1.2.1 From 208162f42f958b37147d3c1c5f947c7c1a8b9c41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:45 +0100 Subject: mm: move vmalloc_init and free_work down in vmalloc.c Move these two functions around a bit to avoid forward declarations. 
Link: https://lkml.kernel.org/r/20230121071051.1143058-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 104 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 51 insertions(+), 53 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cfd796570e61..333228c652d1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -89,17 +89,6 @@ struct vfree_deferred { }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); -static void __vunmap(const void *, int); - -static void free_work(struct work_struct *w) -{ - struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); - struct llist_node *t, *llnode; - - llist_for_each_safe(llnode, t, llist_del_all(&p->list)) - __vunmap((void *)llnode, 1); -} - /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, @@ -2434,48 +2423,6 @@ static void vmap_init_free_space(void) } } -void __init vmalloc_init(void) -{ - struct vmap_area *va; - struct vm_struct *tmp; - int i; - - /* - * Create the cache for vmap_area objects. - */ - vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); - - for_each_possible_cpu(i) { - struct vmap_block_queue *vbq; - struct vfree_deferred *p; - - vbq = &per_cpu(vmap_block_queue, i); - spin_lock_init(&vbq->lock); - INIT_LIST_HEAD(&vbq->free); - p = &per_cpu(vfree_deferred, i); - init_llist_head(&p->list); - INIT_WORK(&p->wq, free_work); - } - - /* Import existing vmlist entries. */ - for (tmp = vmlist; tmp; tmp = tmp->next) { - va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (WARN_ON_ONCE(!va)) - continue; - - va->va_start = (unsigned long)tmp->addr; - va->va_end = va->va_start + tmp->size; - va->vm = tmp; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); - } - - /* - * Now we can initialize a free vmap space. - */ - vmap_init_free_space(); - vmap_initialized = true; -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2754,6 +2701,15 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); } +static void delayed_vfree_work(struct work_struct *w) +{ + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + __vunmap((void *)llnode, 1); +} + /** * vfree_atomic - release memory allocated by vmalloc() * @addr: memory base address @@ -4209,3 +4165,45 @@ static int __init proc_vmalloc_init(void) module_init(proc_vmalloc_init); #endif + +void __init vmalloc_init(void) +{ + struct vmap_area *va; + struct vm_struct *tmp; + int i; + + /* + * Create the cache for vmap_area objects. + */ + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + struct vfree_deferred *p; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, delayed_vfree_work); + } + + /* Import existing vmlist entries. 
*/ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (WARN_ON_ONCE(!va)) + continue; + + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + va->vm = tmp; + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + } + + /* + * Now we can initialize a free vmap space. + */ + vmap_init_free_space(); + vmap_initialized = true; +} -- cgit v1.2.1 From 5d3d31d6fb17a8eb83af50ea8a0616a3cfde3e58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:46 +0100 Subject: mm: call vfree instead of __vunmap from delayed_vfree_work This adds an extra, never taken, in_interrupt() branch, but will allow us to cut down the maze of vfree helpers. Link: https://lkml.kernel.org/r/20230121071051.1143058-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 333228c652d1..0c0267b34afa 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2707,7 +2707,7 @@ static void delayed_vfree_work(struct work_struct *w) struct llist_node *t, *llnode; llist_for_each_safe(llnode, t, llist_del_all(&p->list)) - __vunmap((void *)llnode, 1); + vfree(llnode); } -- cgit v1.2.1 From 39e65b7f63392d70f2f6aff5f4c5c3262f49637e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:47 +0100 Subject: mm: move __remove_vm_area out of va_remove_mappings __remove_vm_area is the only part of va_remove_mappings that requires a vmap_area. Move the call out to the caller and only pass the vm_struct to va_remove_mappings. Link: https://lkml.kernel.org/r/20230121071051.1143058-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0c0267b34afa..003e49d0e628 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2599,18 +2599,15 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the VA's vm_struct. */ -static void va_remove_mappings(struct vmap_area *va, int deallocate_pages) +/* Handle removing and resetting vm mappings related to the vm_struct. */ +static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { - struct vm_struct *area = va->vm; unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - __remove_vm_area(va); - /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. 
*/ if (!flush_reset) return; @@ -2676,7 +2673,8 @@ static void __vunmap(const void *addr, int deallocate_pages) kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - va_remove_mappings(va, deallocate_pages); + __remove_vm_area(va); + vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { int i; -- cgit v1.2.1 From 75c59ce74e47d3e11aa7666f1877aa64495f7b03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:48 +0100 Subject: mm: use remove_vm_area in __vunmap Use the common helper to find and remove a vmap_area instead of open coding it. Link: https://lkml.kernel.org/r/20230121071051.1143058-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 003e49d0e628..bc6791a0adbe 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2556,20 +2556,6 @@ struct vm_struct *find_vm_area(const void *addr) return va->vm; } -static struct vm_struct *__remove_vm_area(struct vmap_area *va) -{ - struct vm_struct *vm; - - if (!va || !va->vm) - return NULL; - - vm = va->vm; - kasan_free_module_shadow(vm); - free_unmap_vmap_area(va); - - return vm; -} - /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address @@ -2582,10 +2568,18 @@ static struct vm_struct *__remove_vm_area(struct vmap_area *va) */ struct vm_struct *remove_vm_area(const void *addr) { + struct vmap_area *va; + struct vm_struct *vm; + might_sleep(); - return __remove_vm_area( - find_unlink_vmap_area((unsigned long) addr)); + va = find_unlink_vmap_area((unsigned long)addr); + if (!va || !va->vm) + return NULL; + vm = va->vm; + kasan_free_module_shadow(vm); + free_unmap_vmap_area(va); + return vm; } static inline void set_area_direct_map(const struct vm_struct *area, @@ -2651,7 +2645,6 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; - struct vmap_area *va; if (!addr) return; @@ -2660,20 +2653,18 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - va = find_unlink_vmap_area((unsigned long)addr); - if (unlikely(!va)) { + area = remove_vm_area(addr); + if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); return; } - area = va->vm; debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - __remove_vm_area(va); vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { -- cgit v1.2.1 From 17d3ef432dcbe80c134f1f79e2ed1ebd1076eab1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:49 +0100 Subject: mm: move debug checks from __vunmap to remove_vm_area All these checks apply to the free_vm_area interface as well, so move them to the common routine. 
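A small sketch of the idiom, with hypothetical userspace names: checks hoisted into the shared removal helper run for every caller exactly once, and no caller can forget them.

    #include <stdint.h>
    #include <stddef.h>

    #define PAGE_SIZE_SKETCH 4096UL

    struct region;

    /* find + unlink, elided in this sketch */
    static struct region *detach_region(const void *addr)
    {
        (void)addr;
        return NULL;
    }

    /* All sanity checks live in the one removal helper every caller
     * shares, mirroring the checks' new home in remove_vm_area(). */
    struct region *remove_region(const void *addr)
    {
        if (((uintptr_t)addr & (PAGE_SIZE_SKETCH - 1)) != 0)
            return NULL;    /* misaligned address: checked exactly once */

        return detach_region(addr);
    }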
Link: https://lkml.kernel.org/r/20230121071051.1143058-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bc6791a0adbe..cb8c8cd161c8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2573,11 +2573,20 @@ struct vm_struct *remove_vm_area(const void *addr) might_sleep(); + if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", + addr)) + return NULL; + va = find_unlink_vmap_area((unsigned long)addr); if (!va || !va->vm) return NULL; vm = va->vm; + + debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); + debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); kasan_free_module_shadow(vm); + kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); + free_unmap_vmap_area(va); return vm; } @@ -2649,10 +2658,6 @@ static void __vunmap(const void *addr, int deallocate_pages) if (!addr) return; - if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", - addr)) - return; - area = remove_vm_area(addr); if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", @@ -2660,11 +2665,6 @@ static void __vunmap(const void *addr, int deallocate_pages) return; } - debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); - debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - - kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { -- cgit v1.2.1 From 79311c1fe0175941298fb362ba072514e2fe5c54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:50 +0100 Subject: mm: split __vunmap vunmap only needs to find and free the vmap_area and vm_struct, so open code that in vunmap and merge the rest of the code into vfree. 
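A compact sketch of where the split lands, as a userspace analogue with invented names: instead of one __vunmap(addr, deallocate_pages) branching on a flag, each entry point does only its own work.

    #include <stdlib.h>

    struct region {
        void **pages;
        size_t nr_pages;
    };

    static struct region *detach_region(void *addr)
    {
        (void)addr;
        return NULL;    /* find-and-unlink elided in this sketch */
    }

    /* vunmap() analogue: drop the mapping metadata, leave pages alone. */
    void vunmap_like(void *addr)
    {
        free(detach_region(addr));
    }

    /* vfree() analogue: also free the backing pages. */
    void vfree_like(void *addr)
    {
        struct region *r = detach_region(addr);
        size_t i;

        if (!r)
            return;
        for (i = 0; i < r->nr_pages; i++)
            free(r->pages[i]);
        free(r->pages);
        free(r);
    }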
Link: https://lkml.kernel.org/r/20230121071051.1143058-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 84 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 43 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cb8c8cd161c8..11144a29665a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2651,45 +2651,6 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) set_area_direct_map(area, set_direct_map_default_noflush); } -static void __vunmap(const void *addr, int deallocate_pages) -{ - struct vm_struct *area; - - if (!addr) - return; - - area = remove_vm_area(addr); - if (unlikely(!area)) { - WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", - addr); - return; - } - - vm_remove_mappings(area, deallocate_pages); - - if (deallocate_pages) { - int i; - - for (i = 0; i < area->nr_pages; i++) { - struct page *page = area->pages[i]; - - BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); - /* - * High-order allocs for huge vmallocs are split, so - * can be freed as an array of order-0 allocations - */ - __free_pages(page, 0); - cond_resched(); - } - atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); - - kvfree(area->pages); - } - - kfree(area); -} - static void delayed_vfree_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); @@ -2742,6 +2703,9 @@ void vfree_atomic(const void *addr) */ void vfree(const void *addr) { + struct vm_struct *vm; + int i; + if (unlikely(in_interrupt())) { vfree_atomic(addr); return; @@ -2751,8 +2715,32 @@ void vfree(const void *addr) kmemleak_free(addr); might_sleep(); - if (addr) - __vunmap(addr, 1); + if (!addr) + return; + + vm = remove_vm_area(addr); + if (unlikely(!vm)) { + WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + addr); + return; + } + + vm_remove_mappings(vm, true); + for (i = 0; i < vm->nr_pages; i++) { + struct page *page = vm->pages[i]; + + BUG_ON(!page); + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + /* + * High-order allocs for huge vmallocs are split, so + * can be freed as an array of order-0 allocations + */ + __free_pages(page, 0); + cond_resched(); + } + atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); + kvfree(vm->pages); + kfree(vm); } EXPORT_SYMBOL(vfree); @@ -2767,10 +2755,20 @@ EXPORT_SYMBOL(vfree); */ void vunmap(const void *addr) { + struct vm_struct *vm; + BUG_ON(in_interrupt()); might_sleep(); - if (addr) - __vunmap(addr, 0); + + if (!addr) + return; + vm = remove_vm_area(addr); + if (unlikely(!vm)) { + WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", + addr); + return; + } + kfree(vm); } EXPORT_SYMBOL(vunmap); -- cgit v1.2.1 From 9e5fa0ae52fc67dea86f95ea4e3909b3e10a160f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 21 Jan 2023 08:10:51 +0100 Subject: mm: refactor va_remove_mappings Move the VM_FLUSH_RESET_PERMS to the caller and rename the function to better describe what it is doing. 
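A one-screen sketch of the pattern with invented names: once the flag test moves to the call site, the helper has a single unconditional job and can be named for it.

    #define FLAG_FLUSH_RESET_PERMS 0x1  /* illustrative bit */

    struct region2 {
        unsigned long flags;
    };

    /* After the refactor the helper does one unconditional job, and its
     * name can say exactly what that job is. */
    static void reset_perms(struct region2 *r)
    {
        (void)r;
        /* flush the mapping and reset the direct map (elided) */
    }

    void release_region(struct region2 *r)
    {
        /* the policy decision now sits at the call site */
        if (r->flags & FLAG_FLUSH_RESET_PERMS)
            reset_perms(r);
        /* ... free pages and metadata ... */
    }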
Link: https://lkml.kernel.org/r/20230121071051.1143058-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/vmalloc.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 11144a29665a..9b71ec3213cb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2602,35 +2602,23 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the vm_struct. */ -static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +/* + * Flush the vm mapping and reset the direct map. + */ +static void vm_reset_perms(struct vm_struct *area) { unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); - int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ - if (!flush_reset) - return; - - /* - * If not deallocating pages, just do the flush of the VM area and - * return. - */ - if (!deallocate_pages) { - vm_unmap_aliases(); - return; - } - /* - * If execution gets here, flush the vm mapping and reset the direct - * map. Find the start and end range of the direct mappings to make sure + * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); + if (addr) { unsigned long page_size; @@ -2725,7 +2713,8 @@ void vfree(const void *addr) return; } - vm_remove_mappings(vm, true); + if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) + vm_reset_perms(vm); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; -- cgit v1.2.1 From 1c71222e5f2393b5ea1a41795c67589eea7e3490 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jan 2023 11:37:49 -0800 Subject: mm: replace vma->vm_flags direct modifications with modifier calls Replace direct modifications to vma->vm_flags with calls to modifier functions to be able to track flag changes and to keep vma locking correctness. [akpm@linux-foundation.org: fix drivers/misc/open-dice.c, per Hyeonggon Yoo] Link: https://lkml.kernel.org/r/20230126193752.297968-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Mike Rapoport (IBM) Acked-by: Sebastian Reichel Reviewed-by: Liam R. Howlett Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Minchan Kim Cc: Paul E. 
McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9b71ec3213cb..ff4d7dfdf84a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3596,7 +3596,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, size -= PAGE_SIZE; } while (size > 0); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } -- cgit v1.2.1 From 7e4a32c0e8adafdda6161635c9046e6c1e8b95b5 Mon Sep 17 00:00:00 2001 From: Hyunmin Lee Date: Wed, 1 Feb 2023 20:51:42 +0900 Subject: mm/vmalloc: replace BUG_ON with a simple if statement As per the coding standards, in the event of an abnormal condition that should not occur under normal circumstances, the kernel should attempt recovery and proceed with execution, rather than halting the machine. Specifically, in the alloc_vmap_area() function, use a simple if() instead of a BUG_ON() that halts the machine. Link: https://lkml.kernel.org/r/20230201115142.GA7772@min-iamroot Co-developed-by: Gwan-gyeong Mun Signed-off-by: Gwan-gyeong Mun Co-developed-by: Jeungwoo Yoo Signed-off-by: Jeungwoo Yoo Co-developed-by: Sangyun Kim Signed-off-by: Sangyun Kim Signed-off-by: Hyunmin Lee Reviewed-by: Christophe Leroy Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Uladzislau Rezki (Sony) Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ff4d7dfdf84a..1dd7ca258a76 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1586,9 +1586,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int purged = 0; int ret; - BUG_ON(!size); - BUG_ON(offset_in_page(size)); - BUG_ON(!is_power_of_2(align)); + if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) + return ERR_PTR(-EINVAL); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); -- cgit v1.2.1 From d76f99548cf0474c3bf75f25fab1778e03ade5f2 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:14 +0800 Subject: mm/vmalloc.c: add used_map into vmap_block to track space of vmap_block Patch series "mm/vmalloc.c: allow vread() to read out vm_map_ram areas", v5. Problem: *** Stephen reported vread() will skip vm_map_ram areas when reading out /proc/kcore with the drgn utility. Please see the link below for more details. /proc/kcore reads 0's for vmap_block https://lore.kernel.org/all/87ilk6gos2.fsf@oracle.com/T/#u Root cause: *** The normal vmalloc API uses struct vmap_area to manage the virtual kernel area allocated, and associates a vm_struct with it to store more information and pass out. However, an area reserved through the vm_map_ram() interface doesn't allocate a vm_struct to associate with. So the current code in vread() will skip vm_map_ram areas because of the 'if (!va->vm)' check. Solution: *** To mark areas reserved through the vm_map_ram() interface, add a field 'flags' into struct vmap_area. Bit 0 indicates a vm_map_ram area created through the vm_map_ram() interface; bit 1 marks the sub-type of vm_map_ram area that uses vmap_block to manage split regions via vb_alloc/free(). 
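For reference while reading the rest of the series, these markers end up as the following constants in mm/vmalloc.c; the values are taken from patch 2 below, while the two helpers are only an illustrative restatement, not part of the series.

    #define VMAP_RAM        0x1  /* created by vm_map_ram() */
    #define VMAP_BLOCK      0x2  /* subdivided and managed via vmap_block */
    #define VMAP_FLAGS_MASK 0x3

    /* Illustrative helpers, not part of the series. */
    static inline int is_vm_map_ram(unsigned long flags)
    {
        return flags & VMAP_RAM;
    }

    static inline int is_vmap_block(unsigned long flags)
    {
        /* VMAP_BLOCK is only valid together with VMAP_RAM */
        return (flags & VMAP_FLAGS_MASK) == (VMAP_RAM | VMAP_BLOCK);
    }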
Also add a bitmap field 'used_map' into struct vmap_block to mark the further subdivided regions being used, differentiating them from dirty and free regions in the vmap_block. With the help of the above vmap_area->flags and vmap_block->used_map, we can recognize and handle vm_map_ram areas successfully. All these are done in patches 1~3. Meanwhile, make some improvements to code related to vm_map_ram areas in patches 4 and 5. Also change the area flag from VM_ALLOC to VM_IOREMAP in patches 6 and 7, because this will show them as 'ioremap' in /proc/vmallocinfo, and exclude them from /proc/kcore. This patch (of 7): In one vmap_block area, there can be three types of regions: used regions, allocated through vb_alloc(); dirty regions, freed via vb_free(); and free regions. Among them, only used regions contain available data, yet there is currently no way to track them. Here, add the bitmap field used_map into vmap_block, and set/clear it when allocating or freeing regions of a vmap_block area. This is a preparation for later use. Link: https://lkml.kernel.org/r/20230206084020.174506-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20230206084020.174506-2-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1dd7ca258a76..0f1396d6fbe6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1910,6 +1910,7 @@ struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; + DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS); unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; @@ -1986,10 +1987,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) vb->va = va; /* At least something should be left free */ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); + bitmap_zero(vb->used_map, VMAP_BBMAP_BITS); vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; + bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); vb_idx = addr_to_vb_idx(va->va_start); @@ -2099,6 +2102,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); vb->free -= 1UL << order; + bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); @@ -2132,6 +2136,9 @@ static void vb_free(unsigned long addr, unsigned long size) order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); + spin_lock(&vb->lock); + bitmap_clear(vb->used_map, offset, (1UL << order)); + spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); -- cgit v1.2.1 From 869176a096068056b338b5cc1b0af93106007f5d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:15 +0800 Subject: mm/vmalloc.c: add flags to mark vm_map_ram area Through the vmalloc API, a virtual kernel area is reserved for physical address mapping, and a vmap_area is used to track it, while a vm_struct is allocated and associated with the vmap_area to store more information and be passed out. However, an area reserved via vm_map_ram() is an exception. It doesn't have a vm_struct associated with its vmap_area. 
And we can't recognize a vmap_area with '->vm == NULL' as a vm_map_ram() area, because the normal freeing path sets va->vm = NULL before unmapping; please see remove_vm_area(). Meanwhile, there are two kinds of handling for vm_map_ram areas. One is the whole vmap_area being reserved and mapped at one time through the vm_map_ram() interface; the other is a whole vmap_area of VMAP_BLOCK_SIZE being reserved, then mapped as smaller split regions via vb_alloc(). To mark areas reserved through vm_map_ram(), add a flags field to struct vmap_area. Bit 0 indicates a vm_map_ram area created through the vm_map_ram() interface, while bit 1 marks the sub-type of vm_map_ram area that uses vmap_block to manage split regions via vb_alloc/free(). This is a preparation for later use. Link: https://lkml.kernel.org/r/20230206084020.174506-3-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Signed-off-by: Andrew Morton --- mm/vmalloc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f1396d6fbe6..2d0960190c42 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1578,7 +1578,8 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, - int node, gfp_t gfp_mask) + int node, gfp_t gfp_mask, + unsigned long va_flags) { struct vmap_area *va; unsigned long freed; @@ -1623,6 +1624,7 @@ retry: va->va_start = addr; va->va_end = addr + size; va->vm = NULL; + va->flags = va_flags; spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); @@ -1901,6 +1903,10 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) +#define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ +#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ +#define VMAP_FLAGS_MASK 0x3 + struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -1976,7 +1982,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, - node, gfp_mask); + node, gfp_mask, + VMAP_RAM|VMAP_BLOCK); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); @@ -2285,7 +2292,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, - VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + VMALLOC_START, VMALLOC_END, + node, GFP_KERNEL, VMAP_RAM); if (IS_ERR(va)) return NULL; @@ -2483,7 +2491,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; - va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0); if (IS_ERR(va)) { kfree(area); return NULL; -- cgit v1.2.1 From 06c8994626d1b7d8c26dfd06992d67703a274054 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:16 +0800 Subject: mm/vmalloc.c: allow vread() to read out vm_map_ram areas Currently, vread() can read out vmalloc areas that are associated with a vm_struct. However, this doesn't work for areas created by the vm_map_ram() interface because they don't have an associated vm_struct, so in vread() these areas are all skipped. 
Here, add a new function vmap_ram_vread() to read out vm_map_ram areas. An area created directly through the vm_map_ram() interface can be handled like other normal vmap areas with aligned_vread(). Areas that are further subdivided and managed with vmap_block need to be read out carefully as page-aligned small regions, zero-filling the holes between them. Link: https://lkml.kernel.org/r/20230206084020.174506-4-bhe@redhat.com Reported-by: Stephen Brennan Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Tested-by: Stephen Brennan Cc: Dan Carpenter Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 7 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2d0960190c42..7188a47315c2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3463,6 +3463,68 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) return copied; } +static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags) +{ + char *start; + struct vmap_block *vb; + unsigned long offset; + unsigned int rs, re, n; + + /* + * If it's area created by vm_map_ram() interface directly, but + * not further subdividing and delegating management to vmap_block, + * handle it here. + */ + if (!(flags & VMAP_BLOCK)) { + aligned_vread(buf, addr, count); + return; + } + + /* + * Area is split into regions and tracked with vmap_block, read out + * each region and zero fill the hole between regions. + */ + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr)); + if (!vb) + goto finished; + + spin_lock(&vb->lock); + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { + spin_unlock(&vb->lock); + goto finished; + } + for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { + if (!count) + break; + start = vmap_block_vaddr(vb->va->va_start, rs); + while (addr < start) { + if (count == 0) + goto unlock; + *buf = '\0'; + buf++; + addr++; + count--; + } + /*it could start reading from the middle of used region*/ + offset = offset_in_page(addr); + n = ((re - rs + 1) << PAGE_SHIFT) - offset; + if (n > count) + n = count; + aligned_vread(buf, start+offset, n); + + buf += n; + addr += n; + count -= n; + } +unlock: + spin_unlock(&vb->lock); + +finished: + /* zero-fill the left dirty or free regions */ + if (count) + memset(buf, 0, count); +} + /** * vread() - read vmalloc area in a safe way. * @buf: buffer for reading data @@ -3493,7 +3555,7 @@ long vread(char *buf, char *addr, unsigned long count) struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; - unsigned long n; + unsigned long n, size, flags; addr = kasan_reset_tag(addr); @@ -3514,12 +3576,21 @@ long vread(char *buf, char *addr, unsigned long count) if (!count) break; - if (!va->vm) + vm = va->vm; + flags = va->flags & VMAP_FLAGS_MASK; + /* + * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need + * be set together with VMAP_RAM. + */ + WARN_ON(flags == VMAP_BLOCK); + + if (!vm && !flags) continue; - vm = va->vm; - vaddr = (char *) vm->addr; - if (addr >= vaddr + get_vm_area_size(vm)) + vaddr = (char *) va->va_start; + size = vm ? 
get_vm_area_size(vm) : va_size(va); + + if (addr >= vaddr + size) continue; while (addr < vaddr) { if (count == 0) @@ -3529,10 +3600,13 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + get_vm_area_size(vm) - addr; + n = vaddr + size - addr; if (n > count) n = count; - if (!(vm->flags & VM_IOREMAP)) + + if (flags & VMAP_RAM) + vmap_ram_vread(buf, addr, n, flags); + else if (!(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n); -- cgit v1.2.1 From bba9697b42ead45687352fdd0fd498735bc4361d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:17 +0800 Subject: mm/vmalloc: explicitly identify vm_map_ram area when shown in /proc/vmallocinfo Now, by marking VMAP_RAM in vmap_area->flags for vm_map_ram areas, we can clearly differentiate them from other vmalloc areas. So identify vm_map_ram areas by checking VMAP_RAM in vmap_area->flags when they are shown in /proc/vmallocinfo. Meanwhile, the code comment above the vm_map_ram area check in s_show() is no longer needed; remove it here. Link: https://lkml.kernel.org/r/20230206084020.174506-5-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Cc: Dan Carpenter Cc: Stephen Brennan Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7188a47315c2..87d71c783646 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4152,14 +4152,11 @@ static int s_show(struct seq_file *m, void *p) va = list_entry(p, struct vmap_area, list); - /* - * s_show can encounter race with remove_vm_area, !vm on behalf - * of vmap area is being tear down or vm_map_ram allocation. - */ if (!va->vm) { - seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + if (va->flags & VMAP_RAM) + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); goto final; } -- cgit v1.2.1 From 30a7a9b17c4b0331ec67aadb4b30ff2a951b4ed5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 6 Feb 2023 16:40:18 +0800 Subject: mm/vmalloc: skip the uninitialized vmalloc areas For areas allocated via the vmalloc_xxx() APIs, the code searches for an unmapped area to reserve and allocates new pages to map into; please see __vmalloc_node_range(). During the process, the flag VM_UNINITIALIZED is set in vm->flags to indicate that page allocation and mapping haven't been done yet, until clear_vm_uninitialized_flag() is called to clear VM_UNINITIALIZED. For this kind of area, if VM_UNINITIALIZED is still set, ignore it in vread(): pages newly allocated and being mapped in that area contain only zero data, so reading them out via aligned_vread() is a waste of time. 
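The check pairs an smp_rmb() in vread() with the smp_wmb() in clear_vm_uninitialized_flag(), a classic publish/consume ordering. A hedged userspace model of the same idea using C11 acquire/release in place of the kernel barriers:

    #include <stdatomic.h>
    #include <stdbool.h>

    struct obj {
        int data;
        atomic_bool initialized;
    };

    static void publish(struct obj *o, int v)
    {
        o->data = v;
        /* release: the data write is visible before the flag write,
         * like the smp_wmb() in clear_vm_uninitialized_flag() */
        atomic_store_explicit(&o->initialized, true, memory_order_release);
    }

    static bool try_read(struct obj *o, int *out)
    {
        /* acquire pairs with the release above, like the smp_rmb() that
         * vread() issues after testing VM_UNINITIALIZED */
        if (!atomic_load_explicit(&o->initialized, memory_order_acquire))
            return false;   /* still uninitialized: skip, do not read */
        *out = o->data;
        return true;
    }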
Link: https://lkml.kernel.org/r/20230206084020.174506-6-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Cc: Dan Carpenter Cc: Stephen Brennan Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm/vmalloc.c') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 87d71c783646..3b57260b6d39 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3587,6 +3587,11 @@ long vread(char *buf, char *addr, unsigned long count) if (!vm && !flags) continue; + if (vm && (vm->flags & VM_UNINITIALIZED)) + continue; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + vaddr = (char *) va->va_start; size = vm ? get_vm_area_size(vm) : va_size(va); -- cgit v1.2.1
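Taken together, the series makes vm_map_ram() memory visible to vread() and therefore to /proc/kcore. A hypothetical in-kernel sketch of the user-visible effect, not part of the series, using only signatures that exist at this point in the history:

    #include <linux/vmalloc.h>
    #include <linux/string.h>
    #include <linux/numa.h>

    /* After this series, the bytes written below are returned by vread()
     * (and thus show up in /proc/kcore) instead of reading as zeroes. */
    static void vm_map_ram_vread_demo(struct page **pages, unsigned int nr)
    {
        char buf[16];
        void *p = vm_map_ram(pages, nr, NUMA_NO_NODE);

        if (!p)
            return;

        memset(p, 0xab, sizeof(buf));
        vread(buf, (char *)p, sizeof(buf)); /* now sees the 0xab pattern */
        vm_unmap_ram(p, nr);
    }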