Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 235
-rw-r--r-- | mm/Makefile | 28
-rw-r--r-- | mm/backing-dev.c | 424
-rw-r--r-- | mm/balloon_compaction.c | 302
-rw-r--r-- | mm/bootmem.c | 256
-rw-r--r-- | mm/bounce.c | 154
-rw-r--r-- | mm/cleancache.c | 279
-rw-r--r-- | mm/compaction.c | 947
-rw-r--r-- | mm/dmapool.c | 55
-rw-r--r-- | mm/early_ioremap.c | 245
-rw-r--r-- | mm/fadvise.c | 86
-rw-r--r-- | mm/filemap.c | 1239
-rw-r--r-- | mm/filemap_xip.c | 19
-rw-r--r-- | mm/fremap.c | 106
-rw-r--r-- | mm/frontswap.c | 460
-rw-r--r-- | mm/highmem.c | 40
-rw-r--r-- | mm/huge_memory.c | 1562
-rw-r--r-- | mm/hugetlb.c | 1533
-rw-r--r-- | mm/hugetlb_cgroup.c | 408
-rw-r--r-- | mm/hwpoison-inject.c | 13
-rw-r--r-- | mm/internal.h | 147
-rw-r--r-- | mm/interval_tree.c | 112
-rw-r--r-- | mm/iov_iter.c | 224
-rw-r--r-- | mm/kmemleak.c | 269
-rw-r--r-- | mm/ksm.c | 868
-rw-r--r-- | mm/list_lru.c | 152
-rw-r--r-- | mm/madvise.c | 191
-rw-r--r-- | mm/memblock.c | 763
-rw-r--r-- | mm/memcontrol.c | 4391
-rw-r--r-- | mm/memory-failure.c | 481
-rw-r--r-- | mm/memory.c | 1405
-rw-r--r-- | mm/memory_hotplug.c | 1244
-rw-r--r-- | mm/mempolicy.c | 892
-rw-r--r-- | mm/mempool.c | 16
-rw-r--r-- | mm/migrate.c | 827
-rw-r--r-- | mm/mincore.c | 30
-rw-r--r-- | mm/mlock.c | 480
-rw-r--r-- | mm/mm_init.c | 81
-rw-r--r-- | mm/mmap.c | 1458
-rw-r--r-- | mm/mmu_context.c | 6
-rw-r--r-- | mm/mmu_notifier.c | 176
-rw-r--r-- | mm/mmzone.c | 30
-rw-r--r-- | mm/mprotect.c | 206
-rw-r--r-- | mm/mremap.c | 167
-rw-r--r-- | mm/nobootmem.c | 236
-rw-r--r-- | mm/nommu.c | 276
-rw-r--r-- | mm/oom_kill.c | 436
-rw-r--r-- | mm/page-writeback.c | 533
-rw-r--r-- | mm/page_alloc.c | 2113
-rw-r--r-- | mm/page_cgroup.c | 30
-rw-r--r-- | mm/page_io.c | 256
-rw-r--r-- | mm/page_isolation.c | 166
-rw-r--r-- | mm/pagewalk.c | 75
-rw-r--r-- | mm/percpu-vm.c | 1
-rw-r--r-- | mm/percpu.c | 264
-rw-r--r-- | mm/pgtable-generic.c | 94
-rw-r--r-- | mm/prio_tree.c | 208
-rw-r--r-- | mm/process_vm_access.c | 302
-rw-r--r-- | mm/readahead.c | 91
-rw-r--r-- | mm/rmap.c | 941
-rw-r--r-- | mm/shmem.c | 1273
-rw-r--r-- | mm/slab.c | 2521
-rw-r--r-- | mm/slab.h | 292
-rw-r--r-- | mm/slab_common.c | 748
-rw-r--r-- | mm/slob.c | 295
-rw-r--r-- | mm/slub.c | 1641
-rw-r--r-- | mm/sparse-vmemmap.c | 33
-rw-r--r-- | mm/sparse.c | 368
-rw-r--r-- | mm/swap.c | 606
-rw-r--r-- | mm/swap_state.c | 172
-rw-r--r-- | mm/swapfile.c | 1089
-rw-r--r-- | mm/thrash.c | 155
-rw-r--r-- | mm/truncate.c | 325
-rw-r--r-- | mm/util.c | 200
-rw-r--r-- | mm/vmacache.c | 114
-rw-r--r-- | mm/vmalloc.c | 541
-rw-r--r-- | mm/vmpressure.c | 380
-rw-r--r-- | mm/vmscan.c | 2404
-rw-r--r-- | mm/vmstat.c | 204
-rw-r--r-- | mm/workingset.c | 414
-rw-r--r-- | mm/zbud.c | 527
-rw-r--r-- | mm/zsmalloc.c | 1117
-rw-r--r-- | mm/zswap.c | 940
83 files changed, 30900 insertions, 14488 deletions
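
One pattern recurs in the mm/backing-dev.c hunks below: the sysfs store handlers drop their open-coded simple_strtoul() parsing in favor of the checked kstrtoul()/kstrtouint() helpers, which reject trailing garbage and report overflow instead of silently accepting bad input. A minimal sketch of the resulting handler shape, with a hypothetical attribute name (the real handlers below additionally route the parsed value through bdi_set_min_ratio() and friends):

#include <linux/backing-dev.h>
#include <linux/device.h>
#include <linux/kernel.h>

/* Hypothetical store handler following the kstrtoul() pattern used by
 * read_ahead_kb_store() below: parse first, bail out on any error, and
 * only then commit the new value. */
static ssize_t example_kb_store(struct device *dev,
				struct device_attribute *attr,
				const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &kb);	/* rejects "12junk", catches overflow */
	if (ret < 0)
		return ret;

	bdi->ra_pages = kb >> (PAGE_SHIFT - 10);	/* kilobytes -> pages */
	return count;
}
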
diff --git a/mm/Kconfig b/mm/Kconfig index e338407f1225..1b5a95f0fa01 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1,6 +1,6 @@ config SELECT_MEMORY_MODEL def_bool y - depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL + depends on ARCH_SELECT_MEMORY_MODEL choice prompt "Memory model" @@ -20,7 +20,7 @@ config FLATMEM_MANUAL Some users of more advanced features like NUMA and memory hotplug may have different options here. - DISCONTIGMEM is an more mature, better tested system, + DISCONTIGMEM is a more mature, better tested system, but is incompatible with memory hotplug and may suffer decreased performance over SPARSEMEM. If unsure between "Sparse Memory" and "Discontiguous Memory", choose @@ -140,11 +140,47 @@ config ARCH_DISCARD_MEMBLOCK config NO_BOOTMEM boolean +config MEMORY_ISOLATION + boolean + +config MOVABLE_NODE + boolean "Enable to assign a node which has only movable memory" + depends on HAVE_MEMBLOCK + depends on NO_BOOTMEM + depends on X86_64 + depends on NUMA + default n + help + Allow a node to have only movable memory. Pages used by the kernel, + such as direct mapping pages cannot be migrated. So the corresponding + memory device cannot be hotplugged. This option allows the following + two things: + - When the system is booting, node full of hotpluggable memory can + be arranged to have only movable memory so that the whole node can + be hot-removed. (need movable_node boot option specified). + - After the system is up, the option allows users to online all the + memory of a node as movable memory so that the whole node can be + hot-removed. + + Users who don't use the memory hotplug feature are fine with this + option on since they don't specify movable_node boot option or they + don't online memory as movable. + + Say Y here if you want to hotplug a whole node. + Say N here if you want kernel to use memory on all nodes evenly. + +# +# Only be set on architectures that have completely implemented memory hotplug +# feature. If you are not sure, don't touch it. +# +config HAVE_BOOTMEM_INFO_NODE + def_bool n + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA - depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG + depends on ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) config MEMORY_HOTPLUG_SPARSE @@ -153,6 +189,8 @@ config MEMORY_HOTPLUG_SPARSE config MEMORY_HOTREMOVE bool "Allow for memory hot remove" + select MEMORY_ISOLATION + select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION @@ -178,15 +216,34 @@ config PAGEFLAGS_EXTENDED # config SPLIT_PTLOCK_CPUS int + default "999999" if !MMU default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 - default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + boolean + +# +# support for memory balloon compaction +config BALLOON_COMPACTION + bool "Allow for balloon memory compaction/migration" + def_bool y + depends on COMPACTION && VIRTIO_BALLOON + help + Memory fragmentation introduced by ballooning might reduce + significantly the number of 2MB contiguous memory blocks that can be + used within a guest, thus imposing performance penalties associated + with the reduced number of transparent huge pages that could be used + by the guest workload. 
Allowing the compaction & migration for memory + pages enlisted as being part of memory balloon devices avoids the + scenario aforementioned and helps improving memory defragmentation. + # # support for memory compaction config COMPACTION bool "Allow for memory compaction" + def_bool y select MIGRATION depends on MMU help @@ -198,7 +255,7 @@ config COMPACTION config MIGRATION bool "Page migration" def_bool y - depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION + depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful in @@ -216,8 +273,27 @@ config ZONE_DMA_FLAG default "1" config BOUNCE - def_bool y + bool "Enable bounce buffers" + default y depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) + help + Enable bounce buffers for devices that cannot access + the full range of memory available to the CPU. Enabled + by default when ZONE_DMA or HIGHMEM is selected, but you + may say n to override this. + +# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often +# have more than 4GB of memory, but we don't currently use the IOTLB to present +# a 32-bit address to OHCI. So we need to use a bounce pool instead. +# +# We also use the bounce pool to provide stable page writes for jbd. jbd +# initiates buffer writeback without locking the page or setting PG_writeback, +# and fixing that behavior (a second time; jbd2 doesn't have this problem) is +# a major rework effort. Instead, use the bounce buffer to snapshot pages +# (until jbd goes away). The only jbd user is ext3. +config NEED_BOUNCE_POOL + bool + default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD) config NR_QUICK int @@ -226,8 +302,12 @@ config NR_QUICK default "1" config VIRT_TO_BUS - def_bool y - depends on !ARCH_NO_VIRT_TO_BUS + bool + help + An architecture should select this if it implements the + deprecated interface virt_to_bus(). All new architectures + should probably not select this. + config MMU_NOTIFIER bool @@ -272,6 +352,7 @@ config MEMORY_FAILURE depends on MMU depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" + select MEMORY_ISOLATION help Enables code to recover from some memory failures on systems with MCA recovery. This allows a system to continue running @@ -313,7 +394,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on X86 && MMU + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and @@ -349,6 +430,16 @@ choice benefit. endchoice +config CROSS_MEMORY_ATTACH + bool "Cross Memory Support" + depends on MMU + default y + help + Enabling this option adds the system calls process_vm_readv and + process_vm_writev which allow a process with the correct privileges + to directly read from or write to to another process's address space. + See the man page for more details. + # # UP and nommu archs use km based percpu allocator # @@ -379,3 +470,129 @@ config CLEANCACHE in a negligible performance hit. If unsure, say Y to enable cleancache + +config FRONTSWAP + bool "Enable frontswap to cache swap pages if tmem is present" + depends on SWAP + default n + help + Frontswap is so named because it can be thought of as the opposite + of a "backing" store for a swap device. 
The data is stored into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. When space in transcendent memory is available, + a significant swap I/O reduction may be achieved. When none is + available, all frontswap calls are reduced to a single pointer- + compare-against-NULL resulting in a negligible performance hit + and swap data is stored as normal on the matching swap device. + + If unsure, say Y to enable frontswap. + +config CMA + bool "Contiguous Memory Allocator" + depends on HAVE_MEMBLOCK && MMU + select MIGRATION + select MEMORY_ISOLATION + help + This enables the Contiguous Memory Allocator which allows other + subsystems to allocate big physically-contiguous blocks of memory. + CMA reserves a region of memory and allows only movable pages to + be allocated from it. This way, the kernel can use the memory for + pagecache and when a subsystem requests for contiguous area, the + allocated pages are migrated away to serve the contiguous request. + + If unsure, say "n". + +config CMA_DEBUG + bool "CMA debug messages (DEVELOPMENT)" + depends on DEBUG_KERNEL && CMA + help + Turns on debug messages in CMA. This produces KERN_DEBUG + messages for every CMA call as well as various messages while + processing calls such as dma_alloc_from_contiguous(). + This option does not affect warning and error messages. + +config ZBUD + tristate + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. + +config ZSWAP + bool "Compressed cache for swap pages (EXPERIMENTAL)" + depends on FRONTSWAP && CRYPTO=y + select CRYPTO_LZO + select ZBUD + default n + help + A lightweight compressed cache for swap pages. It takes + pages that are in the process of being swapped out and attempts to + compress them into a dynamically allocated RAM-based memory pool. + This can result in a significant I/O reduction on swap device and, + in the case where decompressing from RAM is faster that swap device + reads, can also improve workload performance. + + This is marked experimental because it is a new feature (as of + v3.11) that interacts heavily with memory reclaim. While these + interactions don't cause any known issues on simple memory setups, + they have not be fully explored on the large set of potential + configurations and workloads that exist. + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. + +config ZSMALLOC + bool "Memory allocator for compressed pages" + depends on MMU + default n + help + zsmalloc is a slab-based memory allocator designed to store + compressed RAM pages. zsmalloc uses virtual memory mapping + in order to reduce fragmentation. However, this results in a + non-standard allocator interface where a handle, not a pointer, is + returned by an alloc(). This handle must be mapped in order to + access the allocated space. 
+ +config PGTABLE_MAPPING + bool "Use page table mapping to access object in zsmalloc" + depends on ZSMALLOC + help + By default, zsmalloc uses a copy-based object mapping method to + access allocations that span two pages. However, if a particular + architecture (ex, ARM) performs VM mapping faster than copying, + then you should select this. This causes zsmalloc to use page table + mapping rather than copying for object mapping. + + You can check speed with zsmalloc benchmark: + https://github.com/spartacus06/zsmapbench + +config GENERIC_EARLY_IOREMAP + bool + +config MAX_STACK_SIZE_MB + int "Maximum user stack size for 32-bit processes (MB)" + default 80 + range 8 256 if METAG + range 8 2048 + depends on STACK_GROWSUP && (!64BIT || COMPAT) + help + This is the maximum stack size in Megabytes in the VM layout of 32-bit + user processes when the stack grows upwards (currently only on parisc + and metag arch). The stack will be located at the highest memory + address minus the given value, unless the RLIMIT_STACK hard limit is + changed to a smaller value in which case that is used. + + A sane initial value is 80 MB. diff --git a/mm/Makefile b/mm/Makefile index 50ec00ef2a0e..b484452dac57 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,15 +5,21 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o \ - process_vm_access.o + vmalloc.o pagewalk.o pgtable-generic.o + +ifdef CONFIG_CROSS_MEMORY_ATTACH +mmu-$(CONFIG_MMU) += process_vm_access.o +endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ - prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o mmu_context.o percpu.o \ - $(mmu-y) + util.o mmzone.o vmstat.o backing-dev.o \ + mm_init.o mmu_context.o percpu.o slab_common.o \ + compaction.o balloon_compaction.o vmacache.o \ + interval_tree.o list_lru.o workingset.o \ + iov_iter.o $(mmu-y) + obj-y += init-mm.o ifdef CONFIG_NO_BOOTMEM @@ -25,14 +31,15 @@ endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_FRONTSWAP) += frontswap.o +obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o -obj-$(CONFIG_COMPACTION) += compaction.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o @@ -45,9 +52,14 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o +obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o +obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZBUD) += zbud.o +obj-$(CONFIG_ZSMALLOC) += zsmalloc.o 
+obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dd8e2aafb07e..09d9591b7708 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -31,19 +31,14 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; /* - * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as - * reader side protection for bdi_pending_list. bdi_list has RCU reader side + * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side * locking. */ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); -LIST_HEAD(bdi_pending_list); -static struct task_struct *sync_supers_tsk; -static struct timer_list sync_supers_timer; - -static int bdi_sync_supers(void *); -static void sync_supers_timer_fn(unsigned long); +/* bdi_wq serves all asynchronous writeback tasks */ +struct workqueue_struct *bdi_wq; void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { @@ -164,16 +159,16 @@ static ssize_t read_ahead_kb_store(struct device *dev, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); - char *end; unsigned long read_ahead_kb; - ssize_t ret = -EINVAL; + ssize_t ret; - read_ahead_kb = simple_strtoul(buf, &end, 10); - if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { - bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); - ret = count; - } - return ret; + ret = kstrtoul(buf, 10, &read_ahead_kb); + if (ret < 0) + return ret; + + bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); + + return count; } #define K(pages) ((pages) << (PAGE_SHIFT - 10)) @@ -185,7 +180,8 @@ static ssize_t name##_show(struct device *dev, \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ -} +} \ +static DEVICE_ATTR_RW(name); BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) @@ -193,16 +189,17 @@ static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); - char *end; unsigned int ratio; - ssize_t ret = -EINVAL; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_min_ratio(bdi, ratio); + if (!ret) + ret = count; - ratio = simple_strtoul(buf, &end, 10); - if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { - ret = bdi_set_min_ratio(bdi, ratio); - if (!ret) - ret = count; - } return ret; } BDI_SHOW(min_ratio, bdi->min_ratio) @@ -211,28 +208,40 @@ static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); - char *end; unsigned int ratio; - ssize_t ret = -EINVAL; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_max_ratio(bdi, ratio); + if (!ret) + ret = count; - ratio = simple_strtoul(buf, &end, 10); - if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { - ret = bdi_set_max_ratio(bdi, ratio); - if (!ret) - ret = count; - } return ret; } BDI_SHOW(max_ratio, bdi->max_ratio) -#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) +static ssize_t stable_pages_required_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); -static struct device_attribute bdi_dev_attrs[] = { - __ATTR_RW(read_ahead_kb), - __ATTR_RW(min_ratio), - __ATTR_RW(max_ratio), - __ATTR_NULL, + return snprintf(page, PAGE_SIZE-1, 
"%d\n", + bdi_cap_stable_pages_required(bdi) ? 1 : 0); +} +static DEVICE_ATTR_RO(stable_pages_required); + +static struct attribute *bdi_dev_attrs[] = { + &dev_attr_read_ahead_kb.attr, + &dev_attr_min_ratio.attr, + &dev_attr_max_ratio.attr, + &dev_attr_stable_pages_required.attr, + NULL, }; +ATTRIBUTE_GROUPS(bdi_dev); static __init int bdi_class_init(void) { @@ -240,7 +249,7 @@ static __init int bdi_class_init(void) if (IS_ERR(bdi_class)) return PTR_ERR(bdi_class); - bdi_class->dev_attrs = bdi_dev_attrs; + bdi_class->dev_groups = bdi_dev_groups; bdi_debug_init(); return 0; } @@ -250,11 +259,10 @@ static int __init default_bdi_init(void) { int err; - sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); - BUG_ON(IS_ERR(sync_supers_tsk)); - - setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); - bdi_arm_supers_timer(); + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND | WQ_SYSFS, 0); + if (!bdi_wq) + return -ENOMEM; err = bdi_init(&default_backing_dev_info); if (!err) @@ -271,66 +279,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) } /* - * kupdated() used to do this. We cannot do it from the bdi_forker_thread() - * or we risk deadlocking on ->s_umount. The longer term solution would be - * to implement sync_supers_bdi() or similar and simply do it from the - * bdi writeback thread individually. - */ -static int bdi_sync_supers(void *unused) -{ - set_user_nice(current, 0); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - - /* - * Do this periodically, like kupdated() did before. - */ - sync_supers(); - } - - return 0; -} - -void bdi_arm_supers_timer(void) -{ - unsigned long next; - - if (!dirty_writeback_interval) - return; - - next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; - mod_timer(&sync_supers_timer, round_jiffies_up(next)); -} - -static void sync_supers_timer_fn(unsigned long unused) -{ - wake_up_process(sync_supers_tsk); - bdi_arm_supers_timer(); -} - -static void wakeup_timer_fn(unsigned long data) -{ - struct backing_dev_info *bdi = (struct backing_dev_info *)data; - - spin_lock_bh(&bdi->wb_lock); - if (bdi->wb.task) { - trace_writeback_wake_thread(bdi); - wake_up_process(bdi->wb.task); - } else if (bdi->dev) { - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. - */ - trace_writeback_wake_forker_thread(bdi); - wake_up_process(default_backing_dev_info.wb.task); - } - spin_unlock_bh(&bdi->wb_lock); -} - -/* * This function is used when the first inode for this bdi is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the * periodic background write-out of dirty inodes. Since the write-out would @@ -340,182 +288,19 @@ static void wakeup_timer_fn(unsigned long data) * Note, we wouldn't bother setting up the timer, but this function is on the * fast-path (used by '__mark_inode_dirty()'), so we save few context switches * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work(). */ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); -} - -/* - * Calculate the longest interval (jiffies) bdi threads are allowed to be - * inactive. 
- */ -static unsigned long bdi_longest_inactive(void) -{ - unsigned long interval; - - interval = msecs_to_jiffies(dirty_writeback_interval * 10); - return max(5UL * 60 * HZ, interval); -} - -/* - * Clear pending bit and wakeup anybody waiting for flusher thread creation or - * shutdown - */ -static void bdi_clear_pending(struct backing_dev_info *bdi) -{ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); -} - -static int bdi_forker_thread(void *ptr) -{ - struct bdi_writeback *me = ptr; - - current->flags |= PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - for (;;) { - struct task_struct *task = NULL; - struct backing_dev_info *bdi; - enum { - NO_ACTION, /* Nothing to do */ - FORK_THREAD, /* Fork bdi thread */ - KILL_THREAD, /* Kill inactive bdi thread */ - } action = NO_ACTION; - - /* - * Temporary measure, we want to make sure we don't see - * dirty data on the default backing_dev_info - */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { - del_timer(&me->wakeup_timer); - wb_do_writeback(me, 0); - } - - spin_lock_bh(&bdi_lock); - /* - * In the following loop we are going to check whether we have - * some work to do without any synchronization with tasks - * waking us up to do work for them. Set the task state here - * so that we don't miss wakeups after verifying conditions. - */ - set_current_state(TASK_INTERRUPTIBLE); - - list_for_each_entry(bdi, &bdi_list, bdi_list) { - bool have_dirty_io; - - if (!bdi_cap_writeback_dirty(bdi) || - bdi_cap_flush_forker(bdi)) - continue; - - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi %p/%s is not registered!\n", bdi, bdi->name); - - have_dirty_io = !list_empty(&bdi->work_list) || - wb_has_dirty_io(&bdi->wb); - - /* - * If the bdi has work to do, but the thread does not - * exist - create it. - */ - if (!bdi->wb.task && have_dirty_io) { - /* - * Set the pending bit - if someone will try to - * unregister this bdi - it'll wait on this bit. - */ - set_bit(BDI_pending, &bdi->state); - action = FORK_THREAD; - break; - } - - spin_lock(&bdi->wb_lock); - - /* - * If there is no work to do and the bdi thread was - * inactive long enough - kill it. The wb_lock is taken - * to make sure no-one adds more work to this bdi and - * wakes the bdi thread up. - */ - if (bdi->wb.task && !have_dirty_io && - time_after(jiffies, bdi->wb.last_active + - bdi_longest_inactive())) { - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock(&bdi->wb_lock); - set_bit(BDI_pending, &bdi->state); - action = KILL_THREAD; - break; - } - spin_unlock(&bdi->wb_lock); - } - spin_unlock_bh(&bdi_lock); - - /* Keep working if default bdi still has things to do */ - if (!list_empty(&me->bdi->work_list)) - __set_current_state(TASK_RUNNING); - - switch (action) { - case FORK_THREAD: - __set_current_state(TASK_RUNNING); - task = kthread_create(bdi_writeback_thread, &bdi->wb, - "flush-%s", dev_name(bdi->dev)); - if (IS_ERR(task)) { - /* - * If thread creation fails, force writeout of - * the bdi from the thread. Hopefully 1024 is - * large enough for efficient IO. - */ - writeback_inodes_wb(&bdi->wb, 1024, - WB_REASON_FORKER_THREAD); - } else { - /* - * The spinlock makes sure we do not lose - * wake-ups when racing with 'bdi_queue_work()'. - * And as soon as the bdi thread is visible, we - * can start it. 
- */ - spin_lock_bh(&bdi->wb_lock); - bdi->wb.task = task; - spin_unlock_bh(&bdi->wb_lock); - wake_up_process(task); - } - bdi_clear_pending(bdi); - break; - - case KILL_THREAD: - __set_current_state(TASK_RUNNING); - kthread_stop(task); - bdi_clear_pending(bdi); - break; - - case NO_ACTION: - if (!wb_has_dirty_io(me) || !dirty_writeback_interval) - /* - * There are no dirty data. The only thing we - * should now care about is checking for - * inactive bdi threads and killing them. Thus, - * let's sleep for longer time, save energy and - * be friendly for battery-driven devices. - */ - schedule_timeout(bdi_longest_inactive()); - else - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - try_to_freeze(); - break; - } - } - - return 0; + spin_lock_bh(&bdi->wb_lock); + if (test_bit(BDI_registered, &bdi->state)) + queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -547,20 +332,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi->dev = dev; - /* - * Just start the forker thread for our default backing_dev_info, - * and add other bdi's to the list. They will get a thread created - * on-demand when they need it. - */ - if (bdi_cap_flush_forker(bdi)) { - struct bdi_writeback *wb = &bdi->wb; - - wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", - dev_name(dev)); - if (IS_ERR(wb->task)) - return PTR_ERR(wb->task); - } - bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); @@ -584,8 +355,6 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct task_struct *task; - if (!bdi_cap_writeback_dirty(bdi)) return; @@ -594,23 +363,26 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) */ bdi_remove_from_list(bdi); + /* Make sure nobody queues further work */ + spin_lock_bh(&bdi->wb_lock); + clear_bit(BDI_registered, &bdi->state); + spin_unlock_bh(&bdi->wb_lock); + /* - * If setup is pending, wait for that to complete first + * Drain work list and shutdown the delayed_work. At this point, + * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi + * is dying and its work_list needs to be drained no matter what. */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + flush_delayed_work(&bdi->wb.dwork); + WARN_ON(!list_empty(&bdi->work_list)); /* - * Finally, kill the kernel thread. We don't need to be RCU - * safe anymore, since the bdi is gone from visibility. + * This shouldn't be necessary unless @bdi for some reason has + * unflushed dirty IO after work_list is drained. Do it anyway + * just in case. 
*/ - spin_lock_bh(&bdi->wb_lock); - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock_bh(&bdi->wb_lock); - - if (task) - kthread_stop(task); + cancel_delayed_work_sync(&bdi->wb.dwork); } /* @@ -636,10 +408,8 @@ void bdi_unregister(struct backing_dev_info *bdi) bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); - del_timer_sync(&bdi->wb.wakeup_timer); - if (!bdi_cap_flush_forker(bdi)) - bdi_wb_shutdown(bdi); + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); spin_lock_bh(&bdi->wb_lock); @@ -661,7 +431,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); spin_lock_init(&wb->list_lock); - setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); + INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } /* @@ -677,7 +447,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->min_ratio = 0; bdi->max_ratio = 100; - bdi->max_prop_frac = PROP_FRAC_BASE; + bdi->max_prop_frac = FPROP_FRAC_BASE; spin_lock_init(&bdi->wb_lock); INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->work_list); @@ -700,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->write_bandwidth = INIT_BW; bdi->avg_write_bandwidth = INIT_BW; - err = prop_local_init_percpu(&bdi->completions); + err = fprop_local_init_percpu(&bdi->completions); if (err) { err: @@ -734,17 +504,16 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); /* - * If bdi_unregister() had already been called earlier, the - * wakeup_timer could still be armed because bdi_prune_sb() - * can race with the bdi_wakeup_thread_delayed() calls from - * __mark_inode_dirty(). + * If bdi_unregister() had already been called earlier, the dwork + * could still be pending because bdi_prune_sb() can race with the + * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). */ - del_timer_sync(&bdi->wb.wakeup_timer); + cancel_delayed_work_sync(&bdi->wb.dwork); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); - prop_local_destroy_percpu(&bdi->completions); + fprop_local_destroy_percpu(&bdi->completions); } EXPORT_SYMBOL(bdi_destroy); @@ -755,7 +524,6 @@ EXPORT_SYMBOL(bdi_destroy); int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, unsigned int cap) { - char tmp[32]; int err; bdi->name = name; @@ -764,8 +532,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, if (err) return err; - sprintf(tmp, "%.28s%s", name, "-%d"); - err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); + err = bdi_register(bdi, NULL, "%.28s-%ld", name, + atomic_long_inc_return(&bdi_seq)); if (err) { bdi_destroy(bdi); return err; @@ -886,3 +654,23 @@ out: return ret; } EXPORT_SYMBOL(wait_iff_congested); + +int pdflush_proc_obsolete(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char kbuf[] = "0\n"; + + if (*ppos || *lenp < sizeof(kbuf)) { + *lenp = 0; + return 0; + } + + if (copy_to_user(buffer, kbuf, sizeof(kbuf))) + return -EFAULT; + printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", + table->procname); + + *lenp = 2; + *ppos += *lenp; + return 2; +} diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c new file mode 100644 index 000000000000..6e45a5074bf0 --- /dev/null +++ b/mm/balloon_compaction.c @@ -0,0 +1,302 @@ +/* + * mm/balloon_compaction.c + * + * Common interface for making balloon pages movable by compaction. + * + * Copyright (C) 2012, Red Hat, Inc. 
Rafael Aquini <aquini@redhat.com> + */ +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/balloon_compaction.h> + +/* + * balloon_devinfo_alloc - allocates a balloon device information descriptor. + * @balloon_dev_descriptor: pointer to reference the balloon device which + * this struct balloon_dev_info will be servicing. + * + * Driver must call it to properly allocate and initialize an instance of + * struct balloon_dev_info which will be used to reference a balloon device + * as well as to keep track of the balloon device page list. + */ +struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor) +{ + struct balloon_dev_info *b_dev_info; + b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL); + if (!b_dev_info) + return ERR_PTR(-ENOMEM); + + b_dev_info->balloon_device = balloon_dev_descriptor; + b_dev_info->mapping = NULL; + b_dev_info->isolated_pages = 0; + spin_lock_init(&b_dev_info->pages_lock); + INIT_LIST_HEAD(&b_dev_info->pages); + + return b_dev_info; +} +EXPORT_SYMBOL_GPL(balloon_devinfo_alloc); + +/* + * balloon_page_enqueue - allocates a new page and inserts it into the balloon + * page list. + * @b_dev_info: balloon device decriptor where we will insert a new page to + * + * Driver must call it to properly allocate a new enlisted balloon page + * before definetively removing it from the guest system. + * This function returns the page address for the recently enqueued page or + * NULL in the case we fail to allocate a new page this turn. + */ +struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) +{ + unsigned long flags; + struct page *page = alloc_page(balloon_mapping_gfp_mask() | + __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!page) + return NULL; + + /* + * Block others from accessing the 'page' when we get around to + * establishing additional references. We should be the only one + * holding a reference to the 'page' at this point. + */ + BUG_ON(!trylock_page(page)); + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + unlock_page(page); + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_enqueue); + +/* + * balloon_page_dequeue - removes a page from balloon's page list and returns + * the its address to allow the driver release the page. + * @b_dev_info: balloon device decriptor where we will grab a page from. + * + * Driver must call it to properly de-allocate a previous enlisted balloon page + * before definetively releasing it back to the guest system. + * This function returns the page address for the recently dequeued page or + * NULL in the case we find balloon's page list temporarily empty due to + * compaction isolated pages. + */ +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) +{ + struct page *page, *tmp; + unsigned long flags; + bool dequeued_page; + + dequeued_page = false; + list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { + /* + * Block others from accessing the 'page' while we get around + * establishing additional references and preparing the 'page' + * to be released by the balloon driver. + */ + if (trylock_page(page)) { + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + /* + * Raise the page refcount here to prevent any wrong + * attempt to isolate this page, in case of coliding + * with balloon_page_isolate() just after we release + * the page lock. 
+ * + * balloon_page_free() will take care of dropping + * this extra refcount later. + */ + get_page(page); + balloon_page_delete(page); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + unlock_page(page); + dequeued_page = true; + break; + } + } + + if (!dequeued_page) { + /* + * If we are unable to dequeue a balloon page because the page + * list is empty and there is no isolated pages, then something + * went out of track and some balloon pages are lost. + * BUG() here, otherwise the balloon driver may get stuck into + * an infinite loop while attempting to release all its pages. + */ + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + if (unlikely(list_empty(&b_dev_info->pages) && + !b_dev_info->isolated_pages)) + BUG(); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + page = NULL; + } + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_dequeue); + +#ifdef CONFIG_BALLOON_COMPACTION +/* + * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages. + * @b_dev_info: holds the balloon device information descriptor. + * @a_ops: balloon_mapping address_space_operations descriptor. + * + * Driver must call it to properly allocate and initialize an instance of + * struct address_space which will be used as the special page->mapping for + * balloon device enlisted page instances. + */ +struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, + const struct address_space_operations *a_ops) +{ + struct address_space *mapping; + + mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); + if (!mapping) + return ERR_PTR(-ENOMEM); + + /* + * Give a clean 'zeroed' status to all elements of this special + * balloon page->mapping struct address_space instance. + */ + address_space_init_once(mapping); + + /* + * Set mapping->flags appropriately, to allow balloon pages + * ->mapping identification. + */ + mapping_set_balloon(mapping); + mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask()); + + /* balloon's page->mapping->a_ops callback descriptor */ + mapping->a_ops = a_ops; + + /* + * Establish a pointer reference back to the balloon device descriptor + * this particular page->mapping will be servicing. + * This is used by compaction / migration procedures to identify and + * access the balloon device pageset while isolating / migrating pages. + * + * As some balloon drivers can register multiple balloon devices + * for a single guest, this also helps compaction / migration to + * properly deal with multiple balloon pagesets, when required. 
+ */ + mapping->private_data = b_dev_info; + b_dev_info->mapping = mapping; + + return mapping; +} +EXPORT_SYMBOL_GPL(balloon_mapping_alloc); + +static inline void __isolate_balloon_page(struct page *page) +{ + struct balloon_dev_info *b_dev_info = page->mapping->private_data; + unsigned long flags; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_del(&page->lru); + b_dev_info->isolated_pages++; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +static inline void __putback_balloon_page(struct page *page) +{ + struct balloon_dev_info *b_dev_info = page->mapping->private_data; + unsigned long flags; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_add(&page->lru, &b_dev_info->pages); + b_dev_info->isolated_pages--; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +static inline int __migrate_balloon_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + return page->mapping->a_ops->migratepage(mapping, newpage, page, mode); +} + +/* __isolate_lru_page() counterpart for a ballooned page */ +bool balloon_page_isolate(struct page *page) +{ + /* + * Avoid burning cycles with pages that are yet under __free_pages(), + * or just got freed under us. + * + * In case we 'win' a race for a balloon page being freed under us and + * raise its refcount preventing __free_pages() from doing its job + * the put_page() at the end of this block will take care of + * release this page, thus avoiding a nasty leakage. + */ + if (likely(get_page_unless_zero(page))) { + /* + * As balloon pages are not isolated from LRU lists, concurrent + * compaction threads can race against page migration functions + * as well as race against the balloon driver releasing a page. + * + * In order to avoid having an already isolated balloon page + * being (wrongly) re-isolated while it is under migration, + * or to avoid attempting to isolate pages being released by + * the balloon driver, lets be sure we have the page lock + * before proceeding with the balloon page isolation steps. + */ + if (likely(trylock_page(page))) { + /* + * A ballooned page, by default, has just one refcount. + * Prevent concurrent compaction threads from isolating + * an already isolated balloon page by refcount check. + */ + if (__is_movable_balloon_page(page) && + page_count(page) == 2) { + __isolate_balloon_page(page); + unlock_page(page); + return true; + } + unlock_page(page); + } + put_page(page); + } + return false; +} + +/* putback_lru_page() counterpart for a ballooned page */ +void balloon_page_putback(struct page *page) +{ + /* + * 'lock_page()' stabilizes the page and prevents races against + * concurrent isolation threads attempting to re-isolate it. + */ + lock_page(page); + + if (__is_movable_balloon_page(page)) { + __putback_balloon_page(page); + /* drop the extra ref count taken for page isolation */ + put_page(page); + } else { + WARN_ON(1); + dump_page(page, "not movable balloon page"); + } + unlock_page(page); +} + +/* move_to_new_page() counterpart for a ballooned page */ +int balloon_page_migrate(struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + struct address_space *mapping; + int rc = -EAGAIN; + + /* + * Block others from accessing the 'newpage' when we get around to + * establishing additional references. We should be the only one + * holding a reference to the 'newpage' at this point. 
+ */ + BUG_ON(!trylock_page(newpage)); + + if (WARN_ON(!__is_movable_balloon_page(page))) { + dump_page(page, "not movable balloon page"); + unlock_page(newpage); + return rc; + } + + mapping = page->mapping; + if (mapping) + rc = __migrate_balloon_page(mapping, newpage, page, mode); + + unlock_page(newpage); + return rc; +} +#endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/bootmem.c b/mm/bootmem.c index 0131170c9d54..90bd3507b413 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages) */ static void __init link_bootmem(bootmem_data_t *bdata) { - struct list_head *iter; + bootmem_data_t *ent; - list_for_each(iter, &bdata_list) { - bootmem_data_t *ent; - - ent = list_entry(iter, bootmem_data_t, list); - if (bdata->node_min_pfn < ent->node_min_pfn) - break; + list_for_each_entry(ent, &bdata_list, list) { + if (bdata->node_min_pfn < ent->node_min_pfn) { + list_add_tail(&bdata->list, &ent->list); + return; + } } - list_add_tail(&bdata->list, iter); + + list_add_tail(&bdata->list, &bdata_list); } /* @@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) /* * free_bootmem_late - free bootmem pages directly to page allocator - * @addr: starting address of the range + * @addr: starting physical address of the range * @size: size of the range in bytes * * This is only useful when the bootmem allocator has already been torn * down, but we are still initializing the system. Pages are given directly * to the page allocator, no bootmem metadata is updated because it is gone. */ -void __init free_bootmem_late(unsigned long addr, unsigned long size) +void __init free_bootmem_late(unsigned long physaddr, unsigned long size) { unsigned long cursor, end; - kmemleak_free_part(__va(addr), size); + kmemleak_free_part(__va(physaddr), size); - cursor = PFN_UP(addr); - end = PFN_DOWN(addr + size); + cursor = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); for (; cursor < end; cursor++) { __free_pages_bootmem(pfn_to_page(cursor), 0); @@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; - unsigned long start, end, pages, count = 0; + unsigned long *map, start, end, pages, count = 0; if (!bdata->node_bootmem_map) return 0; + map = bdata->node_bootmem_map; start = bdata->node_min_pfn; end = bdata->node_low_pfn; @@ -184,11 +185,23 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) bdata - bootmem_node_data, start, end); while (start < end) { - unsigned long *map, idx, vec; + unsigned long idx, vec; + unsigned shift; - map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; + shift = idx & (BITS_PER_LONG - 1); + /* + * vec holds at most BITS_PER_LONG map bits, + * bit 0 corresponds to start. 
+ */ vec = ~map[idx / BITS_PER_LONG]; + + if (shift) { + vec >>= shift; + if (end - start >= BITS_PER_LONG) + vec |= ~map[idx / BITS_PER_LONG + 1] << + (BITS_PER_LONG - shift); + } /* * If we have a properly aligned and fully unreserved * BITS_PER_LONG block of pages in front of us, free @@ -201,18 +214,18 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) count += BITS_PER_LONG; start += BITS_PER_LONG; } else { - unsigned long off = 0; + unsigned long cur = start; - while (vec && off < BITS_PER_LONG) { + start = ALIGN(start + 1, BITS_PER_LONG); + while (vec && cur != start) { if (vec & 1) { - page = pfn_to_page(start + off); + page = pfn_to_page(cur); __free_pages_bootmem(page, 0); count++; } vec >>= 1; - off++; + ++cur; } - start = ALIGN(start + 1, BITS_PER_LONG); } } @@ -228,16 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } -/** - * free_all_bootmem_node - release a node's free pages to the buddy allocator - * @pgdat: node to be released - * - * Returns the number of pages actually released. - */ -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +static int reset_managed_pages_done __initdata; + +static inline void __init reset_node_managed_pages(pg_data_t *pgdat) { - register_page_bootmem_info_node(pgdat); - return free_all_bootmem_core(pgdat->bdata); + struct zone *z; + + if (reset_managed_pages_done) + return; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } /** @@ -250,9 +273,13 @@ unsigned long __init free_all_bootmem(void) unsigned long total_pages = 0; bootmem_data_t *bdata; + reset_all_zones_managed_pages(); + list_for_each_entry(bdata, &bdata_list, list) total_pages += free_all_bootmem_core(bdata); + totalram_pages += total_pages; + return total_pages; } @@ -376,21 +403,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, /** * free_bootmem - mark a page range as usable - * @addr: starting address of the range + * @addr: starting physical address of the range * @size: size of the range in bytes * * Partial pages will be considered reserved and left as they are. * * The range must be contiguous but may span node boundaries. 
*/ -void __init free_bootmem(unsigned long addr, unsigned long size) +void __init free_bootmem(unsigned long physaddr, unsigned long size) { unsigned long start, end; - kmemleak_free_part(__va(addr), size); + kmemleak_free_part(__va(physaddr), size); - start = PFN_UP(addr); - end = PFN_DOWN(addr + size); + start = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); mark_bootmem(start, end, 0, 0); } @@ -418,7 +445,7 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, } /** - * reserve_bootmem - mark a page range as usable + * reserve_bootmem - mark a page range as reserved * @addr: starting address of the range * @size: size of the range in bytes * @flags: reservation flags (see linux/bootmem.h) @@ -438,12 +465,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } -int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, - int flags) -{ - return reserve_bootmem(phys, len, flags); -} - static unsigned long __init align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) { @@ -467,7 +488,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata, return ALIGN(base + off, align) - base; } -static void * __init alloc_bootmem_core(struct bootmem_data *bdata, +static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { @@ -574,28 +595,7 @@ find_block: return NULL; } -static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) -{ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - -#ifdef CONFIG_HAVE_ARCH_BOOTMEM - { - bootmem_data_t *p_bdata; - - p_bdata = bootmem_arch_preferred_node(bdata, size, align, - goal, limit); - if (p_bdata) - return alloc_bootmem_core(p_bdata, size, align, - goal, limit); - } -#endif - return NULL; -} - -static void * __init ___alloc_bootmem_nopanic(unsigned long size, +static void * __init alloc_bootmem_core(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -603,10 +603,8 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, bootmem_data_t *bdata; void *region; -restart: - region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); - if (region) - return region; + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); list_for_each_entry(bdata, &bdata_list, list) { if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) @@ -614,11 +612,25 @@ restart: if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) break; - region = alloc_bootmem_core(bdata, size, align, goal, limit); + region = alloc_bootmem_bdata(bdata, size, align, goal, limit); if (region) return region; } + return NULL; +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + +restart: + ptr = alloc_bootmem_core(size, align, goal, limit); + if (ptr) + return ptr; if (goal) { goal = 0; goto restart; @@ -684,21 +696,58 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { void *ptr; - ptr = 
alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); +again: + + /* do not panic in alloc_bootmem_bdata() */ + if (limit && goal + size > limit) + limit = 0; + + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); if (ptr) return ptr; - ptr = alloc_bootmem_core(bdata, size, align, goal, limit); + ptr = alloc_bootmem_core(size, align, goal, limit); if (ptr) return ptr; - return ___alloc_bootmem(size, align, goal, limit); + if (goal) { + goal = 0; + goto again; + } + + return NULL; +} + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); +} + +void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal, + unsigned long limit) +{ + void *ptr; + + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); + if (ptr) + return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; } /** @@ -722,7 +771,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); + return ___alloc_bootmem_node(pgdat, size, align, goal, 0); } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -735,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); /* update goal according ...MAX_DMA32_PFN */ - end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end_pfn = pgdat_end_pfn(pgdat); if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { @@ -743,7 +792,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long new_goal; new_goal = MAX_DMA32_PFN << PAGE_SHIFT; - ptr = alloc_bootmem_core(pgdat->bdata, size, align, + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, new_goal, 0); if (ptr) return ptr; @@ -754,47 +803,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, } -#ifdef CONFIG_SPARSEMEM -/** - * alloc_bootmem_section - allocate boot memory from a specific section - * @size: size of the request in bytes - * @section_nr: sparse map section to allocate from - * - * Return NULL on failure. 
- */ -void * __init alloc_bootmem_section(unsigned long size, - unsigned long section_nr) -{ - bootmem_data_t *bdata; - unsigned long pfn, goal; - - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; - - return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); -} -#endif - -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) -{ - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - - ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); - if (ptr) - return ptr; - - ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); - if (ptr) - return ptr; - - return __alloc_bootmem_nopanic(size, align, goal); -} - #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif @@ -818,6 +826,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } +void * __init __alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} + /** * __alloc_bootmem_low_node - allocate low boot memory from a specific node * @pgdat: node to allocate from @@ -839,6 +855,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node(pgdat->bdata, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); + return ___alloc_bootmem_node(pgdat, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); } diff --git a/mm/bounce.c b/mm/bounce.c index d1be02ca1889..523918b8c6dc 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -24,23 +24,25 @@ static mempool_t *page_pool, *isa_page_pool; -#ifdef CONFIG_HIGHMEM +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) static __init int init_emergency_pool(void) { -#ifndef CONFIG_MEMORY_HOTPLUG +#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) if (max_pfn <= max_low_pfn) return 0; #endif page_pool = mempool_create_page_pool(POOL_SIZE, 0); BUG_ON(!page_pool); - printk("highmem bounce pool size: %d pages\n", POOL_SIZE); + printk("bounce pool size: %d pages\n", POOL_SIZE); return 0; } __initcall(init_emergency_pool); +#endif +#ifdef CONFIG_HIGHMEM /* * highmem version, map in to vec */ @@ -96,27 +98,24 @@ int init_emergency_isa_pool(void) static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { unsigned char *vfrom; - struct bio_vec *tovec, *fromvec; - int i; - - __bio_for_each_segment(tovec, to, i, 0) { - fromvec = from->bi_io_vec + i; - - /* - * not bounced - */ - if (tovec->bv_page == fromvec->bv_page) - continue; - - /* - * fromvec->bv_offset and fromvec->bv_len might have been - * modified by the block layer, so use the original copy, - * bounce_copy_vec already uses tovec->bv_len - */ - vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; + struct bio_vec tovec, *fromvec = from->bi_io_vec; + struct bvec_iter iter; + + bio_for_each_segment(tovec, to, iter) { + if (tovec.bv_page != fromvec->bv_page) { + /* + * fromvec->bv_offset and fromvec->bv_len might have + * been modified by the block layer, so use the original + * copy, bounce_copy_vec already uses tovec->bv_len + */ + vfrom = page_address(fromvec->bv_page) + + tovec.bv_offset; + + bounce_copy_vec(&tovec, vfrom); + 
flush_dcache_page(tovec.bv_page); + } - bounce_copy_vec(tovec, vfrom); - flush_dcache_page(tovec->bv_page); + fromvec++; } } @@ -132,7 +131,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) /* * free up bounce indirect pages used */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { org_vec = bio_orig->bi_io_vec + i; if (bvec->bv_page == org_vec->bv_page) continue; @@ -176,81 +175,67 @@ static void bounce_end_io_read_isa(struct bio *bio, int err) __bounce_end_io_read(bio, isa_page_pool, err); } -static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool) +#ifdef CONFIG_NEED_BOUNCE_POOL +static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) { - struct page *page; - struct bio *bio = NULL; - int i, rw = bio_data_dir(*bio_orig); - struct bio_vec *to, *from; - - bio_for_each_segment(from, *bio_orig, i) { - page = from->bv_page; - - /* - * is destination page below bounce pfn? - */ - if (page_to_pfn(page) <= queue_bounce_pfn(q)) - continue; + if (bio_data_dir(bio) != WRITE) + return 0; - /* - * irk, bounce it - */ - if (!bio) { - unsigned int cnt = (*bio_orig)->bi_vcnt; + if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) + return 0; - bio = bio_alloc(GFP_NOIO, cnt); - memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec)); - } - + return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); +} +#else +static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +{ + return 0; +} +#endif /* CONFIG_NEED_BOUNCE_POOL */ - to = bio->bi_io_vec + i; +static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, + mempool_t *pool, int force) +{ + struct bio *bio; + int rw = bio_data_dir(*bio_orig); + struct bio_vec *to, from; + struct bvec_iter iter; + unsigned i; + + if (force) + goto bounce; + bio_for_each_segment(from, *bio_orig, iter) + if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) + goto bounce; + + return; +bounce: + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); + + bio_for_each_segment_all(to, bio, i) { + struct page *page = to->bv_page; + + if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) + continue; - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; inc_zone_page_state(to->bv_page, NR_BOUNCE); + to->bv_page = mempool_alloc(pool, q->bounce_gfp); if (rw == WRITE) { char *vto, *vfrom; - flush_dcache_page(from->bv_page); + flush_dcache_page(page); + vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap(from->bv_page) + from->bv_offset; + vfrom = kmap_atomic(page) + to->bv_offset; memcpy(vto, vfrom, to->bv_len); - kunmap(from->bv_page); + kunmap_atomic(vfrom); } } - /* - * no pages bounced - */ - if (!bio) - return; - trace_block_bio_bounce(q, *bio_orig); - /* - * at least one page was bounced, fill in possible non-highmem - * pages - */ - __bio_for_each_segment(from, *bio_orig, i, 0) { - to = bio_iovec_idx(bio, i); - if (!to->bv_page) { - to->bv_page = from->bv_page; - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - } - } - - bio->bi_bdev = (*bio_orig)->bi_bdev; bio->bi_flags |= (1 << BIO_BOUNCED); - bio->bi_sector = (*bio_orig)->bi_sector; - bio->bi_rw = (*bio_orig)->bi_rw; - - bio->bi_vcnt = (*bio_orig)->bi_vcnt; - bio->bi_idx = (*bio_orig)->bi_idx; - bio->bi_size = (*bio_orig)->bi_size; if (pool == page_pool) { bio->bi_end_io = bounce_end_io_write; @@ -268,6 +253,7 @@ static void __blk_queue_bounce(struct request_queue *q, 
struct bio **bio_orig, void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { + int must_bounce; mempool_t *pool; /* @@ -276,13 +262,15 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) if (!bio_has_data(*bio_orig)) return; + must_bounce = must_snapshot_stable_pages(q, *bio_orig); + /* * for non-isa bounce case, just check if the bounce pfn is equal * to or bigger than the highest pfn in the system -- in that case, * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn) + if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) return; pool = page_pool; } else { @@ -293,7 +281,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) /* * slow path */ - __blk_queue_bounce(q, bio_orig, pool); + __blk_queue_bounce(q, bio_orig, pool, must_bounce); } EXPORT_SYMBOL(blk_queue_bounce); diff --git a/mm/cleancache.c b/mm/cleancache.c index 5646c740f613..d0eac4350403 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -19,20 +19,10 @@ #include <linux/cleancache.h> /* - * This global enablement flag may be read thousands of times per second - * by cleancache_get/put/invalidate even on systems where cleancache_ops - * is not claimed (e.g. cleancache is config'ed on but remains - * disabled), so is preferred to the slower alternative: a function - * call that checks a non-global. - */ -int cleancache_enabled __read_mostly; -EXPORT_SYMBOL(cleancache_enabled); - -/* * cleancache_ops is set by cleancache_ops_register to contain the pointers * to the cleancache "backend" implementation functions. */ -static struct cleancache_ops cleancache_ops __read_mostly; +static struct cleancache_ops *cleancache_ops __read_mostly; /* * Counters available via /sys/kernel/debug/frontswap (if debugfs is @@ -45,15 +35,101 @@ static u64 cleancache_puts; static u64 cleancache_invalidates; /* - * register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting + * When no backend is registered all calls to init_fs and init_shared_fs + * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or + * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array + * [shared_|]fs_poolid_map) are given to the respective super block + * (sb->cleancache_poolid) and no tmem_pools are created. When a backend + * registers with cleancache the previous calls to init_fs and init_shared_fs + * are executed to create tmem_pools and set the respective poolids. While no + * backend is registered all "puts", "gets" and "flushes" are ignored or failed. + */ +#define MAX_INITIALIZABLE_FS 32 +#define FAKE_FS_POOLID_OFFSET 1000 +#define FAKE_SHARED_FS_POOLID_OFFSET 2000 + +#define FS_NO_BACKEND (-1) +#define FS_UNKNOWN (-2) +static int fs_poolid_map[MAX_INITIALIZABLE_FS]; +static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS]; +static char *uuids[MAX_INITIALIZABLE_FS]; +/* + * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads + * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple + * threads calling mount (and ending up in __cleancache_init_[shared|]fs). + */ +static DEFINE_MUTEX(poolid_mutex); +/* + * When set to false (default) all calls to the cleancache functions, except + * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded + * by the if (!cleancache_ops) return. This means multiple threads (from + * different filesystems) will be checking cleancache_ops. 
The usage of a + * bool instead of an atomic_t, or of a bool guarded by a spinlock, is OK: + * races in the window between when the backends have been initialized (and + * cleancache_ops has been set to non-NULL) and when the filesystems start + * actually calling the backends are harmless. The inverse (when unloading) + * is obviously not good - but this shim does not do that (yet). + */ + +/* + * The backends and filesystems all work asynchronously, because the + * backends can be built as modules. + * The usual sequence of events is: + * a) mount / -> __cleancache_init_fs is called. We set the + * [shared_|]fs_poolid_map and uuids for it. + * + * b) user does I/Os -> we call the rest of the __cleancache_* functions, + * which return immediately as cleancache_ops is NULL. + * + * c) modprobe zcache -> cleancache_register_ops. We init the backend + * and set cleancache_ops to non-NULL, and for any fs_poolid_map entry + * (which is set by __cleancache_init_fs) we initialize the poolid. + * + * d) user does I/Os -> now that cleancache_ops is non-NULL all the + * __cleancache_* functions can call the backend. They all check + * that fs_poolid_map is valid and if so invoke the backend. + * + * e) umount / -> __cleancache_invalidate_fs, the fs_poolid_map is + * reset (which is the second check in the __cleancache_* ops + * before calling the backend). + * + * The sequence of events could also be c), followed by a), d) and e); in + * that case c) simply does not happen again. There is also the chance of + * c), and one thread doing a) + d), and another doing e). For that case we + * depend on the filesystem calling __cleancache_invalidate_fs in the proper + * sequence (so that it handles all I/Os before it invalidates the fs, which + * is the last part of the unmounting process). + * + * Note: the astute reader will notice that there is no "rmmod zcache" case. + * This is because the functionality for that is not yet implemented and, + * when done, will require some extra locking not yet devised. + */ + +/* + * Register operations for cleancache, returning the previous ops, thus + * allowing detection of multiple backends and possible nesting. + */ -struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) +struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) { - struct cleancache_ops old = cleancache_ops; + struct cleancache_ops *old = cleancache_ops; + int i; - cleancache_ops = *ops; - cleancache_enabled = 1; + mutex_lock(&poolid_mutex); + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + if (fs_poolid_map[i] == FS_NO_BACKEND) + fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); + if (shared_fs_poolid_map[i] == FS_NO_BACKEND) + shared_fs_poolid_map[i] = ops->init_shared_fs + (uuids[i], PAGE_SIZE); + } + /* + * We MUST set cleancache_ops _after_ we have called the backends' + * init_fs or init_shared_fs functions. Otherwise the compiler might + * re-order where cleancache_ops is set in this function.
+ */ + barrier(); + cleancache_ops = ops; + mutex_unlock(&poolid_mutex); return old; } EXPORT_SYMBOL(cleancache_register_ops); @@ -61,15 +137,42 @@ EXPORT_SYMBOL(cleancache_register_ops); /* Called by a cleancache-enabled filesystem at time of mount */ void __cleancache_init_fs(struct super_block *sb) { - sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); + int i; + + mutex_lock(&poolid_mutex); + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + if (fs_poolid_map[i] == FS_UNKNOWN) { + sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; + if (cleancache_ops) + fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE); + else + fs_poolid_map[i] = FS_NO_BACKEND; + break; + } + } + mutex_unlock(&poolid_mutex); } EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) { - sb->cleancache_poolid = - (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); + int i; + + mutex_lock(&poolid_mutex); + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + if (shared_fs_poolid_map[i] == FS_UNKNOWN) { + sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; + uuids[i] = uuid; + if (cleancache_ops) + shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs + (uuid, PAGE_SIZE); + else + shared_fs_poolid_map[i] = FS_NO_BACKEND; + break; + } + } + mutex_unlock(&poolid_mutex); } EXPORT_SYMBOL(__cleancache_init_shared_fs); @@ -80,7 +183,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs); static int cleancache_get_key(struct inode *inode, struct cleancache_filekey *key) { - int (*fhfn)(struct dentry *, __u32 *fh, int *, int); + int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); int len = 0, maxlen = CLEANCACHE_KEY_MAX; struct super_block *sb = inode->i_sb; @@ -88,10 +191,8 @@ static int cleancache_get_key(struct inode *inode, if (sb->s_export_op != NULL) { fhfn = sb->s_export_op->encode_fh; if (fhfn) { - struct dentry d; - d.d_inode = inode; - len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); - if (len <= 0 || len == 255) + len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); + if (len <= FILEID_ROOT || len == FILEID_INVALID) return -1; if (maxlen > CLEANCACHE_KEY_MAX) return -1; @@ -101,27 +202,53 @@ static int cleancache_get_key(struct inode *inode, } /* + * Returns a pool_id that is associated with a given fake poolid. + */ +static int get_poolid_from_fake(int fake_pool_id) +{ + if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) + return shared_fs_poolid_map[fake_pool_id - + FAKE_SHARED_FS_POOLID_OFFSET]; + else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) + return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET]; + return FS_NO_BACKEND; +} + +/* * "Get" data from cleancache associated with the poolid/inode/index * that were specified when the data was put to cleanache and, if * successful, use it to fill the specified page with data and return 0. * The pageframe is unchanged and returns -1 if the get fails. * Page must be locked by caller. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. 
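+ *
+ * Illustrative only - a hypothetical call site (not part of this
+ * patch) in a filesystem read path would use the wrapper from
+ * linux/cleancache.h roughly as:
+ *
+ *	if (cleancache_get_page(page) == 0) {
+ *		SetPageUptodate(page);
+ *		unlock_page(page);
+ *		return 0;
+ *	}
+ *	... otherwise read the page from the backing device ...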
*/ int __cleancache_get_page(struct page *page) { int ret = -1; int pool_id; + int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; - VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id < 0) + if (!cleancache_ops) { + cleancache_failed_gets++; + goto out; + } + + VM_BUG_ON_PAGE(!PageLocked(page), page); + fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (fake_pool_id < 0) goto out; + pool_id = get_poolid_from_fake(fake_pool_id); if (cleancache_get_key(page->mapping->host, &key) < 0) goto out; - ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); + if (pool_id >= 0) + ret = cleancache_ops->get_page(pool_id, + key, page->index, page); if (ret == 0) cleancache_succ_gets++; else @@ -136,17 +263,32 @@ EXPORT_SYMBOL(__cleancache_get_page); * (previously-obtained per-filesystem) poolid and the page's, * inode and page index. Page must be locked. Note that a put_page * always "succeeds", though a subsequent get_page may succeed or fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_put_page(struct page *page) { int pool_id; + int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; - VM_BUG_ON(!PageLocked(page)); - pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (!cleancache_ops) { + cleancache_puts++; + return; + } + + VM_BUG_ON_PAGE(!PageLocked(page), page); + fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (fake_pool_id < 0) + return; + + pool_id = get_poolid_from_fake(fake_pool_id); + if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - (*cleancache_ops.put_page)(pool_id, key, page->index, page); + cleancache_get_key(page->mapping->host, &key) >= 0) { + cleancache_ops->put_page(pool_id, key, page->index, page); cleancache_puts++; } } @@ -155,19 +297,31 @@ EXPORT_SYMBOL(__cleancache_put_page); /* * Invalidate any data from cleancache associated with the poolid and the * page's inode and page index so that a subsequent "get" will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_invalidate_page(struct address_space *mapping, struct page *page) { /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id; + int fake_pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; - if (pool_id >= 0) { - VM_BUG_ON(!PageLocked(page)); + if (!cleancache_ops) + return; + + if (fake_pool_id >= 0) { + pool_id = get_poolid_from_fake(fake_pool_id); + if (pool_id < 0) + return; + + VM_BUG_ON_PAGE(!PageLocked(page), page); if (cleancache_get_key(mapping->host, &key) >= 0) { - (*cleancache_ops.invalidate_page)(pool_id, - key, page->index); + cleancache_ops->invalidate_page(pool_id, + key, page->index); cleancache_invalidates++; } } @@ -178,34 +332,63 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); * Invalidate all data from cleancache associated with the poolid and the * mappings's inode so that all subsequent gets to this poolid/inode * will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. 
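+ *
+ * Illustrative (hypothetical) caller, not from this patch: the
+ * truncate path invalidates the whole inode up front, e.g.
+ *
+ *	cleancache_invalidate_inode(mapping);
+ *	... then walk and release the page cache pages ...
+ *
+ * so that no stale tmem copy can satisfy a later get.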
*/ void __cleancache_invalidate_inode(struct address_space *mapping) { - int pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id; + int fake_pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) + return; + + if (fake_pool_id < 0) + return; + + pool_id = get_poolid_from_fake(fake_pool_id); + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - (*cleancache_ops.invalidate_inode)(pool_id, key); + cleancache_ops->invalidate_inode(pool_id, key); } EXPORT_SYMBOL(__cleancache_invalidate_inode); /* * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be reutrned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs + * note that pool_id is surrendered and may be returned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs. */ void __cleancache_invalidate_fs(struct super_block *sb) { - if (sb->cleancache_poolid >= 0) { - int old_poolid = sb->cleancache_poolid; - sb->cleancache_poolid = -1; - (*cleancache_ops.invalidate_fs)(old_poolid); + int index; + int fake_pool_id = sb->cleancache_poolid; + int old_poolid = fake_pool_id; + + mutex_lock(&poolid_mutex); + if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { + index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; + old_poolid = shared_fs_poolid_map[index]; + shared_fs_poolid_map[index] = FS_UNKNOWN; + uuids[index] = NULL; + } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) { + index = fake_pool_id - FAKE_FS_POOLID_OFFSET; + old_poolid = fs_poolid_map[index]; + fs_poolid_map[index] = FS_UNKNOWN; } + sb->cleancache_poolid = -1; + if (cleancache_ops) + cleancache_ops->invalidate_fs(old_poolid); + mutex_unlock(&poolid_mutex); } EXPORT_SYMBOL(__cleancache_invalidate_fs); static int __init init_cleancache(void) { + int i; + #ifdef CONFIG_DEBUG_FS struct dentry *root = debugfs_create_dir("cleancache", NULL); if (root == NULL) @@ -217,6 +400,10 @@ static int __init init_cleancache(void) debugfs_create_u64("invalidates", S_IRUGO, root, &cleancache_invalidates); #endif + for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { + fs_poolid_map[i] = FS_UNKNOWN; + shared_fs_poolid_map[i] = FS_UNKNOWN; + } return 0; } module_init(init_cleancache) diff --git a/mm/compaction.c b/mm/compaction.c index 74a8c825ff28..627dc2e4320f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -14,32 +14,30 @@ #include <linux/backing-dev.h> #include <linux/sysctl.h> #include <linux/sysfs.h> +#include <linux/balloon_compaction.h> +#include <linux/page-isolation.h> #include "internal.h" +#ifdef CONFIG_COMPACTION +static inline void count_compact_event(enum vm_event_item item) +{ + count_vm_event(item); +} + +static inline void count_compact_events(enum vm_event_item item, long delta) +{ + count_vm_events(item, delta); +} +#else +#define count_compact_event(item) do { } while (0) +#define count_compact_events(item, delta) do { } while (0) +#endif + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + #define CREATE_TRACE_POINTS #include <trace/events/compaction.h> -/* - * compact_control is used to track pages being migrated and the free pages - * they are being migrated to during memory compaction. The free_pfn starts - * at the end of a zone and migrate_pfn begins at the start. 
Movable pages - * are moved to the end of a zone during a compaction run and the run - * completes when free_pfn <= migrate_pfn - */ -struct compact_control { - struct list_head freepages; /* List of free pages to migrate to */ - struct list_head migratepages; /* List of pages being migrated */ - unsigned long nr_freepages; /* Number of isolated free pages */ - unsigned long nr_migratepages; /* Number of pages to migrate */ - unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ - - int order; /* order a direct compactor needs */ - int migratetype; /* MOVABLE, RECLAIMABLE etc */ - struct zone *zone; -}; - static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -54,37 +52,244 @@ static unsigned long release_freepages(struct list_head *freelist) return count; } -/* Isolate free pages onto a private freelist. Must hold zone->lock */ -static unsigned long isolate_freepages_block(struct zone *zone, - unsigned long blockpfn, - struct list_head *freelist) +static void map_pages(struct list_head *list) { - unsigned long zone_end_pfn, end_pfn; - int nr_scanned = 0, total_isolated = 0; - struct page *cursor; + struct page *page; - /* Get the last PFN we should scan for free pages at */ - zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); + list_for_each_entry(page, list, lru) { + arch_alloc_page(page, 0); + kernel_map_pages(page, 1, 1); + } +} - /* Find the first usable PFN in the block to initialse page cursor */ - for (; blockpfn < end_pfn; blockpfn++) { - if (pfn_valid_within(blockpfn)) - break; +static inline bool migrate_async_suitable(int migratetype) +{ + return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; +} + +#ifdef CONFIG_COMPACTION +/* Returns true if the pageblock should be scanned for pages to isolate. */ +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + if (cc->ignore_skip_hint) + return true; + + return !get_pageblock_skip(page); +} + +/* + * This function is called to clear all cached information on pageblocks that + * should be skipped for page isolation when the migrate and free page scanner + * meet. + */ +static void __reset_isolation_suitable(struct zone *zone) +{ + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); + unsigned long pfn; + + zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_free_pfn = end_pfn; + zone->compact_blockskip_flush = false; + + /* Walk the zone and mark every pageblock as suitable for isolation */ + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + + cond_resched(); + + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + continue; + + clear_pageblock_skip(page); + } +} + +void reset_isolation_suitable(pg_data_t *pgdat) +{ + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Only flush if a full compaction finished recently */ + if (zone->compact_blockskip_flush) + __reset_isolation_suitable(zone); + } +} + +/* + * If no pages were isolated then mark this pageblock to be skipped in the + * future. The information is later cleared by __reset_isolation_suitable(). 
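+ *
+ * An illustrative life cycle of the hint (a summary for the reader,
+ * not additional semantics):
+ *
+ *	a scan isolates nothing from a pageblock
+ *	  -> update_pageblock_skip() sets the pageblock's skip bit
+ *	later scans: isolation_suitable() returns false, block skipped
+ *	scanners meet -> compact_blockskip_flush is set, and kswapd
+ *	  going to sleep (or a restarted direct compaction)
+ *	  -> __reset_isolation_suitable() clears every skip bit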
+ */ +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) +{ + struct zone *zone = cc->zone; + + if (cc->ignore_skip_hint) + return; + + if (!page) + return; + + if (!nr_isolated) { + unsigned long pfn = page_to_pfn(page); + set_pageblock_skip(page); + + /* Update where compaction should restart */ + if (migrate_scanner) { + if (!cc->finished_update_migrate && + pfn > zone->compact_cached_migrate_pfn) + zone->compact_cached_migrate_pfn = pfn; + } else { + if (!cc->finished_update_free && + pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; + } + } +} +#else +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + return true; +} + +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) +{ +} +#endif /* CONFIG_COMPACTION */ + +static inline bool should_release_lock(spinlock_t *lock) +{ + return need_resched() || spin_is_contended(lock); +} + +/* + * Compaction requires the taking of some coarse locks that are potentially + * very heavily contended. Check if the process needs to be scheduled or + * if the lock is contended. For async compaction, back out in the event + * if contention is severe. For sync compaction, schedule. + * + * Returns true if the lock is held. + * Returns false if the lock is released and compaction should abort + */ +static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, + bool locked, struct compact_control *cc) +{ + if (should_release_lock(lock)) { + if (locked) { + spin_unlock_irqrestore(lock, *flags); + locked = false; + } + + /* async aborts if taking too long or contended */ + if (!cc->sync) { + cc->contended = true; + return false; + } + + cond_resched(); } + + if (!locked) + spin_lock_irqsave(lock, *flags); + return true; +} + +static inline bool compact_trylock_irqsave(spinlock_t *lock, + unsigned long *flags, struct compact_control *cc) +{ + return compact_checklock_irqsave(lock, flags, false, cc); +} + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + /* If the page is a large free page, then disallow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return false; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(get_pageblock_migratetype(page))) + return true; + + /* Otherwise skip the block */ + return false; +} + +/* + * Isolate free pages onto a private freelist. If @strict is true, will abort + * returning 0 on any invalid PFNs or non-free pages inside of the pageblock + * (even though it may still end up isolating some pages). + */ +static unsigned long isolate_freepages_block(struct compact_control *cc, + unsigned long blockpfn, + unsigned long end_pfn, + struct list_head *freelist, + bool strict) +{ + int nr_scanned = 0, total_isolated = 0; + struct page *cursor, *valid_page = NULL; + unsigned long flags; + bool locked = false; + bool checked_pageblock = false; + cursor = pfn_to_page(blockpfn); - /* Isolate free pages. This assumes the block is valid */ + /* Isolate free pages. 
*/ for (; blockpfn < end_pfn; blockpfn++, cursor++) { int isolated, i; struct page *page = cursor; - if (!pfn_valid_within(blockpfn)) - continue; nr_scanned++; + if (!pfn_valid_within(blockpfn)) + goto isolate_fail; + if (!valid_page) + valid_page = page; if (!PageBuddy(page)) - continue; + goto isolate_fail; + + /* + * The zone lock must be held to isolate freepages. + * Unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. For async compaction do not + * spin on the lock and we acquire the lock as late as + * possible. + */ + locked = compact_checklock_irqsave(&cc->zone->lock, &flags, + locked, cc); + if (!locked) + break; + + /* Recheck this is a suitable migration target under lock */ + if (!strict && !checked_pageblock) { + /* + * We need to check suitability of pageblock only once + * and this isolate_freepages_block() is called with + * pageblock range, so just check once is sufficient. + */ + checked_pageblock = true; + if (!suitable_migration_target(page)) + break; + } + + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) + goto isolate_fail; /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); @@ -98,125 +303,104 @@ static unsigned long isolate_freepages_block(struct zone *zone, if (isolated) { blockpfn += isolated - 1; cursor += isolated - 1; + continue; } - } - trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); - return total_isolated; -} +isolate_fail: + if (strict) + break; + else + continue; -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) -{ + } - int migratetype = get_pageblock_migratetype(page); + trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return false; + /* + * If strict isolation is requested by CMA then check that all the + * pages requested were isolated. If there were any failures, 0 is + * returned and CMA will fail. + */ + if (strict && blockpfn < end_pfn) + total_isolated = 0; - /* If the page is a large free page, then allow migration */ - if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); - /* If the block is MIGRATE_MOVABLE, allow migration */ - if (migratetype == MIGRATE_MOVABLE) - return true; + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (blockpfn == end_pfn) + update_pageblock_skip(cc, valid_page, total_isolated, false); - /* Otherwise skip the block */ - return false; + count_compact_events(COMPACTFREE_SCANNED, nr_scanned); + if (total_isolated) + count_compact_events(COMPACTISOLATED, total_isolated); + return total_isolated; } -/* - * Based on information in the current compact_control, find blocks - * suitable for isolating free pages from and then isolate them. +/** + * isolate_freepages_range() - isolate free pages. + * @start_pfn: The first PFN to start isolating. + * @end_pfn: The one-past-last PFN. + * + * Non-free pages, invalid PFNs, or zone boundaries within the + * [start_pfn, end_pfn) range are considered errors, cause function to + * undo its actions and return zero. + * + * Otherwise, function returns one-past-the-last PFN of isolated page + * (which may be greater then end_pfn if end fell in a middle of + * a free page). 
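+ *
+ * A sketch of the intended CMA-style caller (illustrative only, the
+ * exact call site lives outside this file):
+ *
+ *	struct compact_control cc = { .zone = zone, .sync = true };
+ *	INIT_LIST_HEAD(&cc.freepages);
+ *	INIT_LIST_HEAD(&cc.migratepages);
+ *	if (!isolate_freepages_range(&cc, start_pfn, end_pfn))
+ *		... fail: the range contained holes or busy pages ...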
*/ -static void isolate_freepages(struct zone *zone, - struct compact_control *cc) +unsigned long +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn) { - struct page *page; - unsigned long high_pfn, low_pfn, pfn; - unsigned long flags; - int nr_freepages = cc->nr_freepages; - struct list_head *freelist = &cc->freepages; - - /* - * Initialise the free scanner. The starting point is where we last - * scanned from (or the end of the zone if starting). The low point - * is the end of the pageblock the migration scanner is using. - */ - pfn = cc->free_pfn; - low_pfn = cc->migrate_pfn + pageblock_nr_pages; + unsigned long isolated, pfn, block_end_pfn; + LIST_HEAD(freelist); - /* - * Take care that if the migration scanner is at the end of the zone - * that the free scanner does not accidentally move to the next zone - * in the next isolation cycle. - */ - high_pfn = min(low_pfn, pfn); - - /* - * Isolate free pages until enough are available to migrate the - * pages on cc->migratepages. We stop searching if the migrate - * and free page scanners meet or enough free pages are isolated. - */ - for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; - pfn -= pageblock_nr_pages) { - unsigned long isolated; - - if (!pfn_valid(pfn)) - continue; + for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { + if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) + break; /* - * Check for overlapping nodes/zones. It's possible on some - * configurations to have a setup like - * node0 node1 node0 - * i.e. it's possible that all pages within a zones range of - * pages do not belong to a single zone. + * On subsequent iterations ALIGN() is actually not needed, + * but we keep it that we not to complicate the code. */ - page = pfn_to_page(pfn); - if (page_zone(page) != zone) - continue; + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); - /* Check the block is suitable for migration */ - if (!suitable_migration_target(page)) - continue; + isolated = isolate_freepages_block(cc, pfn, block_end_pfn, + &freelist, true); /* - * Found a block suitable for isolating free pages from. Now - * we disabled interrupts, double check things are ok and - * isolate the pages. This is to minimise the time IRQs - * are disabled + * In strict mode, isolate_freepages_block() returns 0 if + * there are any holes in the block (ie. invalid PFNs or + * non-free pages). */ - isolated = 0; - spin_lock_irqsave(&zone->lock, flags); - if (suitable_migration_target(page)) { - isolated = isolate_freepages_block(zone, pfn, freelist); - nr_freepages += isolated; - } - spin_unlock_irqrestore(&zone->lock, flags); + if (!isolated) + break; /* - * Record the highest PFN we isolated pages from. When next - * looking for free pages, the search will restart here as - * page migration may have returned some pages to the allocator + * If we managed to isolate pages, it is always (1 << n) * + * pageblock_nr_pages for some non-negative n. (Max order + * page may span two pageblocks). */ - if (isolated) - high_pfn = max(high_pfn, pfn); } /* split_free_page does not map the pages */ - list_for_each_entry(page, freelist, lru) { - arch_alloc_page(page, 0); - kernel_map_pages(page, 1, 1); + map_pages(&freelist); + + if (pfn < end_pfn) { + /* Loop terminated early, cleanup. */ + release_freepages(&freelist); + return 0; } - cc->free_pfn = high_pfn; - cc->nr_freepages = nr_freepages; + /* We don't use freelists for anything. 
*/ + return pfn; } /* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, struct compact_control *cc) +static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) { struct page *page; unsigned int count[2] = { 0, }; @@ -224,8 +408,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) list_for_each_entry(page, &cc->migratepages, lru) count[!!page_is_file_cache(page)]++; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + /* If locked we can use the interrupt unsafe versions */ + if (locked) { + __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } else { + mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } } /* Similar to reclaim, but different enough that they don't share logic */ @@ -243,37 +433,40 @@ static bool too_many_isolated(struct zone *zone) return isolated > (inactive + active) / 2; } -/* possible outcome of isolate_migratepages */ -typedef enum { - ISOLATE_ABORT, /* Abort compaction now */ - ISOLATE_NONE, /* No pages isolated, continue scanning */ - ISOLATE_SUCCESS, /* Pages isolated, migrate */ -} isolate_migrate_t; - -/* - * Isolate all pages that can be migrated from the block pointed to by - * the migrate scanner within compact_control. +/** + * isolate_migratepages_range() - isolate all migrate-able pages in range. + * @zone: Zone pages are in. + * @cc: Compaction control structure. + * @low_pfn: The first PFN of the range. + * @end_pfn: The one-past-the-last PFN of the range. + * @unevictable: true if isolating unevictable pages is allowed + * + * Isolate all pages that can be migrated from the range specified by + * [low_pfn, end_pfn). Returns zero if there is a fatal signal + * pending, otherwise the PFN of the first page that was not scanned + * (which may be less than, equal to, or greater than end_pfn). + * + * Assumes that cc->migratepages is empty and cc->nr_migratepages is + * zero. + * + * Apart from cc->migratepages and cc->nr_migratepages this function + * does not modify any of cc's fields; in particular it does not modify + * (or read, for that matter) cc->migrate_pfn. */ -static isolate_migrate_t isolate_migratepages(struct zone *zone, - struct compact_control *cc) +unsigned long +isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn, bool unevictable) { - unsigned long low_pfn, end_pfn; unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; - isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; - - /* Do not scan outside zone boundaries */ - low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); - - /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); - - /* Do not cross the free scanner or scan within a memory hole */ - if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { - cc->migrate_pfn = end_pfn; - return ISOLATE_NONE; - } + struct lruvec *lruvec; + unsigned long flags; + bool locked = false; + struct page *page = NULL, *valid_page = NULL; + bool skipped_async_unsuitable = false; + const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | + (unevictable ?
ISOLATE_UNEVICTABLE : 0); /* * Ensure that there are not too many pages isolated from the LRU @@ -283,35 +476,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ if (!cc->sync) - return ISOLATE_ABORT; + return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); if (fatal_signal_pending(current)) - return ISOLATE_ABORT; + return 0; } /* Time to isolate some pages for migration */ cond_resched(); - spin_lock_irq(&zone->lru_lock); for (; low_pfn < end_pfn; low_pfn++) { - struct page *page; - bool locked = true; - /* give a chance to irqs before checking need_resched() */ - if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { - spin_unlock_irq(&zone->lru_lock); - locked = false; + if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { + if (should_release_lock(&zone->lru_lock)) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; + } } - if (need_resched() || spin_is_contended(&zone->lru_lock)) { - if (locked) - spin_unlock_irq(&zone->lru_lock); - cond_resched(); - spin_lock_irq(&zone->lru_lock); - if (fatal_signal_pending(current)) - break; - } else if (!locked) - spin_lock_irq(&zone->lru_lock); /* * migrate_pfn does not necessarily start aligned to a @@ -340,48 +522,106 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, if (page_zone(page) != zone) continue; - /* Skip if free */ + if (!valid_page) + valid_page = page; + + /* If isolation recently failed, do not retry */ + pageblock_nr = low_pfn >> pageblock_order; + if (last_pageblock_nr != pageblock_nr) { + int mt; + + last_pageblock_nr = pageblock_nr; + if (!isolation_suitable(cc, page)) + goto next_pageblock; + + /* + * For async migration, also only scan in MOVABLE + * blocks. Async migration is optimistic to see if + * the minimum amount of work satisfies the allocation + */ + mt = get_pageblock_migratetype(page); + if (!cc->sync && !migrate_async_suitable(mt)) { + cc->finished_update_migrate = true; + skipped_async_unsuitable = true; + goto next_pageblock; + } + } + + /* + * Skip if free. page_order cannot be used without zone->lock + * as nothing prevents parallel allocations or buddy merging. + */ if (PageBuddy(page)) continue; /* - * For async migration, also only scan in MOVABLE blocks. Async - * migration is optimistic to see if the minimum amount of work - * satisfies the allocation + * Check may be lockless but that's ok as we recheck later. + * It's possible to migrate LRU pages and balloon pages + * Skip any other type of page */ - pageblock_nr = low_pfn >> pageblock_order; - if (!cc->sync && last_pageblock_nr != pageblock_nr && - get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { - low_pfn += pageblock_nr_pages; - low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; + if (!PageLRU(page)) { + if (unlikely(balloon_page_movable(page))) { + if (locked && balloon_page_isolate(page)) { + /* Successfully isolated */ + goto isolate_success; + } + } continue; } - if (!PageLRU(page)) + /* + * PageLRU is set. lru_lock normally excludes isolation + * splitting and collapsing (collapsing has already happened + * if PageLRU is set) but the lock is not necessarily taken + * here and it is wasteful to take it just to check transhuge. + * Check TransHuge without lock and skip the whole pageblock if + * it's either a transhuge or hugetlbfs page, as calling + * compound_order() without preventing THP from splitting the + * page underneath us may return surprising results. 
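+ *
+ * (The skip below advances low_pfn by (1 << compound_order(page)) - 1;
+ * together with the loop's own low_pfn++ this lands the scanner on the
+ * first pfn past the compound page.)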
+ */ + if (PageTransHuge(page)) { + if (!locked) + goto next_pageblock; + low_pfn += (1 << compound_order(page)) - 1; continue; + } /* - * PageLRU is set, and lru_lock excludes isolation, - * splitting and collapsing (collapsing has already - * happened if PageLRU is set). + * Migration will fail if an anonymous page is pinned in memory, + * so avoid taking lru_lock and isolating it unnecessarily in an + * admittedly racy check. */ + if (!page_mapping(page) && + page_count(page) > page_mapcount(page)) + continue; + + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); + if (!locked || fatal_signal_pending(current)) + break; + + /* Recheck PageLRU and PageTransHuge under lock */ + if (!PageLRU(page)) + continue; if (PageTransHuge(page)) { low_pfn += (1 << compound_order(page)) - 1; continue; } - if (!cc->sync) - mode |= ISOLATE_ASYNC_MIGRATE; + lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ - if (__isolate_lru_page(page, mode, 0) != 0) + if (__isolate_lru_page(page, mode) != 0) continue; - VM_BUG_ON(PageTransCompound(page)); + VM_BUG_ON_PAGE(PageTransCompound(page), page); /* Successfully isolated */ - del_page_from_lru_list(zone, page, page_lru(page)); + del_page_from_lru_list(page, lruvec, page_lru(page)); + +isolate_success: + cc->finished_update_migrate = true; list_add(&page->lru, migratelist); cc->nr_migratepages++; nr_isolated++; @@ -391,16 +631,145 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, ++low_pfn; break; } + + continue; + +next_pageblock: + low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; } - acct_isolated(zone, cc); + acct_isolated(zone, locked, cc); - spin_unlock_irq(&zone->lru_lock); - cc->migrate_pfn = low_pfn; + if (locked) + spin_unlock_irqrestore(&zone->lru_lock, flags); + + /* + * Update the pageblock-skip information and cached scanner pfn, + * if the whole pageblock was scanned without isolating any page. + * This is not done when pageblock was skipped due to being unsuitable + * for async compaction, so that eventual sync compaction can try. + */ + if (low_pfn == end_pfn && !skipped_async_unsuitable) + update_pageblock_skip(cc, valid_page, nr_isolated, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); - return ISOLATE_SUCCESS; + count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); + if (nr_isolated) + count_compact_events(COMPACTISOLATED, nr_isolated); + + return low_pfn; +} + +#endif /* CONFIG_COMPACTION || CONFIG_CMA */ +#ifdef CONFIG_COMPACTION +/* + * Based on information in the current compact_control, find blocks + * suitable for isolating free pages from and then isolate them. + */ +static void isolate_freepages(struct zone *zone, + struct compact_control *cc) +{ + struct page *page; + unsigned long high_pfn, low_pfn, pfn, z_end_pfn; + int nr_freepages = cc->nr_freepages; + struct list_head *freelist = &cc->freepages; + + /* + * Initialise the free scanner. The starting point is where we last + * successfully isolated from, zone-cached value, or the end of the + * zone when isolating for the first time. We need this aligned to + * the pageblock boundary, because we do pfn -= pageblock_nr_pages + * in the for loop. + * The low boundary is the end of the pageblock the migration scanner + * is using. 
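+ *
+ * Illustrative picture of the two scanners (exposition only):
+ *
+ *	zone_start_pfn                                zone_end_pfn
+ *	| migrate_pfn -->   . . . . .   <-- free_pfn |
+ *
+ * The migrate scanner walks up the zone, the free scanner walks down,
+ * and a compaction run completes when the two meet.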
+ */ + pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); + + /* + * Take care that if the migration scanner is at the end of the zone + * that the free scanner does not accidentally move to the next zone + * in the next isolation cycle. + */ + high_pfn = min(low_pfn, pfn); + + z_end_pfn = zone_end_pfn(zone); + + /* + * Isolate free pages until enough are available to migrate the + * pages on cc->migratepages. We stop searching if the migrate + * and free page scanners meet or enough free pages are isolated. + */ + for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; + pfn -= pageblock_nr_pages) { + unsigned long isolated; + unsigned long end_pfn; + + /* + * This can iterate a massively long zone without finding any + * suitable migration targets, so periodically check if we need + * to schedule. + */ + cond_resched(); + + if (!pfn_valid(pfn)) + continue; + + /* + * Check for overlapping nodes/zones. It's possible on some + * configurations to have a setup like + * node0 node1 node0 + * i.e. it's possible that all pages within a zones range of + * pages do not belong to a single zone. + */ + page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + + /* Check the block is suitable for migration */ + if (!suitable_migration_target(page)) + continue; + + /* If isolation recently failed, do not retry */ + if (!isolation_suitable(cc, page)) + continue; + + /* Found a block suitable for isolating free pages from */ + isolated = 0; + + /* + * Take care when isolating in last pageblock of a zone which + * ends in the middle of a pageblock. + */ + end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); + isolated = isolate_freepages_block(cc, pfn, end_pfn, + freelist, false); + nr_freepages += isolated; + + /* + * Record the highest PFN we isolated pages from. When next + * looking for free pages, the search will restart here as + * page migration may have returned some pages to the allocator + */ + if (isolated) { + cc->finished_update_free = true; + high_pfn = max(high_pfn, pfn); + } + } + + /* split_free_page does not map the pages */ + map_pages(freelist); + + /* + * If we crossed the migrate scanner, we want to keep it that way + * so that compact_finished() may detect this + */ + if (pfn < low_pfn) + cc->free_pfn = max(pfn, zone->zone_start_pfn); + else + cc->free_pfn = high_pfn; + cc->nr_freepages = nr_freepages; } /* @@ -449,6 +818,44 @@ static void update_nr_listpages(struct compact_control *cc) cc->nr_freepages = nr_freepages; } +/* possible outcome of isolate_migratepages */ +typedef enum { + ISOLATE_ABORT, /* Abort compaction now */ + ISOLATE_NONE, /* No pages isolated, continue scanning */ + ISOLATE_SUCCESS, /* Pages isolated, migrate */ +} isolate_migrate_t; + +/* + * Isolate all pages that can be migrated from the block pointed to by + * the migrate scanner within compact_control. 
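+ *
+ * In short: returns ISOLATE_NONE when the pageblock would cross the
+ * free scanner or sits in a memory hole, ISOLATE_ABORT on a fatal
+ * signal or async lock contention, and ISOLATE_SUCCESS once
+ * cc->migratepages has been populated.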
+ */ +static isolate_migrate_t isolate_migratepages(struct zone *zone, + struct compact_control *cc) +{ + unsigned long low_pfn, end_pfn; + + /* Do not scan outside zone boundaries */ + low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); + + /* Only scan within a pageblock boundary */ + end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + + /* Do not cross the free scanner or scan within a memory hole */ + if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { + cc->migrate_pfn = end_pfn; + return ISOLATE_NONE; + } + + /* Perform the isolation */ + low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); + if (!low_pfn || cc->contended) + return ISOLATE_ABORT; + + cc->migrate_pfn = low_pfn; + + return ISOLATE_SUCCESS; +} + static int compact_finished(struct zone *zone, struct compact_control *cc) { @@ -459,8 +866,22 @@ static int compact_finished(struct zone *zone, return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ - if (cc->free_pfn <= cc->migrate_pfn) + if (cc->free_pfn <= cc->migrate_pfn) { + /* Let the next compaction start anew. */ + zone->compact_cached_migrate_pfn = zone->zone_start_pfn; + zone->compact_cached_free_pfn = zone_end_pfn(zone); + + /* + * Mark that the PG_migrate_skip information should be cleared + * by kswapd when it goes to sleep. kswapd does not set the + * flag itself as the decision to be clear should be directly + * based on an allocation request. + */ + if (!current_is_kswapd()) + zone->compact_blockskip_flush = true; + return COMPACT_COMPLETE; + } /* * order == -1 is expected when compacting via @@ -478,12 +899,14 @@ static int compact_finished(struct zone *zone, /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + /* Job done if page is free of the right migratetype */ - if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) + if (!list_empty(&area->free_list[cc->migratetype])) return COMPACT_PARTIAL; /* Job done if allocation would set block type */ - if (order >= pageblock_order && zone->free_area[order].nr_free) + if (cc->order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; } @@ -543,6 +966,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -555,10 +980,31 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - /* Setup to move all movable pages to the end of the zone */ - cc->migrate_pfn = zone->zone_start_pfn; - cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; - cc->free_pfn &= ~(pageblock_nr_pages-1); + /* + * Clear pageblock skip if there were failures recently and compaction + * is about to be retried after being deferred. kswapd does not do + * this reset as it'll reset the cached information when going to sleep. + */ + if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + __reset_isolation_suitable(zone); + + /* + * Setup to move all movable pages to the end of the zone. Used cached + * information on where the scanners should start but check that it + * is initialised by ensuring the values are within zone boundaries. 
+ */ + cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { + cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + zone->compact_cached_free_pfn = cc->free_pfn; + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + cc->migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn = cc->migrate_pfn; + } + + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); migrate_prep_local(); @@ -569,6 +1015,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_PARTIAL; + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; goto out; case ISOLATE_NONE: continue; @@ -578,24 +1026,28 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, false, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); + (unsigned long)cc, + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + MR_COMPACTION); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; - count_vm_event(COMPACTBLOCKS); - count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); - if (nr_remaining) - count_vm_events(COMPACTPAGEFAILED, nr_remaining); trace_mm_compaction_migratepages(nr_migrate - nr_remaining, nr_remaining); - /* Release LRU pages not migrated */ + /* Release isolated pages not migrated */ if (err) { - putback_lru_pages(&cc->migratepages); + putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; + /* + * migrate_pages() may return -ENOMEM when scanners meet + * and we want compact_finished() to detect it + */ + if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { + ret = COMPACT_PARTIAL; + goto out; + } } - } out: @@ -603,13 +1055,16 @@ out: cc->nr_freepages -= release_freepages(&cc->freepages); VM_BUG_ON(cc->nr_freepages != 0); + trace_mm_compaction_end(ret); + return ret; } static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - bool sync) + bool sync, bool *contended) { + unsigned long ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -621,7 +1076,13 @@ static unsigned long compact_zone_order(struct zone *zone, INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - return compact_zone(zone, &cc); + ret = compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + + *contended = cc.contended; + return ret; } int sysctl_extfrag_threshold = 500; @@ -633,12 +1094,14 @@ int sysctl_extfrag_threshold = 500; * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from * @sync: Whether migration is synchronous or not + * @contended: Return value that is true if compaction was aborted due to lock contention + * @page: Optionally capture a free page of the requested order during compaction * * This is the main entry point for direct page compaction. 
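+ *
+ * A hypothetical caller in the allocator slow path (illustrative
+ * sketch only, simplified from what mm/page_alloc.c would do):
+ *
+ *	bool contended = false;
+ *	unsigned long rc;
+ *
+ *	rc = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
+ *				  sync_migration, &contended);
+ *	if (rc > COMPACT_SKIPPED)
+ *		... retry get_page_from_freelist() ...
+ *
+ * where sync_migration is whatever migration mode the caller chose.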
*/ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync) + bool sync, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -646,27 +1109,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; + int alloc_flags = 0; - /* - * Check whether it is worth even starting compaction. The order check is - * made because an assumption is made that the page allocator can satisfy - * the "cheaper" orders without taking special steps - */ + /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) return rc; - count_vm_event(COMPACTSTALL); + count_compact_event(COMPACTSTALL); +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync); + status = compact_zone_order(zone, order, gfp_mask, sync, + contended); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, + alloc_flags)) break; } @@ -675,7 +1141,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact all zones within a node */ -static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) +static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) { int zoneid; struct zone *zone; @@ -696,44 +1162,45 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) compact_zone(zone, cc); if (cc->order > 0) { - int ok = zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0); - if (ok && cc->order > zone->compact_order_failed) - zone->compact_order_failed = cc->order + 1; + if (zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0)) + compaction_defer_reset(zone, cc->order, false); /* Currently async compaction is never deferred. 
*/ - else if (!ok && cc->sync) + else if (cc->sync) defer_compaction(zone, cc->order); } VM_BUG_ON(!list_empty(&cc->freepages)); VM_BUG_ON(!list_empty(&cc->migratepages)); } - - return 0; } -int compact_pgdat(pg_data_t *pgdat, int order) +void compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, .sync = false, }; - return __compact_pgdat(pgdat, &cc); + if (!order) + return; + + __compact_pgdat(pgdat, &cc); } -static int compact_node(int nid) +static void compact_node(int nid) { struct compact_control cc = { .order = -1, .sync = true, + .ignore_skip_hint = true, }; - return __compact_pgdat(NODE_DATA(nid), &cc); + __compact_pgdat(NODE_DATA(nid), &cc); } /* Compact all nodes in the system */ -static int compact_nodes(void) +static void compact_nodes(void) { int nid; @@ -742,8 +1209,6 @@ static int compact_nodes(void) for_each_online_node(nid) compact_node(nid); - - return COMPACT_COMPLETE; } /* The written value is actually unused, all memory is compacted */ @@ -754,7 +1219,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { if (write) - return compact_nodes(); + compact_nodes(); return 0; } @@ -768,7 +1233,7 @@ int sysctl_extfrag_handler(struct ctl_table *table, int write, } #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) -ssize_t sysfs_compact_node(struct device *dev, +static ssize_t sysfs_compact_node(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -795,3 +1260,5 @@ void compaction_unregister_node(struct node *node) return device_remove_file(&node->dev, &dev_attr_compact); } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ + +#endif /* CONFIG_COMPACTION */ diff --git a/mm/dmapool.c b/mm/dmapool.c index c5ab33bca0a8..c69781e97cf9 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -50,7 +50,6 @@ struct dma_pool { /* the pool */ size_t allocation; size_t boundary; char name[32]; - wait_queue_head_t waitq; struct list_head pools; }; @@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ unsigned int offset; }; -#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000) - static DEFINE_MUTEX(pools_lock); static ssize_t @@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->size = size; retval->boundary = boundary; retval->allocation = allocation; - init_waitqueue_head(&retval->waitq); if (dev) { int ret; @@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) memset(page->vaddr, POOL_POISON_FREED, pool->allocation); #endif pool_initialise_page(pool, page); - list_add(&page->page_list, &pool->page_list); page->in_use = 0; page->offset = 0; } else { @@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, might_sleep_if(mem_flags & __GFP_WAIT); spin_lock_irqsave(&pool->lock, flags); - restart: list_for_each_entry(page, &pool->page_list, page_list) { if (page->offset < pool->allocation) goto ready; } - page = pool_alloc_page(pool, GFP_ATOMIC); - if (!page) { - if (mem_flags & __GFP_WAIT) { - DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_UNINTERRUPTIBLE); - __add_wait_queue(&pool->waitq, &wait); - spin_unlock_irqrestore(&pool->lock, flags); + /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */ + spin_unlock_irqrestore(&pool->lock, flags); - schedule_timeout(POOL_TIMEOUT_JIFFIES); + page = pool_alloc_page(pool, mem_flags); + if (!page) + return NULL; - spin_lock_irqsave(&pool->lock, flags); 
- __remove_wait_queue(&pool->waitq, &wait); - goto restart; - } - retval = NULL; - goto done; - } + spin_lock_irqsave(&pool->lock, flags); + list_add(&page->page_list, &pool->page_list); ready: page->in_use++; offset = page->offset; @@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, retval = offset + page->vaddr; *handle = offset + page->dma; #ifdef DMAPOOL_DEBUG + { + int i; + u8 *data = retval; + /* page->offset is stored in the first 4 bytes */ + for (i = sizeof(page->offset); i < pool->size; i++) { + if (data[i] == POOL_POISON_FREED) + continue; + if (pool->dev) + dev_err(pool->dev, + "dma_pool_alloc %s, %p (corrupted)\n", + pool->name, retval); + else + pr_err("dma_pool_alloc %s, %p (corrupted)\n", + pool->name, retval); + + /* + * Dump the first 4 bytes even if they are not + * POOL_POISON_FREED + */ + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, + data, pool->size, 1); + break; + } + } memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif - done: spin_unlock_irqrestore(&pool->lock, flags); return retval; } @@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) page->in_use--; *(int *)vaddr = page->offset; page->offset = offset; - if (waitqueue_active(&pool->waitq)) - wake_up_locked(&pool->waitq); /* * Resist a temptation to do * if (!is_page_busy(page)) pool_free_page(pool, page); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c new file mode 100644 index 000000000000..e10ccd299d66 --- /dev/null +++ b/mm/early_ioremap.c @@ -0,0 +1,245 @@ +/* + * Provide common bits of early_ioremap() support for architectures needing + * temporary mappings during boot before ioremap() is available. + * + * This is mostly a direct copy of the x86 early_ioremap implementation. + * + * (C) Copyright 1995 1996, 2014 Linus Torvalds + * + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <asm/fixmap.h> + +#ifdef CONFIG_MMU +static int early_ioremap_debug __initdata; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static int after_paging_init __initdata; + +void __init __weak early_ioremap_shutdown(void) +{ +} + +void __init early_ioremap_reset(void) +{ + early_ioremap_shutdown(); + after_paging_init = 1; +} + +/* + * Generally, ioremap() is available after paging_init() has been called. + * Architectures wanting to allow early_ioremap after paging_init() can + * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
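+ *
+ * A minimal sketch of such an override (the macro bodies here are
+ * illustrative, not a quote of any architecture; a real port points
+ * these at its own fixmap primitives, e.g. __set_fixmap()):
+ *
+ *	#define __late_set_fixmap(idx, phys, prot) \
+ *		__set_fixmap(idx, phys, prot)
+ *	#define __late_clear_fixmap(idx) \
+ *		__set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR)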
+ */ +#ifndef __late_set_fixmap +static inline void __init __late_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t prot) +{ + BUG(); +} +#endif + +#ifndef __late_clear_fixmap +static inline void __init __late_clear_fixmap(enum fixed_addresses idx) +{ + BUG(); +} +#endif + +static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + +void __init early_ioremap_setup(void) +{ + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (WARN_ON(prev_map[i])) + break; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); +} + +static int __init check_early_ioremap_leak(void) +{ + int count = 0; + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (WARN(count, KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n" + "Please boot with early_ioremap_debug and report the dmesg.\n", + count)) + return 1; + return 0; +} +late_initcall(check_early_ioremap_leak); + +static void __init __iomem * +__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) +{ + unsigned long offset; + resource_size_t last_addr; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + WARN_ON(system_state != SYSTEM_BOOTING); + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "%s(%08llx, %08lx): no free slot found\n", + __func__, (u64)phys_addr, size)) + return NULL; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (WARN_ON(!size || last_addr < phys_addr)) + return NULL; + + prev_size[slot] = size; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (WARN_ON(nrpages > NR_FIX_BTMAPS)) + return NULL; + + /* + * Ok, go for it..
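+ *
+ * (idx starts at the slot's first fixmap index and counts down:
+ * fixmap virtual addresses decrease as the index grows, so mapping
+ * idx, idx-1, ... lays the pages out contiguously upwards from
+ * slot_virt[slot].)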
+ */ + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + if (after_paging_init) + __late_set_fixmap(idx, phys_addr, prot); + else + __early_set_fixmap(idx, phys_addr, prot); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n", + __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]); + + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); + return prev_map[slot]; +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "early_iounmap(%p, %08lx): slot not found\n", + addr, size)) + return; + + if (WARN(prev_size[slot] != size, + "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot])) + return; + + WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n", + addr, size, slot); + + virt_addr = (unsigned long)addr; + if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) + return; + + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + if (after_paging_init) + __late_clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR); + --idx; + --nrpages; + } + prev_map[slot] = NULL; +} + +/* Remap an IO device */ +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO); +} + +/* Remap memory */ +void __init * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return (__force void *)__early_ioremap(phys_addr, size, + FIXMAP_PAGE_NORMAL); +} +#else /* CONFIG_MMU */ + +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return (__force void __iomem *)phys_addr; +} + +/* Remap memory */ +void __init * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return (void *)phys_addr; +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ +} + +#endif /* CONFIG_MMU */ + + +void __init early_memunmap(void *addr, unsigned long size) +{ + early_iounmap((__force void __iomem *)addr, size); +} diff --git a/mm/fadvise.c b/mm/fadvise.c index 469491e0af79..3bcfd81db45e 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -17,6 +17,7 @@ #include <linux/fadvise.h> #include <linux/writeback.h> #include <linux/syscalls.h> +#include <linux/swap.h> #include <asm/unistd.h> @@ -24,9 +25,9 @@ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced.
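 *
 * For reference, a typical userspace caller looks like this
 * (illustrative only, not part of this file):
 *
 *	posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);	// prefetch whole file
 *	... read the file ...
 *	posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);	// drop the cached pages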
*/ -SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) +SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) { - struct file *file = fget(fd); + struct fd f = fdget(fd); struct address_space *mapping; struct backing_dev_info *bdi; loff_t endbyte; /* inclusive */ @@ -35,15 +36,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) unsigned long nrpages; int ret = 0; - if (!file) + if (!f.file) return -EBADF; - if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { + if (S_ISFIFO(file_inode(f.file)->i_mode)) { ret = -ESPIPE; goto out; } - mapping = file->f_mapping; + mapping = f.file->f_mapping; if (!mapping || len < 0) { ret = -EINVAL; goto out; @@ -76,28 +77,23 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: - file->f_ra.ra_pages = bdi->ra_pages; - spin_lock(&file->f_lock); - file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&file->f_lock); + f.file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&f.file->f_lock); + f.file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&f.file->f_lock); break; case POSIX_FADV_RANDOM: - spin_lock(&file->f_lock); - file->f_mode |= FMODE_RANDOM; - spin_unlock(&file->f_lock); + spin_lock(&f.file->f_lock); + f.file->f_mode |= FMODE_RANDOM; + spin_unlock(&f.file->f_lock); break; case POSIX_FADV_SEQUENTIAL: - file->f_ra.ra_pages = bdi->ra_pages * 2; - spin_lock(&file->f_lock); - file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&file->f_lock); + f.file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&f.file->f_lock); + f.file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&f.file->f_lock); break; case POSIX_FADV_WILLNEED: - if (!mapping->a_ops->readpage) { - ret = -EINVAL; - break; - } - /* First and last PARTIAL page! */ start_index = offset >> PAGE_CACHE_SHIFT; end_index = endbyte >> PAGE_CACHE_SHIFT; @@ -106,12 +102,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) nrpages = end_index - start_index + 1; if (!nrpages) nrpages = ~0UL; - - ret = force_page_cache_readahead(mapping, file, - start_index, - nrpages); - if (ret > 0) - ret = 0; + + /* + * Ignore the return value because fadvise() shall return + * success even if the filesystem can't retrieve a hint. + */ + force_page_cache_readahead(mapping, f.file, start_index, + nrpages); break; case POSIX_FADV_NOREUSE: break; @@ -124,37 +121,36 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; end_index = (endbyte >> PAGE_CACHE_SHIFT); - if (end_index >= start_index) - invalidate_mapping_pages(mapping, start_index, + if (end_index >= start_index) { + unsigned long count = invalidate_mapping_pages(mapping, + start_index, end_index); + + /* + * If fewer pages were invalidated than expected then + * it is possible that some of the pages were on + * a per-cpu pagevec for a remote CPU. Drain all + * pagevecs and try again.
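+ *
+ * (lru_add_drain_all() below schedules work on each CPU to flush
+ * its local pagevecs back to the LRU lists, after which the retry
+ * can actually invalidate those pages.)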
+ */ + if (count < (end_index - start_index + 1)) { + lru_add_drain_all(); + invalidate_mapping_pages(mapping, start_index, end_index); + } + } break; default: ret = -EINVAL; } out: - fput(file); + fdput(f); return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice) -{ - return SYSC_fadvise64_64((int) fd, offset, len, (int) advice); -} -SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64); -#endif #ifdef __ARCH_WANT_SYS_FADVISE64 -SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice) +SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) { return sys_fadvise64_64(fd, offset, len, advice); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice) -{ - return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice); -} -SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64); -#endif #endif diff --git a/mm/filemap.c b/mm/filemap.c index 79c4b2b0b14e..088358c8006b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -29,13 +29,16 @@ #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/security.h> -#include <linux/syscalls.h> #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> #include <linux/cleancache.h> +#include <linux/rmap.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/filemap.h> + /* * FIXME: remove all knowledge of the buffer layer from the core VM */ @@ -74,7 +77,7 @@ * ->mmap_sem * ->lock_page (access_process_vm) * - * ->i_mutex (generic_file_buffered_write) + * ->i_mutex (generic_perform_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * * bdi->wb.list_lock @@ -105,15 +108,79 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ +static void page_cache_tree_delete(struct address_space *mapping, + struct page *page, void *shadow) +{ + struct radix_tree_node *node; + unsigned long index; + unsigned int offset; + unsigned int tag; + void **slot; + + VM_BUG_ON(!PageLocked(page)); + + __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + + if (shadow) { + mapping->nrshadows++; + /* + * Make sure the nrshadows update is committed before + * the nrpages update so that final truncate racing + * with reclaim does not see both counters 0 at the + * same time and miss a shadow entry. + */ + smp_wmb(); + } + mapping->nrpages--; + + if (!node) { + /* Clear direct pointer tags in root node */ + mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; + radix_tree_replace_slot(slot, shadow); + return; + } + + /* Clear tree tags for the removed page */ + index = page->index; + offset = index & RADIX_TREE_MAP_MASK; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (test_bit(offset, node->tags[tag])) + radix_tree_tag_clear(&mapping->page_tree, index, tag); + } + + /* Delete page, swap shadow entry */ + radix_tree_replace_slot(slot, shadow); + workingset_node_pages_dec(node); + if (shadow) + workingset_node_shadows_inc(node); + else + if (__radix_tree_delete_node(&mapping->page_tree, node)) + return; + + /* + * Track node that only contains shadow entries. + * + * Avoid acquiring the list_lru lock if already tracked. The + * list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. 
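+ *
+ * (The consumer side lives in mm/workingset.c: a shrinker walks
+ * workingset_shadow_nodes and, roughly, does
+ *
+ *	node = container_of(item, struct radix_tree_node, private_list);
+ *	mapping = node->private_data;
+ *	... clear remaining shadow slots under mapping->tree_lock ...
+ *	__radix_tree_delete_node(&mapping->page_tree, node);
+ *
+ * This is a simplified sketch of that path, not a verbatim quote.)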
+ */ + if (!workingset_node_pages(node) && + list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&workingset_shadow_nodes, &node->private_list); + } +} + /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock. */ -void __delete_from_page_cache(struct page *page) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; + trace_mm_filemap_delete_from_page_cache(page); /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. We can't leave @@ -124,10 +191,11 @@ void __delete_from_page_cache(struct page *page) else cleancache_invalidate_page(mapping, page); - radix_tree_delete(&mapping->page_tree, page->index); + page_cache_tree_delete(mapping, page, shadow); + page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); @@ -163,7 +231,7 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -185,6 +253,19 @@ static int sleep_on_page_killable(void *word) return fatal_signal_pending(current) ? -EINTR : 0; } +static int filemap_check_errors(struct address_space *mapping) +{ + int ret = 0; + /* Check for outstanding write errors */ + if (test_bit(AS_ENOSPC, &mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_bit(AS_EIO, &mapping->flags) && + test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + return ret; +} + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -266,10 +347,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; - int ret = 0; + int ret2, ret = 0; if (end_byte < start_byte) - return 0; + goto out; pagevec_init(&pvec, 0); while ((index <= end) && @@ -292,12 +373,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, pagevec_release(&pvec); cond_resched(); } - - /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) - ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) - ret = -EIO; +out: + ret2 = filemap_check_errors(mapping); + if (!ret) + ret = ret2; return ret; } @@ -338,6 +417,8 @@ int filemap_write_and_wait(struct address_space *mapping) if (!err) err = err2; } + } else { + err = filemap_check_errors(mapping); } return err; } @@ -369,6 +450,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, if (!err) err = err2; } + } else { + err = filemap_check_errors(mapping); } return err; } @@ -393,9 +476,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { int error; - VM_BUG_ON(!PageLocked(old)); - VM_BUG_ON(!PageLocked(new)); - VM_BUG_ON(new->mapping); + VM_BUG_ON_PAGE(!PageLocked(old), old); + VM_BUG_ON_PAGE(!PageLocked(new), new); + VM_BUG_ON_PAGE(new->mapping, new); error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (!error) { @@ -410,7 +493,7 @@ int 
replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->index = offset; spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(old); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -430,6 +513,91 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page, void **shadowp) +{ + struct radix_tree_node *node; + void **slot; + int error; + + error = __radix_tree_create(&mapping->page_tree, page->index, + &node, &slot); + if (error) + return error; + if (*slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + if (shadowp) + *shadowp = p; + mapping->nrshadows--; + if (node) + workingset_node_shadows_dec(node); + } + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + } + return 0; +} + +static int __add_to_page_cache_locked(struct page *page, + struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, + void **shadowp) +{ + int error; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageSwapBacked(page), page); + + error = mem_cgroup_charge_file(page, current->mm, + gfp_mask & GFP_RECLAIM_MASK); + if (error) + return error; + + error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); + if (error) { + mem_cgroup_uncharge_cache_page(page); + return error; + } + + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); + error = page_cache_tree_insert(mapping, page, shadowp); + radix_tree_preload_end(); + if (unlikely(error)) + goto err_insert; + __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); + trace_mm_filemap_add_to_page_cache(page); + return 0; +err_insert: + page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); + return error; +} + /** * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add @@ -443,51 +611,35 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page); int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error; - - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageSwapBacked(page)); - - error = mem_cgroup_cache_charge(page, current->mm, - gfp_mask & GFP_RECLAIM_MASK); - if (error) - goto out; - - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); - if (error == 0) { - page_cache_get(page); - page->mapping = mapping; - page->index = offset; - - spin_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); - if (likely(!error)) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - spin_unlock_irq(&mapping->tree_lock); - } else { - page->mapping = NULL; - /* Leave page->index set: truncation relies upon it */ - spin_unlock_irq(&mapping->tree_lock); - 
mem_cgroup_uncharge_cache_page(page); - page_cache_release(page); - } - radix_tree_preload_end(); - } else - mem_cgroup_uncharge_cache_page(page); -out: - return error; + return __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, NULL); } EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { + void *shadow = NULL; int ret; - ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add_file(page); + __set_page_locked(page); + ret = __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, &shadow); + if (unlikely(ret)) + __clear_page_locked(page); + else { + /* + * The page might have been evicted from cache only + * recently, in which case it should be activated like + * any other repeatedly accessed page. + */ + if (shadow && workingset_refault(shadow)) { + SetPageActive(page); + workingset_activation(page); + } else + ClearPageActive(page); + lru_cache_add(page); + } return ret; } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); @@ -501,10 +653,10 @@ struct page *__page_cache_alloc(gfp_t gfp) if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); page = alloc_pages_exact_node(n, gfp, 0); - } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); return page; } @@ -588,7 +740,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); */ void unlock_page(struct page *page) { - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); @@ -667,14 +819,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * find_get_page - find and get a page reference + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. + */ +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_next_hole); + +/** + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. 
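+ *
+ * For example, readahead can size its history window with a
+ * pattern like (sketch, cf. count_history_pages()):
+ *
+ *	head = page_cache_prev_hole(mapping, offset - 1, max);
+ *	history = offset - 1 - head;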
+ * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. + */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + +/** + * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. * - * Is there a pagecache struct page at the given (mapping, offset) tuple? - * If yes, increment its refcount and return it; if no, return NULL. + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. + * + * Otherwise, %NULL is returned. */ -struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { void **pagep; struct page *page; @@ -691,9 +930,9 @@ repeat: if (radix_tree_deref_retry(page)) goto repeat; /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so return it without - * attempting to raise page count. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. */ goto out; } @@ -715,24 +954,50 @@ out: return page; } -EXPORT_SYMBOL(find_get_page); +EXPORT_SYMBOL(find_get_entry); /** - * find_lock_page - locate, pin and lock a pagecache page + * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * - * Locates the desired pagecache page, locks it, increments its reference - * count and returns its address. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. * - * Returns zero if the page was not present. find_lock_page() may sleep. + * Otherwise, %NULL is returned. */ -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +{ + struct page *page = find_get_entry(mapping, offset); + + if (radix_tree_exceptional_entry(page)) + page = NULL; + return page; +} +EXPORT_SYMBOL(find_get_page); + +/** + * find_lock_entry - locate, pin and lock a page cache entry + * @mapping: the address_space to search + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. 
+ * + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. + * + * Otherwise, %NULL is returned. + * + * find_lock_entry() may sleep. + */ +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - page = find_get_page(mapping, offset); + page = find_get_entry(mapping, offset); if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ @@ -741,10 +1006,33 @@ repeat: page_cache_release(page); goto repeat; } - VM_BUG_ON(page->index != offset); + VM_BUG_ON_PAGE(page->index != offset, page); } return page; } +EXPORT_SYMBOL(find_lock_entry); + +/** + * find_lock_page - locate, pin and lock a pagecache page + * @mapping: the address_space to search + * @offset: the page index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * Otherwise, %NULL is returned. + * + * find_lock_page() may sleep. + */ +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +{ + struct page *page = find_lock_entry(mapping, offset); + + if (radix_tree_exceptional_entry(page)) + page = NULL; + return page; +} EXPORT_SYMBOL(find_lock_page); /** @@ -753,16 +1041,18 @@ EXPORT_SYMBOL(find_lock_page); * @index: the page's index into the mapping * @gfp_mask: page allocation mode * - * Locates a page in the pagecache. If the page is not present, a new page - * is allocated using @gfp_mask and is added to the pagecache and to the VM's - * LRU list. The returned page is locked and has its reference count - * incremented. + * Looks up the page cache slot at @mapping & @index. If there is a + * page cache page, it is returned locked and with an increased + * refcount. * - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic - * allocation! + * If the page is not present, a new page is allocated using @gfp_mask + * and added to the page cache and the VM's LRU list. The page is + * returned locked and with an increased refcount. * - * find_or_create_page() returns the desired page's address, or zero on - * memory exhaustion. + * On memory exhaustion, %NULL is returned. + * + * find_or_create_page() may sleep, even if @gfp_mask specifies an + * atomic allocation! */ struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask) @@ -795,6 +1085,76 @@ repeat: EXPORT_SYMBOL(find_or_create_page); /** + * find_get_entries - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page cache index + * @nr_entries: The maximum number of entries + * @entries: Where the resulting entries are placed + * @indices: The cache indices corresponding to the entries in @entries + * + * find_get_entries() will search for and return a group of up to + * @nr_entries entries in the mapping. The entries are placed at + * @entries. find_get_entries() takes a reference against any actual + * pages it returns. + * + * The search returns a group of mapping-contiguous page cache entries + * with ascending indexes. There may be holes in the indices due to + * not-present pages. + * + * Any shadow entries of evicted pages, or swap entries from + * shmem/tmpfs, are included in the returned array. + * + * find_get_entries() returns the number of pages and shadow entries + * which were found.
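+ *
+ * A typical caller follows this pattern (sketch; the truncate and
+ * shmem paths use it via pagevecs):
+ *
+ *	pgoff_t indices[PAGEVEC_SIZE];
+ *	struct page *pages[PAGEVEC_SIZE];
+ *	unsigned i, nr;
+ *
+ *	nr = find_get_entries(mapping, index, PAGEVEC_SIZE, pages, indices);
+ *	for (i = 0; i < nr; i++) {
+ *		if (radix_tree_exceptional_entry(pages[i]))
+ *			continue;	// shadow/swap entry, no ref held
+ *		// ... use pages[i] and indices[i] ...
+ *		page_cache_release(pages[i]);
+ *	}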
+ */ +unsigned find_get_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} + +/** * find_get_pages - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page index @@ -840,9 +1200,9 @@ repeat: goto restart; } /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so skip over it - - * we only reach this from invalidate_mapping_pages(). + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Skip + * over it. */ continue; } @@ -907,9 +1267,9 @@ repeat: goto restart; } /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so stop looking for - * contiguous pages. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Stop + * looking for contiguous pages. */ break; } @@ -983,10 +1343,17 @@ repeat: goto restart; } /* - * This function is never used on a shmem/tmpfs - * mapping, so a swap entry won't be found here. + * A shadow entry of a recently evicted page. + * + * Those entries should never be tagged, but + * this tree walk is lockless and the tags are + * looked up in bulk, one radix tree node at a + * time, so there is a sizable window for page + * reclaim to evict a page we saw tagged. + * + * Skip over it. */ - BUG(); + continue; } if (!page_cache_get_speculative(page)) @@ -1070,8 +1437,8 @@ static void shrink_readahead_size_eio(struct file *filp, * do_generic_file_read - generic file read routine * @filp: the file to read * @ppos: current file position - * @desc: read_descriptor - * @actor: read method + * @iter: data destination + * @written: already copied * * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level stuff. @@ -1079,8 +1446,8 @@ static void shrink_readahead_size_eio(struct file *filp, * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. 
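 *
 * Roughly, ignoring readahead and the error paths, the loop below is:
 *
 *	for (;;) {
 *		page = find_get_page(mapping, index);
 *		if (!page || !PageUptodate(page))
 *			... lock it, ->readpage(), wait ...
 *		ret = copy_page_to_iter(page, offset, nr, iter);
 *		if (!iov_iter_count(iter))
 *			break;
 *	}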
*/ -static void do_generic_file_read(struct file *filp, loff_t *ppos, - read_descriptor_t *desc, read_actor_t actor) +static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, + struct iov_iter *iter, ssize_t written) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -1090,12 +1457,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos, pgoff_t prev_index; unsigned long offset; /* offset into pagecache page */ unsigned int prev_offset; - int error; + int error = 0; index = *ppos >> PAGE_CACHE_SHIFT; prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); - last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; for (;;) { @@ -1130,7 +1497,7 @@ find_page: if (!page->mapping) goto page_not_up_to_date_locked; if (!mapping->a_ops->is_partially_uptodate(page, - desc, offset)) + offset, iter->count)) goto page_not_up_to_date_locked; unlock_page(page); } @@ -1180,23 +1547,23 @@ page_ok: /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). */ - ret = actor(desc, page, offset, nr); + + ret = copy_page_to_iter(page, offset, nr, iter); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; prev_offset = offset; page_cache_release(page); - if (ret == nr && desc->count) - continue; - goto out; + written += ret; + if (!iov_iter_count(iter)) + goto out; + if (ret < nr) { + error = -EFAULT; + goto out; + } + continue; page_not_up_to_date: /* Get exclusive access to the page ... */ @@ -1231,6 +1598,7 @@ readpage: if (unlikely(error)) { if (error == AOP_TRUNCATED_PAGE) { page_cache_release(page); + error = 0; goto find_page; } goto readpage_error; @@ -1261,7 +1629,6 @@ readpage: readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ - desc->error = error; page_cache_release(page); goto out; @@ -1272,16 +1639,17 @@ no_cached_page: */ page = page_cache_alloc_cold(mapping); if (!page) { - desc->error = -ENOMEM; + error = -ENOMEM; goto out; } error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { page_cache_release(page); - if (error == -EEXIST) + if (error == -EEXIST) { + error = 0; goto find_page; - desc->error = error; + } goto out; } goto readpage; @@ -1294,44 +1662,7 @@ out: *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; file_accessed(filp); -} - -int file_read_actor(read_descriptor_t *desc, struct page *page, - unsigned long offset, unsigned long size) -{ - char *kaddr; - unsigned long left, count = desc->count; - - if (size > count) - size = count; - - /* - * Faults on the destination of a read are common, so do it before - * taking the kmap. 
- */ - if (!fault_in_pages_writeable(desc->arg.buf, size)) { - kaddr = kmap_atomic(page); - left = __copy_to_user_inatomic(desc->arg.buf, - kaddr + offset, size); - kunmap_atomic(kaddr); - if (left == 0) - goto success; - } - - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_to_user(desc->arg.buf, kaddr + offset, size); - kunmap(page); - - if (left) { - size -= left; - desc->error = -EFAULT; - } -success: - desc->count = count - size; - desc->written += size; - desc->arg.buf += size; - return size; + return written ? written : error; } /* @@ -1389,14 +1720,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; ssize_t retval; - unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; + struct iov_iter i; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; + iov_iter_init(&i, iov, nr_segs, count, 0); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { @@ -1409,113 +1741,42 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; /* skip atime */ size = i_size_read(inode); - if (pos < size) { - retval = filemap_write_and_wait_range(mapping, pos, + retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); - if (!retval) { - struct blk_plug plug; - - blk_start_plug(&plug); - retval = mapping->a_ops->direct_IO(READ, iocb, - iov, pos, nr_segs); - blk_finish_plug(&plug); - } - if (retval > 0) { - *ppos = pos + retval; - count -= retval; - } - + if (!retval) { + retval = mapping->a_ops->direct_IO(READ, iocb, + iov, pos, nr_segs); + } + if (retval > 0) { + *ppos = pos + retval; + count -= retval; /* - * Btrfs can have a short DIO read if we encounter - * compressed extents, so if there was an error, or if - * we've already read everything we wanted to, or if - * there was a short read because we hit EOF, go ahead - * and return. Otherwise fallthrough to buffered io for - * the rest of the read. + * If we did a short DIO read we need to skip the + * section of the iov that we've already read data into. */ - if (retval < 0 || !count || *ppos >= size) { - file_accessed(filp); - goto out; - } + iov_iter_advance(&i, retval); } - } - - count = retval; - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - loff_t offset = 0; /* - * If we did a short DIO read we need to skip the section of the - * iov that we've already read data into. + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. 
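+ *
+ * (A "short" read here is retval > 0 but less than count;
+ * iov_iter_advance() above has already consumed the bytes the
+ * direct read delivered, so the buffered path resumes exactly
+ * where it stopped.)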
*/ - if (count) { - if (count > iov[seg].iov_len) { - count -= iov[seg].iov_len; - continue; - } - offset = count; - count = 0; - } - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base + offset; - desc.count = iov[seg].iov_len - offset; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp, ppos, &desc, file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; + if (retval < 0 || !count || *ppos >= size) { + file_accessed(filp); + goto out; } - if (desc.count > 0) - break; } + + retval = do_generic_file_read(filp, ppos, &i, retval); out: return retval; } EXPORT_SYMBOL(generic_file_aio_read); -static ssize_t -do_readahead(struct address_space *mapping, struct file *filp, - pgoff_t index, unsigned long nr) -{ - if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) - return -EINVAL; - - force_page_cache_readahead(mapping, filp, index, nr); - return 0; -} - -SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) -{ - ssize_t ret; - struct file *file; - - ret = -EBADF; - file = fget(fd); - if (file) { - if (file->f_mode & FMODE_READ) { - struct address_space *mapping = file->f_mapping; - pgoff_t start = offset >> PAGE_CACHE_SHIFT; - pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; - unsigned long len = end - start + 1; - ret = do_readahead(mapping, file, start, len); - } - fput(file); - } - return ret; -} -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_readahead(long fd, loff_t offset, long count) -{ - return SYSC_readahead((int) fd, offset, (size_t) count); -} -SYSCALL_ALIAS(sys_readahead, SyS_readahead); -#endif - #ifdef CONFIG_MMU /** * page_cache_read - adds requested page to the page cache if not already there @@ -1564,12 +1825,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) + if (vma->vm_flags & VM_RAND_READ) return; if (!ra->ra_pages) return; - if (VM_SequentialReadHint(vma)) { + if (vma->vm_flags & VM_SEQ_READ) { page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); return; @@ -1609,7 +1870,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) + if (vma->vm_flags & VM_RAND_READ) return; if (ra->mmap_miss > 0) ra->mmap_miss--; @@ -1639,24 +1900,24 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; - pgoff_t size; + loff_t size; int ret = 0; - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (offset >= size) + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (offset >= size >> PAGE_CACHE_SHIFT) return VM_FAULT_SIGBUS; /* * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); - if (likely(page)) { + if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* * We found the page, so try async readahead before * waiting for the lock. 
*/ do_async_mmap_readahead(vma, ra, file, page, offset); - } else { + } else if (!page) { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); @@ -1679,7 +1940,7 @@ retry_find: put_page(page); goto retry_find; } - VM_BUG_ON(page->index != offset); + VM_BUG_ON_PAGE(page->index != offset, page); /* * We have a locked page in the page cache, now we need to check @@ -1692,8 +1953,8 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(offset >= size)) { + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; @@ -1751,8 +2012,110 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct radix_tree_iter iter; + void **slot; + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + loff_t size; + struct page *page; + unsigned long address = (unsigned long) vmf->virtual_address; + unsigned long addr; + pte_t *pte; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { + if (iter.index > vmf->max_pgoff) + break; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + goto next; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + break; + else + goto next; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + if (!PageUptodate(page) || + PageReadahead(page) || + PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + + if (page->mapping != mapping || !PageUptodate(page)) + goto unlock; + + size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE); + if (page->index >= size >> PAGE_CACHE_SHIFT) + goto unlock; + + pte = vmf->pte + page->index - vmf->pgoff; + if (!pte_none(*pte)) + goto unlock; + + if (file->f_ra.mmap_miss > 0) + file->f_ra.mmap_miss--; + addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; + do_set_pte(vma, addr, page, pte, false, false); + unlock_page(page); + goto next; +unlock: + unlock_page(page); +skip: + page_cache_release(page); +next: + if (iter.index == vmf->max_pgoff) + break; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(filemap_map_pages); + +int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + int ret = VM_FAULT_LOCKED; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + lock_page(page); + if (page->mapping != inode->i_mapping) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + /* + * We mark the page dirty already here so that when freeze is in + * progress, we are guaranteed that writeback during freezing will + * see the dirty page and writeprotect it again. 
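+ *
+ * (sb_start_pagefault() above is the other half of this scheme:
+ * once a freeze has passed SB_FREEZE_PAGEFAULT, new faults block
+ * there, so every page dirtied here is visible to freeze-time
+ * writeback.)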
+ */ + set_page_dirty(page); + wait_for_stable_page(page); +out: + sb_end_pagefault(inode->i_sb); + return ret; +} +EXPORT_SYMBOL(filemap_page_mkwrite); + const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ @@ -1765,7 +2128,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -1792,6 +2154,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), @@ -1818,6 +2192,8 @@ repeat: if (err < 0) { page_cache_release(page); page = ERR_PTR(err); + } else { + page = wait_on_page_read(page); } } return page; @@ -1854,6 +2230,10 @@ retry: if (err < 0) { page_cache_release(page); return ERR_PTR(err); + } else { + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; } out: mark_page_accessed(page); @@ -1861,40 +2241,25 @@ out: } /** - * read_cache_page_async - read into page cache, fill it if needed + * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read * @data: first arg to filler(data, page) function, often left as NULL * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. + * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. */ -struct page *read_cache_page_async(struct address_space *mapping, +struct page *read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); } -EXPORT_SYMBOL(read_cache_page_async); - -static struct page *wait_on_page_read(struct page *page) -{ - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - } - return page; -} +EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. @@ -1913,240 +2278,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping, { filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); + return do_read_cache_page(mapping, index, filler, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: first arg to filler(data, page) function, often left as NULL - * - * Read into the page cache. 
If a page already exists, and PageUptodate() is - * not set, try to fill the page then wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) -{ - return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); -} -EXPORT_SYMBOL(read_cache_page); - -/* - * The logic we want is - * - * if suid or (sgid and xgrp) - * remove privs - */ -int should_remove_suid(struct dentry *dentry) -{ - umode_t mode = dentry->d_inode->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) - return kill; - - return 0; -} -EXPORT_SYMBOL(should_remove_suid); - -static int __remove_suid(struct dentry *dentry, int kill) -{ - struct iattr newattrs; - - newattrs.ia_valid = ATTR_FORCE | kill; - return notify_change(dentry, &newattrs); -} - -int file_remove_suid(struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - int killsuid; - int killpriv; - int error = 0; - - /* Fast path for nothing security related */ - if (IS_NOSEC(inode)) - return 0; - - killsuid = should_remove_suid(dentry); - killpriv = security_inode_need_killpriv(dentry); - - if (killpriv < 0) - return killpriv; - if (killpriv) - error = security_inode_killpriv(dentry); - if (!error && killsuid) - error = __remove_suid(dentry, killsuid); - if (!error && (inode->i_sb->s_flags & MS_NOSEC)) - inode->i_flags |= S_NOSEC; - - return error; -} -EXPORT_SYMBOL(file_remove_suid); - -static size_t __iovec_copy_from_user_inatomic(char *vaddr, - const struct iovec *iov, size_t base, size_t bytes) -{ - size_t copied = 0, left = 0; - - while (bytes) { - char __user *buf = iov->iov_base + base; - int copy = min(bytes, iov->iov_len - base); - - base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); - copied += copy; - bytes -= copy; - vaddr += copy; - iov++; - - if (unlikely(left)) - break; - } - return copied - left; -} - -/* - * Copy as much as we can into the page and return the number of bytes which - * were successfully copied. If a fault is encountered then return the number of - * bytes which were copied. - */ -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - BUG_ON(!in_atomic()); - kaddr = kmap_atomic(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap_atomic(kaddr); - - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); - -/* - * This has the same sideeffects and return value as - * iov_iter_copy_from_user_atomic(). - * The difference is that it attempts to resolve faults. - * Page must not be locked. 
- */ -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap(page); - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user); - -void iov_iter_advance(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct iovec *iov = i->iov; - size_t base = i->iov_offset; - unsigned long nr_segs = i->nr_segs; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). - */ - while (bytes || unlikely(i->count && !iov->iov_len)) { - int copy; - - copy = min(bytes, iov->iov_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - nr_segs--; - base = 0; - } - } - i->iov = iov; - i->iov_offset = base; - i->nr_segs = nr_segs; - } -} -EXPORT_SYMBOL(iov_iter_advance); - -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). - */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - -/* - * Return the count of just the current iov_iter segment. 
- */ -size_t iov_iter_single_seg_count(struct iov_iter *i) -{ - const struct iovec *iov = i->iov; - if (i->nr_segs == 1) - return i->count; - else - return min(i->count, iov->iov_len - i->iov_offset); -} -EXPORT_SYMBOL(iov_iter_single_seg_count); - /* * Performs necessary checks before doing a write * @@ -2253,7 +2388,7 @@ EXPORT_SYMBOL(pagecache_write_end); ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long *nr_segs, loff_t pos, loff_t *ppos, + unsigned long *nr_segs, loff_t pos, size_t count, size_t ocount) { struct file *file = iocb->ki_filp; @@ -2314,7 +2449,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = pos; + iocb->ki_pos = pos; } out: return written; @@ -2355,12 +2490,12 @@ repeat: return NULL; } found: - wait_on_page_writeback(page); + wait_for_stable_page(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); -static ssize_t generic_perform_write(struct file *file, +ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { struct address_space *mapping = file->f_mapping; @@ -2410,9 +2545,7 @@ again: if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - pagefault_enable(); flush_dcache_page(page); mark_page_accessed(page); @@ -2450,34 +2583,13 @@ again: return written ? written : status; } - -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) -{ - struct file *file = iocb->ki_filp; - ssize_t status; - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, written); - status = generic_perform_write(file, &i, pos); - - if (likely(status >= 0)) { - written += status; - *ppos = pos + status; - } - - return written ? written : status; -} -EXPORT_SYMBOL(generic_file_buffered_write); +EXPORT_SYMBOL(generic_perform_write); /** * __generic_file_aio_write - write data to a file * @iocb: IO state structure (file, offset, etc.) * @iov: vector with data to write * @nr_segs: number of segments in the vector - * @ppos: position where to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates @@ -2492,16 +2604,18 @@ EXPORT_SYMBOL(generic_file_buffered_write); * avoid syncing under i_mutex. 
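 *
 * The expected calling pattern is therefore (compare
 * generic_file_aio_write() below):
 *
 *	mutex_lock(&inode->i_mutex);
 *	ret = __generic_file_aio_write(iocb, iov, nr_segs);
 *	mutex_unlock(&inode->i_mutex);
 *	if (ret > 0) {
 *		ssize_t err;
 *
 *		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
 *		if (err < 0)
 *			ret = err;
 *	}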
*/ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) + unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; - loff_t pos; - ssize_t written; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; ssize_t err; + ssize_t status; + struct iov_iter from; ocount = 0; err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); @@ -2509,14 +2623,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return err; count = ocount; - pos = *ppos; - - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - written = 0; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; @@ -2528,47 +2637,51 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; - file_update_time(file); + err = file_update_time(file); + if (err) + goto out; + + iov_iter_init(&from, iov, nr_segs, count, 0); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { loff_t endbyte; - ssize_t written_buffered; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - ppos, count, ocount); + written = generic_file_direct_write(iocb, iov, &from.nr_segs, pos, + count, ocount); if (written < 0 || written == count) goto out; + iov_iter_advance(&from, written); + /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ pos += written; count -= written; - written_buffered = generic_file_buffered_write(iocb, iov, - nr_segs, pos, ppos, count, - written); + + status = generic_perform_write(file, &from, pos); /* - * If generic_file_buffered_write() retuned a synchronous error + * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were * direct-written, or the error code if that was zero. Note * that this differs from normal direct-io semantics, which * will return -EFOO even if some bytes were written. */ - if (written_buffered < 0) { - err = written_buffered; + if (unlikely(status < 0) && !written) { + err = status; goto out; } - + iocb->ki_pos = pos + status; /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. 
*/ - endbyte = pos + written_buffered - written - 1; + endbyte = pos + status - 1; err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); if (err == 0) { - written = written_buffered; + written += status; invalidate_mapping_pages(mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); @@ -2579,8 +2692,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, */ } } else { - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); + written = generic_perform_write(file, &from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; } out: current->backing_dev_info = NULL; @@ -2604,24 +2718,21 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); - blk_start_plug(&plug); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); - if (ret > 0 || ret == -EIOCBQUEUED) { + if (ret > 0) { ssize_t err; - err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) ret = err; } - blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(generic_file_aio_write); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index a4eb31132229..d8d9fe3f685c 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -26,7 +26,7 @@ * of ZERO_PAGE(), such as /dev/zero */ static DEFINE_MUTEX(xip_sparse_mutex); -static seqcount_t xip_sparse_seq = SEQCNT_ZERO; +static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); static struct page *__xip_sparse_page; /* called under xip_sparse_mutex */ @@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, { struct vm_area_struct *vma; struct mm_struct *mm; - struct prio_tree_iter iter; unsigned long address; pte_t *pte; pte_t pteval; @@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, retry: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); @@ -193,11 +192,13 @@ retry: if (pte) { /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page); dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); + /* must invalidate_page _before_ freeing the page */ + mmu_notifier_invalidate_page(mm, address); page_cache_release(page); } } @@ -304,6 +305,8 @@ out: static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, + .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) @@ -312,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) file_accessed(file); vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; + vma->vm_flags |= VM_MIXEDMAP; return 0; } EXPORT_SYMBOL_GPL(xip_file_mmap); @@ -411,8 +414,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, pos = *ppos; count = len; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -426,7 +427,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (ret) goto out_backing; - file_update_time(filp); + ret = file_update_time(filp); + if (ret) + goto out_backing; ret = __xip_file_write (filp, buf, count, pos, ppos); diff --git a/mm/fremap.c b/mm/fremap.c index 9ed4fd432467..34feba60a17e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -5,6 +5,7 @@ * * started by Ingo Molnar, Copyright (C) 2002, 2003 */ +#include <linux/export.h> #include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/swap.h> @@ -22,28 +23,44 @@ #include "internal.h" +static int mm_counter(struct page *page) +{ + return PageAnon(page) ? 
MM_ANONPAGES : MM_FILEPAGES; +} + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; + struct page *page; + swp_entry_t entry; if (pte_present(pte)) { - struct page *page; - flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); page = vm_normal_page(vma, addr, pte); if (page) { if (pte_dirty(pte)) set_page_dirty(page); + update_hiwater_rss(mm); + dec_mm_counter(mm, mm_counter(page)); page_remove_rmap(page); page_cache_release(page); + } + } else { /* zap_pte() is not called when pte_none() */ + if (!pte_file(pte)) { update_hiwater_rss(mm); - dec_mm_counter(mm, MM_FILEPAGES); + entry = pte_to_swp_entry(pte); + if (non_swap_entry(entry)) { + if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + dec_mm_counter(mm, mm_counter(page)); + } + } else { + free_swap_and_cache(entry); + dec_mm_counter(mm, MM_SWAPENTS); + } } - } else { - if (!pte_file(pte)) - free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear_not_present_full(mm, addr, ptep, 0); } } @@ -56,17 +73,22 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot) { int err = -ENOMEM; - pte_t *pte; + pte_t *pte, ptfile; spinlock_t *ptl; pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; - if (!pte_none(*pte)) + ptfile = pgoff_to_pte(pgoff); + + if (!pte_none(*pte)) { + if (pte_present(*pte) && pte_soft_dirty(*pte)) + pte_file_mksoft_dirty(ptfile); zap_pte(mm, vma, addr, pte); + } - set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); + set_pte_at(mm, addr, pte, ptfile); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a @@ -80,9 +102,10 @@ out: return err; } -static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) +int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff) { + struct mm_struct *mm = vma->vm_mm; int err; do { @@ -95,9 +118,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, pgoff++; } while (size); - return 0; - + return 0; } +EXPORT_SYMBOL(generic_file_remap_pages); /** * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma @@ -127,6 +150,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, struct vm_area_struct *vma; int err = -EINVAL; int has_write_lock = 0; + vm_flags_t vm_flags = 0; if (prot) return err; @@ -158,16 +182,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* * Make sure the vma is shared, that it supports prefaulting, * and that the remapped range is valid and fully within - * the single existing vma. vm_private_data is used as a - * swapout cursor in a VM_NONLINEAR vma. + * the single existing vma. */ if (!vma || !(vma->vm_flags & VM_SHARED)) goto out; - if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) - goto out; - - if (!(vma->vm_flags & VM_CAN_NONLINEAR)) + if (!vma->vm_ops || !vma->vm_ops->remap_pages) goto out; if (start < vma->vm_start || start + size > vma->vm_end) @@ -175,6 +195,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* Must set VM_NONLINEAR before any pages are populated. */ if (!(vma->vm_flags & VM_NONLINEAR)) { + /* + * vm_private_data is used as a swapout cursor + * in a VM_NONLINEAR vma. 
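With VM_CAN_NONLINEAR gone, a file mapping now opts in to remap_file_pages() by providing a ->remap_pages method in its vm_operations_struct, as the xip_file_vm_ops hunk above does. A minimal sketch for a hypothetical filesystem (example_file_vm_ops is illustrative, not part of this patch):

static const struct vm_operations_struct example_file_vm_ops = {
	.fault		= filemap_fault,		/* ordinary fault path */
	.remap_pages	= generic_file_remap_pages,	/* installs file ptes over the range */
};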
+ */ + if (vma->vm_private_data) + goto out; + /* Don't need a nonlinear mapping, exit success */ if (pgoff == linear_page_index(vma, start)) { err = 0; @@ -182,6 +209,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } if (!has_write_lock) { +get_write_lock: up_read(&mm->mmap_sem); down_write(&mm->mmap_sem); has_write_lock = 1; @@ -195,12 +223,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, */ if (mapping_cap_account_dirty(mapping)) { unsigned long addr; - struct file *file = vma->vm_file; + struct file *file = get_file(vma->vm_file); + /* mmap_region may free vma; grab the info now */ + vm_flags = vma->vm_flags; - flags &= MAP_NONBLOCK; - get_file(file); - addr = mmap_region(file, start, size, - flags, vma->vm_flags, pgoff); + addr = mmap_region(file, start, size, vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; @@ -208,12 +235,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, BUG_ON(addr != start); err = 0; } - goto out; + goto out_freed; } mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); @@ -223,28 +250,16 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* * drop PG_Mlocked flag for over-mapped range */ - vm_flags_t saved_flags = vma->vm_flags; + if (!has_write_lock) + goto get_write_lock; + vm_flags = vma->vm_flags; munlock_vma_pages_range(vma, start, start + size); - vma->vm_flags = saved_flags; + vma->vm_flags = vm_flags; } mmu_notifier_invalidate_range_start(mm, start, start + size); - err = populate_range(mm, vma, start, size, pgoff); + err = vma->vm_ops->remap_pages(vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); - if (!err && !(flags & MAP_NONBLOCK)) { - if (vma->vm_flags & VM_LOCKED) { - /* - * might be mapping previously unmapped range of file - */ - mlock_vma_pages_range(vma, start, start + size); - } else { - if (unlikely(has_write_lock)) { - downgrade_write(&mm->mmap_sem); - has_write_lock = 0; - } - make_pages_present(start, start+size); - } - } /* * We can't clear VM_NONLINEAR because we'd have to do @@ -253,10 +268,15 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, */ out: + if (vma) + vm_flags = vma->vm_flags; +out_freed: if (likely(!has_write_lock)) up_read(&mm->mmap_sem); else up_write(&mm->mmap_sem); + if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) + mm_populate(start, size); return err; } diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..1b24bdcb3197 --- /dev/null +++ b/mm/frontswap.c @@ -0,0 +1,460 @@ +/* + * Frontswap frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of frontswap. See + * Documentation/vm/frontswap.txt for more information. + * + * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. 
 + */ + +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/security.h> +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/frontswap.h> +#include <linux/swapfile.h> + +/* + * frontswap_ops is set by frontswap_register_ops to contain the pointers + * to the frontswap "backend" implementation functions. + */ +static struct frontswap_ops *frontswap_ops __read_mostly; + +/* + * If enabled, frontswap_store will return failure even on success. As + * a result, the swap subsystem will always write the page to swap, in + * effect converting frontswap into a writethrough cache. In this mode, + * there is no direct reduction in swap writes, but a frontswap backend + * can unilaterally "reclaim" any pages in use with no data loss, thus + * providing increased control over maximum memory usage due to frontswap. + */ +static bool frontswap_writethrough_enabled __read_mostly; + +/* + * If enabled, the underlying tmem implementation is capable of doing + * exclusive gets, so frontswap_load, on a successful tmem_get must + * mark the page as no longer in frontswap AND mark it dirty. + */ +static bool frontswap_tmem_exclusive_gets_enabled __read_mostly; + +#ifdef CONFIG_DEBUG_FS +/* + * Counters available via /sys/kernel/debug/frontswap (if debugfs is + * properly configured). These are for information only so are not protected + * against increment races. + */ +static u64 frontswap_loads; +static u64 frontswap_succ_stores; +static u64 frontswap_failed_stores; +static u64 frontswap_invalidates; + +static inline void inc_frontswap_loads(void) { + frontswap_loads++; +} +static inline void inc_frontswap_succ_stores(void) { + frontswap_succ_stores++; +} +static inline void inc_frontswap_failed_stores(void) { + frontswap_failed_stores++; +} +static inline void inc_frontswap_invalidates(void) { + frontswap_invalidates++; +} +#else +static inline void inc_frontswap_loads(void) { } +static inline void inc_frontswap_succ_stores(void) { } +static inline void inc_frontswap_failed_stores(void) { } +static inline void inc_frontswap_invalidates(void) { } +#endif + +/* + * Due to the asynchronous nature of the backends loading potentially + * _after_ the swap system has been activated, we have chokepoints + * on all frontswap functions to not call the backend until the backend + * has registered. + * + * Specifically when no backend is registered (nobody called + * frontswap_register_ops) all calls to frontswap_init (which is done via + * swapon -> enable_swap_info -> frontswap_init) are registered and remembered + * (via the setting of need_init bitmap) but fail to create tmem_pools. When a + * backend registers with frontswap at some later point the previous + * calls to frontswap_init are executed (by iterating over the need_init + * bitmap) to create tmem_pools and set the respective poolids. All of that is + * guarded by us using atomic bit operations on the 'need_init' bitmap. + * + * This does not guard us against the user deciding to call swapoff right as + * we are calling the backend to initialize (so swapon is in action). + * Fortunately for us, the swapon_mutex has been taken by the callee so we are + * OK. The other scenario where calls to frontswap_store (called via + * swap_writepage) is racing with frontswap_invalidate_area (called via + * swapoff) is again guarded by the swap subsystem. + * + * While no backend is registered all calls to frontswap_[store|load| + * invalidate_area|invalidate_page] are ignored or fail. 
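For reference, the registration flow described above expects a backend shaped roughly like the following. All example_* names are hypothetical stubs (a real backend would store the page contents in transcendent memory); the ops layout follows the struct frontswap_ops this file consumes:

static void example_init(unsigned type) { /* create a per-swap-device pool */ }
static int example_store(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* -1 = rejected; the page goes to the swap device */
}
static int example_load(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* -1 = not found in the backend */
}
static void example_invalidate_page(unsigned type, pgoff_t offset) { }
static void example_invalidate_area(unsigned type) { }

static struct frontswap_ops example_ops = {
	.init			= example_init,
	.store			= example_store,
	.load			= example_load,
	.invalidate_page	= example_invalidate_page,
	.invalidate_area	= example_invalidate_area,
};

/* at module init; any previously registered ops come back for nesting: */
/* old_ops = frontswap_register_ops(&example_ops); */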
 + * + * The time between the backend being registered and the swap file system + * calling the backend (via the frontswap_* functions) is indeterminate as + * frontswap_ops is not atomic_t (or a value guarded by a spinlock). + * That is OK as we are comfortable missing some of these calls to the newly + * registered backend. + * + * Obviously the opposite (unloading the backend) must be done after all + * the frontswap_[store|load|invalidate_area|invalidate_page] start + * ignoring or failing the requests - at which point frontswap_ops + * would have to be made in some fashion atomic. + */ +static DECLARE_BITMAP(need_init, MAX_SWAPFILES); + +/* + * Register operations for frontswap, returning the previous ops, thus + * allowing detection of multiple backends and possible nesting. + */ +struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops) +{ + struct frontswap_ops *old = frontswap_ops; + int i; + + for (i = 0; i < MAX_SWAPFILES; i++) { + if (test_and_clear_bit(i, need_init)) { + struct swap_info_struct *sis = swap_info[i]; + /* __frontswap_init _should_ have set it! */ + if (!sis->frontswap_map) + return ERR_PTR(-EINVAL); + ops->init(i); + } + } + /* + * We MUST have frontswap_ops set _after_ the frontswap_init's + * have been called. Otherwise __frontswap_store might fail. Hence + * the barrier to make sure the compiler does not re-order us. + */ + barrier(); + frontswap_ops = ops; + return old; +} +EXPORT_SYMBOL(frontswap_register_ops); + +/* + * Enable/disable frontswap writethrough (see above). + */ +void frontswap_writethrough(bool enable) +{ + frontswap_writethrough_enabled = enable; +} +EXPORT_SYMBOL(frontswap_writethrough); + +/* + * Enable/disable frontswap exclusive gets (see above). + */ +void frontswap_tmem_exclusive_gets(bool enable) +{ + frontswap_tmem_exclusive_gets_enabled = enable; +} +EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); + +/* + * Called when a swap device is swapon'd. + */ +void __frontswap_init(unsigned type, unsigned long *map) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + + /* + * p->frontswap is a bitmap that we MUST have to figure out which page + * has gone in frontswap. Without it there is no point in continuing. + */ + if (WARN_ON(!map)) + return; + /* + * Regardless of whether the frontswap backend has been loaded + * before this function or will be loaded later, we _MUST_ have the + * p->frontswap set to something valid to work properly. + */ + frontswap_map_set(sis, map); + if (frontswap_ops) + frontswap_ops->init(type); + else { + BUG_ON(type > MAX_SWAPFILES); + set_bit(type, need_init); + } +} +EXPORT_SYMBOL(__frontswap_init); + +bool __frontswap_test(struct swap_info_struct *sis, + pgoff_t offset) +{ + bool ret = false; + + if (frontswap_ops && sis->frontswap_map) + ret = test_bit(offset, sis->frontswap_map); + return ret; +} +EXPORT_SYMBOL(__frontswap_test); + +static inline void __frontswap_clear(struct swap_info_struct *sis, + pgoff_t offset) +{ + clear_bit(offset, sis->frontswap_map); + atomic_dec(&sis->frontswap_pages); +} + +/* + * "Store" data from a page to frontswap and associate it with the page's + * swaptype and offset. Page must be locked and in the swap cache. + * If frontswap already contains a page with matching swaptype and + * offset, the frontswap implementation may either overwrite the data and + * return success or invalidate the page from frontswap and return failure. 
 + */ +int __frontswap_store(struct page *page) +{ + int ret = -1, dup = 0; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + /* + * Return if no backend is registered. + * Don't need to inc frontswap_failed_stores here. + */ + if (!frontswap_ops) + return ret; + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + if (__frontswap_test(sis, offset)) + dup = 1; + ret = frontswap_ops->store(type, offset, page); + if (ret == 0) { + set_bit(offset, sis->frontswap_map); + inc_frontswap_succ_stores(); + if (!dup) + atomic_inc(&sis->frontswap_pages); + } else { + /* + a failed dup always results in automatic invalidation of + the (older) page from frontswap + */ + inc_frontswap_failed_stores(); + if (dup) + __frontswap_clear(sis, offset); + } + if (frontswap_writethrough_enabled) + /* report failure so swap also writes to swap device */ + ret = -1; + return ret; +} +EXPORT_SYMBOL(__frontswap_store); + +/* + * "Get" data from frontswap associated with swaptype and offset that were + * specified when the data was put to frontswap and use it to fill the + * specified page with data. Page must be locked and in the swap cache. + */ +int __frontswap_load(struct page *page) +{ + int ret = -1; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + /* + * __frontswap_test() will check whether there is a backend registered + */ + if (__frontswap_test(sis, offset)) + ret = frontswap_ops->load(type, offset, page); + if (ret == 0) { + inc_frontswap_loads(); + if (frontswap_tmem_exclusive_gets_enabled) { + SetPageDirty(page); + __frontswap_clear(sis, offset); + } + } + return ret; +} +EXPORT_SYMBOL(__frontswap_load); + +/* + * Invalidate any data from frontswap associated with the specified swaptype + * and offset so that a subsequent "get" will fail. + */ +void __frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + /* + * __frontswap_test() will check whether there is a backend registered + */ + if (__frontswap_test(sis, offset)) { + frontswap_ops->invalidate_page(type, offset); + __frontswap_clear(sis, offset); + inc_frontswap_invalidates(); + } +} +EXPORT_SYMBOL(__frontswap_invalidate_page); + +/* + * Invalidate all data from frontswap associated with all offsets for the + * specified swaptype. 
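For context on how the store and load hooks above are consumed: in this era the swap write path tries frontswap before building a bio, and treats a zero return as completed writeback. A simplified sketch of the mm/page_io.c call site (abridged, for orientation only):

	if (frontswap_store(page) == 0) {
		/* the backend took the page: finish writeback with no I/O */
		set_page_writeback(page);
		unlock_page(page);
		end_page_writeback(page);
		goto out;
	}
	/* otherwise fall through and submit a bio to the swap device */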
 + */ +void __frontswap_invalidate_area(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + if (frontswap_ops) { + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + frontswap_ops->invalidate_area(type); + atomic_set(&sis->frontswap_pages, 0); + bitmap_zero(sis->frontswap_map, sis->max); + } + clear_bit(type, need_init); +} +EXPORT_SYMBOL(__frontswap_invalidate_area); + +static unsigned long __frontswap_curr_pages(void) +{ + int type; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + assert_spin_locked(&swap_lock); + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + totalpages += atomic_read(&si->frontswap_pages); + } + return totalpages; +} + +static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, + int *swapid) +{ + int ret = -EINVAL; + struct swap_info_struct *si = NULL; + int si_frontswap_pages; + unsigned long total_pages_to_unuse = total; + unsigned long pages = 0, pages_to_unuse = 0; + int type; + + assert_spin_locked(&swap_lock); + for (type = swap_list.head; type >= 0; type = si->next) { + si = swap_info[type]; + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) { + pages = pages_to_unuse = total_pages_to_unuse; + } else { + pages = si_frontswap_pages; + pages_to_unuse = 0; /* unuse all */ + } + /* ensure there is enough RAM to fetch pages from frontswap */ + if (security_vm_enough_memory_mm(current->mm, pages)) { + ret = -ENOMEM; + continue; + } + vm_unacct_memory(pages); + *unused = pages_to_unuse; + *swapid = type; + ret = 0; + break; + } + + return ret; +} + +/* + * Used to check if it's necessary and feasible to unuse pages. + * Return 1 when there is nothing to do, 0 when pages need to be shrunk, + * or an error code when there is an error. + */ +static int __frontswap_shrink(unsigned long target_pages, + unsigned long *pages_to_unuse, + int *type) +{ + unsigned long total_pages = 0, total_pages_to_unuse; + + assert_spin_locked(&swap_lock); + + total_pages = __frontswap_curr_pages(); + if (total_pages <= target_pages) { + /* Nothing to do */ + *pages_to_unuse = 0; + return 1; + } + total_pages_to_unuse = total_pages - target_pages; + return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); +} + +/* + * Frontswap, like a true swap device, may unnecessarily retain pages + * under certain circumstances; "shrink" frontswap is essentially a + * "partial swapoff" and works by calling try_to_unuse to attempt to + * unuse enough frontswap pages to -- subject to memory + * constraints -- reduce the number of pages in frontswap to the + * number given in the parameter target_pages. + */ +void frontswap_shrink(unsigned long target_pages) +{ + unsigned long pages_to_unuse = 0; + int uninitialized_var(type), ret; + + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change + * so restart scan from swap_list.head each time + */ + spin_lock(&swap_lock); + ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); + spin_unlock(&swap_lock); + if (ret == 0) + try_to_unuse(type, true, pages_to_unuse); + return; +} +EXPORT_SYMBOL(frontswap_shrink); + +/* + * Count and return the number of frontswap pages across all + * swap devices. This is exported so that backend drivers can + * determine current usage without reading debugfs. 
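Taken together, the two exports give a backend a simple pressure valve; an illustrative (hypothetical) use:

	/* shed half of frontswap's pages when the backend feels pressure */
	unsigned long cur = frontswap_curr_pages();
	if (cur)
		frontswap_shrink(cur / 2);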
+ */ +unsigned long frontswap_curr_pages(void) +{ + unsigned long totalpages = 0; + + spin_lock(&swap_lock); + totalpages = __frontswap_curr_pages(); + spin_unlock(&swap_lock); + + return totalpages; +} +EXPORT_SYMBOL(frontswap_curr_pages); + +static int __init init_frontswap(void) +{ +#ifdef CONFIG_DEBUG_FS + struct dentry *root = debugfs_create_dir("frontswap", NULL); + if (root == NULL) + return -ENXIO; + debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); + debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); + debugfs_create_u64("failed_stores", S_IRUGO, root, + &frontswap_failed_stores); + debugfs_create_u64("invalidates", S_IRUGO, + root, &frontswap_invalidates); +#endif + return 0; +} + +module_init(init_frontswap); diff --git a/mm/highmem.c b/mm/highmem.c index 57d82c6250c3..b32b70cdaed6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -94,6 +94,19 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); do { spin_unlock(&kmap_lock); (void)(flags); } while (0) #endif +struct page *kmap_to_page(void *vaddr) +{ + unsigned long addr = (unsigned long)vaddr; + + if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { + int i = PKMAP_NR(addr); + return pte_page(pkmap_page_table[i]); + } + + return virt_to_page(addr); +} +EXPORT_SYMBOL(kmap_to_page); + static void flush_all_zero_pkmaps(void) { int i; @@ -125,8 +138,7 @@ static void flush_all_zero_pkmaps(void) * So no dangers, even with speculative execution. */ page = pte_page(pkmap_page_table[i]); - pte_clear(&init_mm, (unsigned long)page_address(page), - &pkmap_page_table[i]); + pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); set_page_address(page, NULL); need_flush = 1; @@ -312,11 +324,7 @@ struct page_address_map { struct list_head list; }; -/* - * page_address_map freelist, allocated from page_address_maps. 
 - */ -static struct list_head page_address_pool; /* freelist */ -static spinlock_t pool_lock; /* protects page_address_pool */ +static struct page_address_map page_address_maps[LAST_PKMAP]; /* * Hash table bucket @@ -381,14 +389,7 @@ void set_page_address(struct page *page, void *virtual) pas = page_slot(page); if (virtual) { /* Add */ - BUG_ON(list_empty(&page_address_pool)); - - spin_lock_irqsave(&pool_lock, flags); - pam = list_entry(page_address_pool.next, - struct page_address_map, list); - list_del(&pam->list); - spin_unlock_irqrestore(&pool_lock, flags); - + pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)]; pam->page = page; pam->virtual = virtual; @@ -401,9 +402,6 @@ void set_page_address(struct page *page, void *virtual) if (pam->page == page) { list_del(&pam->list); spin_unlock_irqrestore(&pas->lock, flags); - spin_lock_irqsave(&pool_lock, flags); - list_add_tail(&pam->list, &page_address_pool); - spin_unlock_irqrestore(&pool_lock, flags); goto done; } } @@ -413,20 +411,14 @@ done: return; } -static struct page_address_map page_address_maps[LAST_PKMAP]; - void __init page_address_init(void) { int i; - INIT_LIST_HEAD(&page_address_pool); - for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) - list_add(&page_address_maps[i].list, &page_address_pool); for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { INIT_LIST_HEAD(&page_address_htable[i].lh); spin_lock_init(&page_address_htable[i].lock); } - spin_lock_init(&pool_lock); } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0e5306eeb55..b4b1feba6472 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -12,21 +12,27 @@ #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> +#include <linux/shrinker.h> #include <linux/mm_inline.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/migrate.h> +#include <linux/hashtable.h> + #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" /* - * By default transparent hugepage support is enabled for all mappings - * and khugepaged scans all mappings. Defrag is only invoked by - * khugepaged hugepage allocations and by page faults inside - * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived - * allocations. + * By default transparent hugepage support is disabled in order to avoid + * the risk of increasing the memory footprint of applications without a + * guaranteed benefit. When transparent hugepage support is enabled, it is + * used for all mappings, and khugepaged scans all mappings. + * Defrag is invoked by khugepaged hugepage allocations and by page faults + * for all hugepage allocations. 
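With the default flipped to disabled-unless-configured, an application that wants hugepages for a particular range asks for them explicitly. A user-space sketch (assumes the "madvise" or "always" policy is in effect; the hint is advisory, not a guarantee):

#include <sys/mman.h>

static void *alloc_with_thp_hint(size_t len)
{
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf != MAP_FAILED)
		madvise(buf, len, MADV_HUGEPAGE);	/* opt this range into THP */
	return buf;
}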
*/ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS @@ -36,7 +42,8 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| #endif (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| - (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| + (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); /* default scan 8*512 pte (or vmas) every 30 second */ static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; @@ -57,12 +64,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; static int khugepaged(void *none); -static int mm_slots_hash_init(void); static int khugepaged_slab_init(void); -static void khugepaged_slab_free(void); -#define MM_SLOTS_HASH_HEADS 1024 -static struct hlist_head *mm_slots_hash __read_mostly; +#define MM_SLOTS_HASH_BITS 10 +static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); + static struct kmem_cache *mm_slot_cache __read_mostly; /** @@ -100,12 +106,8 @@ static int set_recommended_min_free_kbytes(void) struct zone *zone; int nr_zones = 0; unsigned long recommended_min; - extern int min_free_kbytes; - if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) && - !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags)) + if (!khugepaged_enabled()) return 0; for_each_populated_zone(zone) @@ -128,8 +130,14 @@ static int set_recommended_min_free_kbytes(void) (unsigned long) nr_free_buffer_pages() / 20); recommended_min <<= (PAGE_SHIFT-10); - if (recommended_min > min_free_kbytes) + if (recommended_min > min_free_kbytes) { + if (user_min_free_kbytes >= 0) + pr_info("raising min_free_kbytes from %d to %lu " + "to help transparent hugepage allocations\n", + min_free_kbytes, recommended_min); + min_free_kbytes = recommended_min; + } setup_per_zone_wmarks(); return 0; } @@ -139,12 +147,6 @@ static int start_khugepaged(void) { int err = 0; if (khugepaged_enabled()) { - int wakeup; - if (unlikely(!mm_slot_cache || !mm_slots_hash)) { - err = -ENOMEM; - goto out; - } - mutex_lock(&khugepaged_mutex); if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -154,19 +156,94 @@ static int start_khugepaged(void) err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; } - wakeup = !list_empty(&khugepaged_scan.mm_head); - mutex_unlock(&khugepaged_mutex); - if (wakeup) + + if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); set_recommended_min_free_kbytes(); - } else - /* wakeup to exit */ - wake_up_interruptible(&khugepaged_wait); -out: + } else if (khugepaged_thread) { + kthread_stop(khugepaged_thread); + khugepaged_thread = NULL; + } + return err; } +static atomic_t huge_zero_refcount; +static struct page *huge_zero_page __read_mostly; + +static inline bool is_huge_zero_page(struct page *page) +{ + return ACCESS_ONCE(huge_zero_page) == page; +} + +static inline bool is_huge_zero_pmd(pmd_t pmd) +{ + return is_huge_zero_page(pmd_page(pmd)); +} + +static struct page *get_huge_zero_page(void) +{ + struct page *zero_page; +retry: + if (likely(atomic_inc_not_zero(&huge_zero_refcount))) + return ACCESS_ONCE(huge_zero_page); + + zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + HPAGE_PMD_ORDER); + if (!zero_page) { + count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); + return NULL; + } + count_vm_event(THP_ZERO_PAGE_ALLOC); + 
preempt_disable(); + if (cmpxchg(&huge_zero_page, NULL, zero_page)) { + preempt_enable(); + __free_page(zero_page); + goto retry; + } + + /* We take additional reference here. It will be put back by shrinker */ + atomic_set(&huge_zero_refcount, 2); + preempt_enable(); + return ACCESS_ONCE(huge_zero_page); +} + +static void put_huge_zero_page(void) +{ + /* + * Counter should never go to zero here. Only shrinker can put + * last reference. + */ + BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); +} + +static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + /* we can free zero page only if last reference remains */ + return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; +} + +static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { + struct page *zero_page = xchg(&huge_zero_page, NULL); + BUG_ON(zero_page == NULL); + __free_page(zero_page); + return HPAGE_PMD_NR; + } + + return 0; +} + +static struct shrinker huge_zero_page_shrinker = { + .count_objects = shrink_huge_zero_page_count, + .scan_objects = shrink_huge_zero_page_scan, + .seeks = DEFAULT_SEEKS, +}; + #ifdef CONFIG_SYSFS static ssize_t double_flag_show(struct kobject *kobj, @@ -224,18 +301,16 @@ static ssize_t enabled_store(struct kobject *kobj, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); if (ret > 0) { - int err = start_khugepaged(); + int err; + + mutex_lock(&khugepaged_mutex); + err = start_khugepaged(); + mutex_unlock(&khugepaged_mutex); + if (err) ret = err; } - if (ret > 0 && - (test_bit(TRANSPARENT_HUGEPAGE_FLAG, - &transparent_hugepage_flags) || - test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - &transparent_hugepage_flags))) - set_recommended_min_free_kbytes(); - return ret; } static struct kobj_attribute enabled_attr = @@ -294,6 +369,20 @@ static ssize_t defrag_store(struct kobject *kobj, static struct kobj_attribute defrag_attr = __ATTR(defrag, 0644, defrag_show, defrag_store); +static ssize_t use_zero_page_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +} +static ssize_t use_zero_page_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +} +static struct kobj_attribute use_zero_page_attr = + __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); #ifdef CONFIG_DEBUG_VM static ssize_t debug_cow_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -315,6 +404,7 @@ static struct kobj_attribute debug_cow_attr = static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, + &use_zero_page_attr.attr, #ifdef CONFIG_DEBUG_VM &debug_cow_attr.attr, #endif @@ -339,7 +429,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -366,7 +456,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -392,7 +482,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, int err; unsigned long pages; - err = strict_strtoul(buf, 10, &pages); + err = 
kstrtoul(buf, 10, &pages); if (err || !pages || pages > UINT_MAX) return -EINVAL; @@ -460,7 +550,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, int err; unsigned long max_ptes_none; - err = strict_strtoul(buf, 10, &max_ptes_none); + err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > HPAGE_PMD_NR-1) return -EINVAL; @@ -494,19 +584,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { - printk(KERN_ERR "hugepage: failed kobject create\n"); + printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); return -ENOMEM; } err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed register hugeage group\n"); + printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); goto delete_obj; } err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed register hugeage group\n"); + printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); goto remove_hp_group; } @@ -554,11 +644,7 @@ static int __init hugepage_init(void) if (err) goto out; - err = mm_slots_hash_init(); - if (err) { - khugepaged_slab_free(); - goto out; - } + register_shrinker(&huge_zero_page_shrinker); /* * By default disable transparent hugepages on smaller systems, @@ -570,14 +656,12 @@ static int __init hugepage_init(void) start_khugepaged(); - set_recommended_min_free_kbytes(); - return 0; out: hugepage_exit_sysfs(hugepage_kobj); return err; } -module_init(hugepage_init) +subsys_initcall(hugepage_init); static int __init setup_transparent_hugepage(char *str) { @@ -611,71 +695,61 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static void prepare_pmd_huge_pte(pgtable_t pgtable, - struct mm_struct *mm) -{ - assert_spin_locked(&mm->page_table_lock); - - /* FIFO */ - if (!mm->pmd_huge_pte) - INIT_LIST_HEAD(&pgtable->lru); - else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; -} - -static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd); return pmd; } +static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) +{ + pmd_t entry; + entry = mk_pmd(page, prot); + entry = pmd_mkhuge(entry); + return entry; +} + static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *page) { - int ret = 0; pgtable_t pgtable; + spinlock_t *ptl; - VM_BUG_ON(!PageCompound(page)); + VM_BUG_ON_PAGE(!PageCompound(page), page); pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) { - mem_cgroup_uncharge_page(page); - put_page(page); + if (unlikely(!pgtable)) return VM_FAULT_OOM; - } clear_huge_page(page, haddr, HPAGE_PMD_NR); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. 
+ */ __SetPageUptodate(page); - spin_lock(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); } else { pmd_t entry; - entry = mk_pmd(page, vma->vm_page_prot); + entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - entry = pmd_mkhuge(entry); - /* - * The spinlocking to take the lru_lock inside - * page_add_new_anon_rmap() acts as a full memory - * barrier to be sure clear_huge_page writes become - * visible after the set_pmd_at() write. - */ page_add_new_anon_rmap(page, vma, haddr); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - prepare_pmd_huge_pte(pgtable, mm); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm->nr_ptes++; - spin_unlock(&mm->page_table_lock); + atomic_long_inc(&mm->nr_ptes); + spin_unlock(ptl); } - return ret; + return 0; } static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) @@ -692,13 +766,22 @@ static inline struct page *alloc_hugepage_vma(int defrag, HPAGE_PMD_ORDER, vma, haddr, nd); } -#ifndef CONFIG_NUMA -static inline struct page *alloc_hugepage(int defrag) +/* Caller must hold page table lock. */ +static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, + struct page *zero_page) { - return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), - HPAGE_PMD_ORDER); + pmd_t entry; + if (!pmd_none(*pmd)) + return false; + entry = mk_pmd(zero_page, vma->vm_page_prot); + entry = pmd_wrprotect(entry); + entry = pmd_mkhuge(entry); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, haddr, pmd, entry); + atomic_long_inc(&mm->nr_ptes); + return true; } -#endif int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, @@ -706,52 +789,65 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page; unsigned long haddr = address & HPAGE_PMD_MASK; - pte_t *pte; - if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + return VM_FAULT_FALLBACK; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; + if (!(flags & FAULT_FLAG_WRITE) && + transparent_hugepage_use_zero_page()) { + spinlock_t *ptl; + pgtable_t pgtable; + struct page *zero_page; + bool set; + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) return VM_FAULT_OOM; - page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); - if (unlikely(!page)) { + zero_page = get_huge_zero_page(); + if (unlikely(!zero_page)) { + pte_free(mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); - goto out; + return VM_FAULT_FALLBACK; } - count_vm_event(THP_FAULT_ALLOC); - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { - put_page(page); - goto out; + ptl = pmd_lock(mm, pmd); + set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, + zero_page); + spin_unlock(ptl); + if (!set) { + pte_free(mm, pgtable); + put_huge_zero_page(); } - - return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); - } -out: - /* - * Use __pte_alloc instead of pte_alloc_map, because we can't - * run 
pte_offset_map on the pmd, if an huge pmd could - * materialize from under us from a different thread. - */ - if (unlikely(__pte_alloc(mm, vma, pmd, address))) - return VM_FAULT_OOM; - /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) return 0; - /* - * A regular pmd is established and it can't morph into a huge pmd - * from under us anymore at this point because we hold the mmap_sem - * read mode and khugepaged takes it in write mode. So now it's - * safe to run pte_offset_map(). - */ - pte = pte_offset_map(pmd, address); - return handle_pte_fault(mm, vma, address, pte, pmd, flags); + } + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr, numa_node_id(), 0); + if (unlikely(!page)) { + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { + mem_cgroup_uncharge_page(page); + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + + count_vm_event(THP_FAULT_ALLOC); + return 0; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) { + spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; pgtable_t pgtable; @@ -762,8 +858,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pgtable)) goto out; - spin_lock(&dst_mm->page_table_lock); - spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pmd = *src_pmd; @@ -771,52 +868,77 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_free(dst_mm, pgtable); goto out_unlock; } + /* + * When page table lock is held, the huge zero pmd should not be + * under splitting since we don't split the page itself, only pmd to + * a page table. + */ + if (is_huge_zero_pmd(pmd)) { + struct page *zero_page; + bool set; + /* + * get_huge_zero_page() will never allocate a new page here, + * since we already have a zero page to copy. It just takes a + * reference. 
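A compact restatement of the zero-page lifecycle these helpers implement (a reader's summary, not code from the patch):

	/*
	 * huge_zero_refcount == 0     -> no huge zero page allocated
	 * huge_zero_refcount == 1     -> allocated but unused; only the extra
	 *                                reference owned by the shrinker
	 *                                remains, so the page may be freed
	 * huge_zero_refcount == 1 + N -> in use by N pmd mappings
	 */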
+ */ + zero_page = get_huge_zero_page(); + set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + zero_page); + BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ + ret = 0; + goto out_unlock; + } + if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); pte_free(dst_mm, pgtable); wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ goto out; } src_page = pmd_page(pmd); - VM_BUG_ON(!PageHead(src_page)); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); get_page(src_page); page_dup_rmap(src_page); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - prepare_pmd_huge_pte(pgtable, dst_mm); - dst_mm->nr_ptes++; + atomic_long_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); out: return ret; } -/* no "address" argument so destroys page coloring of some arch */ -pgtable_t get_pmd_huge_pte(struct mm_struct *mm) +void huge_pmd_set_accessed(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + int dirty) { - pgtable_t pgtable; + spinlock_t *ptl; + pmd_t entry; + unsigned long haddr; - assert_spin_locked(&mm->page_table_lock); + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto unlock; - /* FIFO */ - pgtable = mm->pmd_huge_pte; - if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; - else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, - struct page, lru); - list_del(&pgtable->lru); - } - return pgtable; + entry = pmd_mkyoung(orig_pmd); + haddr = address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) + update_mmu_cache_pmd(vma, address, pmd); + +unlock: + spin_unlock(ptl); } static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, @@ -826,10 +948,13 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; struct page **pages; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, GFP_KERNEL); @@ -843,7 +968,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_newpage_charge(pages[i], mm, + mem_cgroup_charge_anon(pages[i], mm, GFP_KERNEL))) { if (pages[i]) put_page(pages[i]); @@ -866,15 +991,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, cond_resched(); } - spin_lock(&mm->page_table_lock); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; - VM_BUG_ON(!PageHead(page)); + VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -892,7 
+1021,9 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); + + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); ret |= VM_FAULT_WRITE; put_page(page); @@ -901,7 +1032,8 @@ out: return ret; out_free_pages: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { mem_cgroup_uncharge_page(pages[i]); @@ -915,30 +1047,36 @@ out_free_pages: int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd) { + spinlock_t *ptl; int ret = 0; - struct page *page, *new_page; + struct page *page = NULL, *new_page; unsigned long haddr; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + ptl = pmd_lockptr(mm, pmd); VM_BUG_ON(!vma->anon_vma); - spin_lock(&mm->page_table_lock); + haddr = address & HPAGE_PMD_MASK; + if (is_huge_zero_pmd(orig_pmd)) + goto alloc; + spin_lock(ptl); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); - VM_BUG_ON(!PageCompound(page) || !PageHead(page)); - haddr = address & HPAGE_PMD_MASK; + VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); if (page_mapcount(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache_pmd(vma, address, pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(&mm->page_table_lock); - + spin_unlock(ptl); +alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), @@ -947,63 +1085,105 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, new_page = NULL; if (unlikely(!new_page)) { + if (!page) { + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; + } else { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + if (ret & VM_FAULT_OOM) { + split_huge_page(page); + ret |= VM_FAULT_FALLBACK; + } + put_page(page); + } count_vm_event(THP_FAULT_FALLBACK); - ret = do_huge_pmd_wp_page_fallback(mm, vma, address, - pmd, orig_pmd, page, haddr); - put_page(page); goto out; } - count_vm_event(THP_FAULT_ALLOC); - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { put_page(new_page); - put_page(page); - ret |= VM_FAULT_OOM; + if (page) { + split_huge_page(page); + put_page(page); + } else + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; + count_vm_event(THP_FAULT_FALLBACK); goto out; } - copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + count_vm_event(THP_FAULT_ALLOC); + + if (!page) + clear_huge_page(new_page, haddr, HPAGE_PMD_NR); + else + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - spin_lock(&mm->page_table_lock); - put_page(page); + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + spin_lock(ptl); + if (page) + put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { + spin_unlock(ptl); 
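	/* annotation: the pmd changed while we were allocating; the statements
	 * below drop the freshly charged page and exit via out_mn so the mmu
	 * notifier range opened above is still closed */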
mem_cgroup_uncharge_page(new_page); put_page(new_page); + goto out_mn; } else { pmd_t entry; - VM_BUG_ON(!PageHead(page)); - entry = mk_pmd(new_page, vma->vm_page_prot); + entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - entry = pmd_mkhuge(entry); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); - update_mmu_cache(vma, address, entry); - page_remove_rmap(page); - put_page(page); + update_mmu_cache_pmd(vma, address, pmd); + if (!page) { + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + put_huge_zero_page(); + } else { + VM_BUG_ON_PAGE(!PageHead(page), page); + page_remove_rmap(page); + put_page(page); + } ret |= VM_FAULT_WRITE; } -out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; +out_unlock: + spin_unlock(ptl); + return ret; } -struct page *follow_trans_huge_pmd(struct mm_struct *mm, +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { + struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(pmd_lockptr(mm, pmd)); if (flags & FOLL_WRITE && !pmd_write(*pmd)) goto out; + /* Avoid dumping huge zero page */ + if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) + return ERR_PTR(-EFAULT); + + /* Full NUMA hinting faults to serialise migration in fault paths */ + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto out; + page = pmd_page(*pmd); - VM_BUG_ON(!PageHead(page)); + VM_BUG_ON_PAGE(!PageHead(page), page); if (flags & FOLL_TOUCH) { pmd_t _pmd; /* @@ -1015,10 +1195,20 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, * young bit, instead of the current set_pmd_at. */ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); + } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if (page->mapping && trylock_page(page)) { + lru_add_drain(); + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } } page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - VM_BUG_ON(!PageCompound(page)); + VM_BUG_ON_PAGE(!PageCompound(page), page); if (flags & FOLL_GET) get_page_foll(page); @@ -1026,25 +1216,163 @@ out: return page; } +/* NUMA hinting page fault entry point for trans huge pmds */ +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) +{ + spinlock_t *ptl; + struct anon_vma *anon_vma = NULL; + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; + int page_nid = -1, this_nid = numa_node_id(); + int target_nid, last_cpupid = -1; + bool page_locked; + bool migrated = false; + int flags = 0; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + + /* + * If there are potential migrations, wait for completion and retry + * without disrupting NUMA hinting information. Do not relock and + * check_same as the page may no longer be mapped. 
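	/*
	 * Condensed outline of the fault handler assembled here (a reader's
	 * summary, not part of the patch):
	 * 1. retake the pmd lock; bail if the pmd changed under us
	 * 2. if a migration is in flight, wait for it and retry the fault
	 * 3. ask mpol_misplaced() for a target node
	 * 4. lock the page and take anon_vma to hold off THP splits
	 * 5. migrate_misplaced_transhuge_page(), or just clear pmd_numa
	 * 6. account the fault via task_numa_fault()
	 */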
+ */ + if (unlikely(pmd_trans_migrating(*pmdp))) { + spin_unlock(ptl); + wait_migrate_huge_page(vma->anon_vma, pmdp); + goto out; + } + + page = pmd_page(pmd); + BUG_ON(is_huge_zero_page(page)); + page_nid = page_to_nid(page); + last_cpupid = page_cpupid_last(page); + count_vm_numa_event(NUMA_HINT_FAULTS); + if (page_nid == this_nid) { + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + flags |= TNF_FAULT_LOCAL; + } + + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general; RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pmd_write(pmd)) + flags |= TNF_NO_GROUP; + + /* + * Acquire the page lock to serialise THP migrations but avoid dropping + * page_table_lock if at all possible + */ + page_locked = trylock_page(page); + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + /* If the page was locked, there are no parallel migrations */ + if (page_locked) + goto clear_pmdnuma; + } + + /* Migration could have started since the pmd_trans_migrating check */ + if (!page_locked) { + spin_unlock(ptl); + wait_on_page_locked(page); + page_nid = -1; + goto out; + } + + /* + * Page is misplaced. Page lock serialises migrations. Acquire anon_vma + * to serialise splits + */ + get_page(page); + spin_unlock(ptl); + anon_vma = page_lock_anon_vma_read(page); + + /* Confirm the PMD did not change while page_table_lock was released */ + spin_lock(ptl); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + put_page(page); + page_nid = -1; + goto out_unlock; + } + + /* Bail if we fail to protect against THP splits for any reason */ + if (unlikely(!anon_vma)) { + put_page(page); + page_nid = -1; + goto clear_pmdnuma; + } + + /* + * Migrate the THP to the requested node, returns with page unlocked + * and pmd_numa cleared. + */ + spin_unlock(ptl); + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, page, target_nid); + if (migrated) { + flags |= TNF_MIGRATED; + page_nid = target_nid; + } + + goto out; +clear_pmdnuma: + BUG_ON(!PageLocked(page)); + pmd = pmd_mknonnuma(pmd); + set_pmd_at(mm, haddr, pmdp, pmd); + VM_BUG_ON(pmd_numa(*pmdp)); + update_mmu_cache_pmd(vma, addr, pmdp); + unlock_page(page); +out_unlock: + spin_unlock(ptl); + +out: + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); + + if (page_nid != -1) + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); + + return 0; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { struct page *page; pgtable_t pgtable; - pgtable = get_pmd_huge_pte(tlb->mm); - page = pmd_page(*pmd); - pmd_clear(pmd); + pmd_t orig_pmd; + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations.
+ */ + orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - page_remove_rmap(page); - VM_BUG_ON(page_mapcount(page) < 0); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - VM_BUG_ON(!PageHead(page)); - tlb->mm->nr_ptes--; - spin_unlock(&tlb->mm->page_table_lock); - tlb_remove_page(tlb, page); + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); + if (is_huge_zero_pmd(orig_pmd)) { + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + put_huge_zero_page(); + } else { + page = pmd_page(orig_pmd); + page_remove_rmap(page); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON_PAGE(!PageHead(page), page); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + tlb_remove_page(tlb, page); + } pte_free(tlb->mm, pgtable); ret = 1; } @@ -1055,14 +1383,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { /* * All logical pages in the range are present * if backed by a huge page. */ - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); memset(vec, 1, (end - addr) >> PAGE_SHIFT); ret = 1; } @@ -1075,6 +1404,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { + spinlock_t *old_ptl, *new_ptl; int ret = 0; pmd_t pmd; @@ -1095,30 +1425,72 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, goto out; } - ret = __pmd_trans_huge_lock(old_pmd, vma); + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. 
+ */ + ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); if (ret == 1) { + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd); - spin_unlock(&mm->page_table_lock); + + if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + pgtable_t pgtable; + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); + } + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); } out: return ret; } +/* + * Returns + * - 0 if PMD could not be locked + * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary + * - HPAGE_PMD_NR if protections changed and TLB flush necessary + */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; - entry = pmdp_get_and_clear(mm, addr, pmd); - entry = pmd_modify(entry, newprot); - set_pmd_at(mm, addr, pmd, entry); - spin_unlock(&vma->vm_mm->page_table_lock); ret = 1; + if (!prot_numa) { + entry = pmdp_get_and_clear(mm, addr, pmd); + if (pmd_numa(entry)) + entry = pmd_mknonnuma(entry); + entry = pmd_modify(entry, newprot); + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(pmd_write(entry)); + } else { + struct page *page = pmd_page(*pmd); + + /* + * Do not trap faults against the zero page. The + * read-only data is likely to be read-cached on the + * local CPU cache and it is less useful to know about + * local vs remote hits on the zero page. + */ + if (!is_huge_zero_page(page) && + !pmd_numa(*pmd)) { + pmdp_set_numa(mm, addr, pmd); + ret = HPAGE_PMD_NR; + } + } + spin_unlock(ptl); } return ret; @@ -1131,12 +1503,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * Note that if it returns 1, this routine returns without unlocking page * table locks. So callers must unlock them. */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { - spin_lock(&vma->vm_mm->page_table_lock); + *ptl = pmd_lock(vma->vm_mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); wait_split_huge_page(vma->anon_vma, pmd); return -1; } else { @@ -1145,35 +1518,44 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return 1; } } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); return 0; } +/* + * This function returns whether a given @page is mapped onto the @address + * in the virtual space of @mm. + * + * When it's true, this function returns *pmd while holding the page table lock + * and passes it back to the caller via @ptl. + * If it's false, returns NULL without holding the page table lock.
+ */ pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag) + enum page_check_address_pmd_flag flag, + spinlock_t **ptl) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd, *ret = NULL; + pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) - goto out; + return NULL; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - goto out; - + return NULL; pud = pud_offset(pgd, address); if (!pud_present(*pud)) - goto out; - + return NULL; pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto out; + + *ptl = pmd_lock(mm, pmd); + if (!pmd_present(*pmd)) + goto unlock; if (pmd_page(*pmd) != page) - goto out; + goto unlock; /* * split_vma() may create temporary aliased mappings. There is * no risk as long as all huge pmd are found and have their @@ -1183,14 +1565,15 @@ pmd_t *page_check_address_pmd(struct page *page, */ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && pmd_trans_splitting(*pmd)) - goto out; + goto unlock; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); - ret = pmd; + return pmd; } -out: - return ret; +unlock: + spin_unlock(*ptl); + return NULL; } static int __split_huge_page_splitting(struct page *page, @@ -1198,36 +1581,45 @@ static int __split_huge_page_splitting(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd; int ret = 0; + /* For mmu_notifiers */ + const unsigned long mmun_start = address; + const unsigned long mmun_end = address + HPAGE_PMD_SIZE; - spin_lock(&mm->page_table_lock); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); if (pmd) { /* * We can't temporarily set the pmd to null in order * to split it, the pmd must remain marked huge at all * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->mutex to + * and it won't wait on the anon_vma->root->rwsem to * serialize against split_huge_page*. 
*/ - pmdp_splitting_flush_notify(vma, address, pmd); + pmdp_splitting_flush(vma, address, pmd); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; } -static void __split_huge_page_refcount(struct page *page) +static void __split_huge_page_refcount(struct page *page, + struct list_head *list) { int i; struct zone *zone = page_zone(page); + struct lruvec *lruvec; int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); + compound_lock(page); /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(page); @@ -1270,7 +1662,9 @@ static void __split_huge_page_refcount(struct page *page) ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_mlocked) | - (1L << PG_uptodate))); + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_unevictable))); page_tail->flags |= (1L << PG_dirty); /* clear PageTail before overwriting first_page */ @@ -1296,20 +1690,19 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; + page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); - - lru_add_page_tail(zone, page, page_tail); + lru_add_page_tail(page, page_tail, lruvec, list); } atomic_sub(tail_count, &page->_count); BUG_ON(atomic_read(&page->_count) <= 0); - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); ClearPageCompound(page); compound_unlock(page); @@ -1340,20 +1733,20 @@ static int __split_huge_page_map(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd, _pmd; int ret = 0, i; pgtable_t pgtable; unsigned long haddr; - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); if (pmd) { - pgtable = get_pmd_huge_pte(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); - for (i = 0, haddr = address; i < HPAGE_PMD_NR; - i++, haddr += PAGE_SIZE) { + haddr = address; + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { pte_t *pte, entry; BUG_ON(PageCompound(page+i)); entry = mk_pte(page + i, vma->vm_page_prot); @@ -1364,6 +1757,8 @@ static int __split_huge_page_map(struct page *page, BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); + if (pmd_numa(*pmd)) + entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1397,33 +1792,32 @@ static int __split_huge_page_map(struct page *page, * SMP TLB and finally we write the non-huge version * of the pmd entry with pmd_populate. 
*/ - set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); return ret; } -/* must be called with anon_vma->root->mutex hold */ +/* must be called with anon_vma->root->rwsem held */ static void __split_huge_page(struct page *page, - struct anon_vma *anon_vma) + struct anon_vma *anon_vma, + struct list_head *list) { int mapcount, mapcount2; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct anon_vma_chain *avc; BUG_ON(!PageHead(page)); BUG_ON(PageTail(page)); mapcount = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount += __split_huge_page_splitting(page, vma, addr); } /* @@ -1441,15 +1835,13 @@ static void __split_huge_page(struct page *page, mapcount, page_mapcount(page)); BUG_ON(mapcount != page_mapcount(page)); - __split_huge_page_refcount(page); + __split_huge_page_refcount(page, list); mapcount2 = 0; - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long addr = vma_address(page, vma); BUG_ON(is_vma_temporary_stack(vma)); - if (addr == -EFAULT) - continue; mapcount2 += __split_huge_page_map(page, vma, addr); } if (mapcount != mapcount2) @@ -1458,38 +1850,65 @@ static void __split_huge_page(struct page *page, BUG_ON(mapcount != mapcount2); } -int split_huge_page(struct page *page) +/* + * Split a hugepage into normal pages. This doesn't change the position of head + * page. If @list is null, tail pages will be added to the LRU list, otherwise, to + * @list. Both head page and tail pages will inherit mapping, flags, and so on + * from the hugepage. + * Return 0 if the hugepage is split successfully, otherwise return 1. + */ +int split_huge_page_to_list(struct page *page, struct list_head *list) { struct anon_vma *anon_vma; int ret = 1; + BUG_ON(is_huge_zero_page(page)); BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma(page); + + /* + * The caller does not necessarily hold an mmap_sem that would prevent + * the anon_vma disappearing so we first take a reference to it + * and then lock the anon_vma for write. This is similar to + * page_lock_anon_vma_read except the write lock is taken to serialise + * against parallel split or collapse operations.
+ */ + anon_vma = page_get_anon_vma(page); if (!anon_vma) goto out; + anon_vma_lock_write(anon_vma); + ret = 0; if (!PageCompound(page)) goto out_unlock; BUG_ON(!PageSwapBacked(page)); - __split_huge_page(page, anon_vma); + __split_huge_page(page, anon_vma, list); count_vm_event(THP_SPLIT); BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma(anon_vma); + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ - VM_HUGETLB|VM_SHARED|VM_MAYSHARE) +#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: +#ifdef CONFIG_S390 + /* + * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 + * can't handle this properly after s390_enable_sie, so we simply + * ignore the madvise to prevent qemu from causing a SIGSEGV. + */ + if (mm_has_pgste(vma->vm_mm)) + return 0; +#endif /* * Be somewhat over-protective like KSM for now! */ @@ -1535,12 +1954,6 @@ static int __init khugepaged_slab_init(void) return 0; } -static void __init khugepaged_slab_free(void) -{ - kmem_cache_destroy(mm_slot_cache); - mm_slot_cache = NULL; -} - static inline struct mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ @@ -1553,47 +1966,22 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) kmem_cache_free(mm_slot_cache, mm_slot); } -static int __init mm_slots_hash_init(void) -{ - mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), - GFP_KERNEL); - if (!mm_slots_hash) - return -ENOMEM; - return 0; -} - -#if 0 -static void __init mm_slots_hash_free(void) -{ - kfree(mm_slots_hash); - mm_slots_hash = NULL; -} -#endif - static struct mm_slot *get_mm_slot(struct mm_struct *mm) { struct mm_slot *mm_slot; - struct hlist_head *bucket; - struct hlist_node *node; - bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) - % MM_SLOTS_HASH_HEADS]; - hlist_for_each_entry(mm_slot, node, bucket, hash) { + hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) if (mm == mm_slot->mm) return mm_slot; - } + return NULL; } static void insert_to_mm_slots_hash(struct mm_struct *mm, struct mm_slot *mm_slot) { - struct hlist_head *bucket; - - bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) - % MM_SLOTS_HASH_HEADS]; mm_slot->mm = mm; - hlist_add_head(&mm_slot->hash, bucket); + hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); } static inline int khugepaged_test_exit(struct mm_struct *mm) @@ -1646,11 +2034,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. 
- */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + VM_BUG_ON(vma->vm_flags & VM_NO_THP); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -1666,7 +2050,7 @@ void __khugepaged_exit(struct mm_struct *mm) spin_lock(&khugepaged_mm_lock); mm_slot = get_mm_slot(mm); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hlist_del(&mm_slot->hash); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); free = 1; } @@ -1707,82 +2091,66 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) } } -static void release_all_pte_pages(pte_t *pte) -{ - release_pte_pages(pte, pte + HPAGE_PMD_NR); -} - static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { struct page *page; pte_t *_pte; - int referenced = 0, isolated = 0, none = 0; + int referenced = 0, none = 0; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval)) { if (++none <= khugepaged_max_ptes_none) continue; - else { - release_pte_pages(pte, _pte); + else goto out; - } } - if (!pte_present(pteval) || !pte_write(pteval)) { - release_pte_pages(pte, _pte); + if (!pte_present(pteval) || !pte_write(pteval)) goto out; - } page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) { - release_pte_pages(pte, _pte); + if (unlikely(!page)) goto out; - } - VM_BUG_ON(PageCompound(page)); - BUG_ON(!PageAnon(page)); - VM_BUG_ON(!PageSwapBacked(page)); + + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); /* cannot use mapcount: can't collapse if there's a gup pin */ - if (page_count(page) != 1) { - release_pte_pages(pte, _pte); + if (page_count(page) != 1) goto out; - } /* * We can do it before isolate_lru_page because the * page can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ - if (!trylock_page(page)) { - release_pte_pages(pte, _pte); + if (!trylock_page(page)) goto out; - } /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. 
*/ if (isolate_lru_page(page)) { unlock_page(page); - release_pte_pages(pte, _pte); goto out; } /* 0 stands for page_is_file_cache(page) == false */ inc_zone_page_state(page, NR_ISOLATED_ANON + 0); - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); /* If there is no mapped pte young don't collapse the page */ if (pte_young(pteval) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } - if (unlikely(!referenced)) - release_all_pte_pages(pte); - else - isolated = 1; + if (likely(referenced)) + return 1; out: - return isolated; + release_pte_pages(pte, _pte); + return 0; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, @@ -1801,8 +2169,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); - VM_BUG_ON(page_mapcount(src_page) != 1); - VM_BUG_ON(page_count(src_page) != 2); + VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); release_pte_page(src_page); /* * ptl mostly unnecessary, but preempt has to @@ -1825,29 +2192,63 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - struct vm_area_struct *vma, - int node) +static void khugepaged_alloc_sleep(void) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd, _pmd; - pte_t *pte; - pgtable_t pgtable; - struct page *new_page; - spinlock_t *ptl; - int isolated; - unsigned long hstart, hend; + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); +} - VM_BUG_ON(address & ~HPAGE_PMD_MASK); -#ifndef CONFIG_NUMA - up_read(&mm->mmap_sem); - VM_BUG_ON(!*hpage); - new_page = *hpage; -#else - VM_BUG_ON(*hpage); +static int khugepaged_node_load[MAX_NUMNODES]; + +#ifdef CONFIG_NUMA +static int khugepaged_find_target_node(void) +{ + static int last_khugepaged_target_node = NUMA_NO_NODE; + int nid, target_node = 0, max_value = 0; + + /* find first node with max normal pages hit */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + if (khugepaged_node_load[nid] > max_value) { + max_value = khugepaged_node_load[nid]; + target_node = nid; + } + + /* do some balance if several nodes have the same hit record */ + if (target_node <= last_khugepaged_target_node) + for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == khugepaged_node_load[nid]) { + target_node = nid; + break; + } + + last_khugepaged_target_node = target_node; + return target_node; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (IS_ERR(*hpage)) { + if (!*wait) + return false; + + *wait = false; + *hpage = NULL; + khugepaged_alloc_sleep(); + } else if (*hpage) { + put_page(*hpage); + *hpage = NULL; + } + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + VM_BUG_ON_PAGE(*hpage, *hpage); /* * Allocate the page while the vma is still valid and under * the mmap_sem read mode so there is no memory allocation @@ -1858,28 +2259,115 @@ static void collapse_huge_page(struct mm_struct *mm, * mmap_sem in read mode is good idea also to allow greater * scalability. 
*/ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, - node, __GFP_OTHER_NODE); - + *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( + khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); /* * After allocating the hugepage, release the mmap_sem read lock in * preparation for taking it in write mode. */ up_read(&mm->mmap_sem); - if (unlikely(!new_page)) { + if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); - return; + return NULL; } -#endif count_vm_event(THP_COLLAPSE_ALLOC); - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { -#ifdef CONFIG_NUMA - put_page(new_page); + return *hpage; +} +#else +static int khugepaged_find_target_node(void) +{ + return 0; +} + +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), + HPAGE_PMD_ORDER); +} + +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} + +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (!*hpage) + *hpage = khugepaged_alloc_hugepage(wait); + + if (unlikely(!*hpage)) + return false; + + return true; +} + +static struct page +*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + up_read(&mm->mmap_sem); + VM_BUG_ON(!*hpage); + return *hpage; +} #endif + +static bool hugepage_vma_check(struct vm_area_struct *vma) +{ + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + return false; + + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) + return false; + VM_BUG_ON(vma->vm_flags & VM_NO_THP); + return true; +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma, + int node) +{ + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *pmd_ptl, *pte_ptl; + int isolated; + unsigned long hstart, hend; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + /* release the mmap_sem read lock. */ + new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); + if (!new_page) + return; + + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) return; - } /* * Prevent all access to pagetables with the exception of @@ -1891,64 +2379,55 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; vma = find_vma(mm, address); + if (!vma) + goto out; hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) goto out; - - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) - goto out; - - if (!vma->anon_vma || vma->vm_ops) - goto out; - if (is_vma_temporary_stack(vma)) - goto out; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() must be - * true too, verify it here. 
- */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) + if (!hugepage_vma_check(vma)) goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) + pmd = mm_find_pmd(mm, address); + if (!pmd) goto out; - - pmd = pmd_offset(pud, address); - /* pmd can't go away or become huge under us */ - if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_trans_huge(*pmd)) goto out; - anon_vma_lock(vma->anon_vma); + anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); - ptl = pte_lockptr(mm, pmd); + pte_ptl = pte_lockptr(mm, pmd); - spin_lock(&mm->page_table_lock); /* probably unnecessary */ + mmun_start = address; + mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes * any huge TLB entry from the CPU so we won't allow * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. */ - _pmd = pmdp_clear_flush_notify(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + _pmd = pmdp_clear_flush(vma, address, pmd); + spin_unlock(pmd_ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(ptl); + spin_unlock(pte_ptl); if (unlikely(!isolated)) { pte_unmap(pte); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - set_pmd_at(mm, address, pmd, _pmd); - spin_unlock(&mm->page_table_lock); - anon_vma_unlock(vma->anon_vma); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * point to regular pagetables. Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); + spin_unlock(pmd_ptl); + anon_vma_unlock_write(vma->anon_vma); goto out; } @@ -1956,18 +2435,15 @@ static void collapse_huge_page(struct mm_struct *mm, * All pages are isolated and locked so anon_vma rmap * can't run anymore.
*/ - anon_vma_unlock(vma->anon_vma); + anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); - VM_BUG_ON(page_count(pgtable) != 1); - VM_BUG_ON(page_mapcount(pgtable) != 0); - _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - _pmd = pmd_mkhuge(_pmd); /* * spin_lock() below is not the equivalent of smp_wmb(), so @@ -1976,17 +2452,16 @@ static void collapse_huge_page(struct mm_struct *mm, */ smp_wmb(); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, _pmd); - prepare_pmd_huge_pte(pgtable, mm); - spin_unlock(&mm->page_table_lock); + update_mmu_cache_pmd(vma, address, pmd); + spin_unlock(pmd_ptl); -#ifndef CONFIG_NUMA *hpage = NULL; -#endif + khugepaged_pages_collapsed++; out_up_write: up_write(&mm->mmap_sem); @@ -1994,9 +2469,6 @@ out_up_write: out: mem_cgroup_uncharge_page(new_page); -#ifdef CONFIG_NUMA - put_page(new_page); -#endif goto out_up_write; } @@ -2005,30 +2477,23 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, unsigned long address, struct page **hpage) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte, *_pte; int ret = 0, referenced = 0, none = 0; struct page *page; unsigned long _address; spinlock_t *ptl; - int node = -1; + int node = NUMA_NO_NODE; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) + pmd = mm_find_pmd(mm, address); + if (!pmd) goto out; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_trans_huge(*pmd)) goto out; + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { @@ -2045,13 +2510,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (unlikely(!page)) goto out_unmap; /* - * Chose the node of the first page. This could - * be more sophisticated and look at more pages, - * but isn't for now. + * Record which node the original page is from and save this + * information to khugepaged_node_load[]. + * Khugepaged will allocate a hugepage from the node that has the max + * hit record.
*/ - if (node == -1) - node = page_to_nid(page); - VM_BUG_ON(PageCompound(page)); + node = page_to_nid(page); + khugepaged_node_load[node]++; + VM_BUG_ON_PAGE(PageCompound(page), page); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; /* cannot use mapcount: can't collapse if there's a gup pin */ @@ -2065,9 +2531,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) + if (ret) { + node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ collapse_huge_page(mm, address, hpage, vma, node); + } out: return ret; } @@ -2080,7 +2548,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) if (khugepaged_test_exit(mm)) { /* free mm_slot */ - hlist_del(&mm_slot->hash); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); /* @@ -2134,25 +2602,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - - if ((!(vma->vm_flags & VM_HUGEPAGE) && - !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) { - skip: + if (!hugepage_vma_check(vma)) { +skip: progress++; continue; } - if (!vma->anon_vma || vma->vm_ops) - goto skip; - if (is_vma_temporary_stack(vma)) - goto skip; - /* - * If is_pfn_mapping() is true is_learn_pfn_mapping() - * must be true too, verify it here. - */ - VM_BUG_ON(is_linear_pfn_mapping(vma) || - vma->vm_flags & VM_NO_THP); - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart >= hend) @@ -2226,32 +2680,23 @@ static int khugepaged_has_work(void) static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || - !khugepaged_enabled(); + kthread_should_stop(); } -static void khugepaged_do_scan(struct page **hpage) +static void khugepaged_do_scan(void) { + struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = khugepaged_pages_to_scan; + bool wait = true; barrier(); /* write khugepaged_pages_to_scan to local stack */ while (progress < pages) { - cond_resched(); - -#ifndef CONFIG_NUMA - if (!*hpage) { - *hpage = alloc_hugepage(khugepaged_defrag()); - if (unlikely(!*hpage)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - break; - } - count_vm_event(THP_COLLAPSE_ALLOC); - } -#else - if (IS_ERR(*hpage)) + if (!khugepaged_prealloc_page(&hpage, &wait)) break; -#endif + + cond_resched(); if (unlikely(kthread_should_stop() || freezing(current))) break; @@ -2262,73 +2707,32 @@ static void khugepaged_do_scan(struct page **hpage) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - hpage); + &hpage); else progress = pages; spin_unlock(&khugepaged_mm_lock); } -} -static void khugepaged_alloc_sleep(void) -{ - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); -} - -#ifndef CONFIG_NUMA -static struct page *khugepaged_alloc_hugepage(void) -{ - struct page *hpage; - - do { - hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && - likely(khugepaged_enabled())); - return hpage; + if (!IS_ERR_OR_NULL(hpage)) + put_page(hpage); } -#endif -static void khugepaged_loop(void) +static void khugepaged_wait_work(void) { - struct page *hpage; + try_to_freeze(); -#ifdef CONFIG_NUMA - hpage = NULL; -#endif - while (likely(khugepaged_enabled())) { -#ifndef 
CONFIG_NUMA - hpage = khugepaged_alloc_hugepage(); - if (unlikely(!hpage)) - break; -#else - if (IS_ERR(hpage)) { - khugepaged_alloc_sleep(); - hpage = NULL; - } -#endif + if (khugepaged_has_work()) { + if (!khugepaged_scan_sleep_millisecs) + return; - khugepaged_do_scan(&hpage); -#ifndef CONFIG_NUMA - if (hpage) - put_page(hpage); -#endif - try_to_freeze(); - if (unlikely(kthread_should_stop())) - break; - if (khugepaged_has_work()) { - if (!khugepaged_scan_sleep_millisecs) - continue; - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); - } else if (khugepaged_enabled()) - wait_event_freezable(khugepaged_wait, - khugepaged_wait_event()); + wait_event_freezable_timeout(khugepaged_wait, + kthread_should_stop(), + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); + return; } + + if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } static int khugepaged(void *none) @@ -2338,20 +2742,9 @@ static int khugepaged(void *none) set_freezable(); set_user_nice(current, 19); - /* serialize with start_khugepaged() */ - mutex_lock(&khugepaged_mutex); - - for (;;) { - mutex_unlock(&khugepaged_mutex); - VM_BUG_ON(khugepaged_thread != current); - khugepaged_loop(); - VM_BUG_ON(khugepaged_thread != current); - - mutex_lock(&khugepaged_mutex); - if (!khugepaged_enabled()) - break; - if (unlikely(kthread_should_stop())) - break; + while (!kthread_should_stop()) { + khugepaged_do_scan(); + khugepaged_wait_work(); } spin_lock(&khugepaged_mm_lock); @@ -2360,58 +2753,109 @@ static int khugepaged(void *none) if (mm_slot) collect_mm_slot(mm_slot); spin_unlock(&khugepaged_mm_lock); + return 0; +} - khugepaged_thread = NULL; - mutex_unlock(&khugepaged_mutex); +static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd) +{ + struct mm_struct *mm = vma->vm_mm; + pgtable_t pgtable; + pmd_t _pmd; + int i; - return 0; + pmdp_clear_flush(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); + entry = pte_mkspecial(entry); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + put_huge_zero_page(); } -void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd) { + spinlock_t *ptl; struct page *page; + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ - spin_lock(&mm->page_table_lock); + BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); + + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; +again: + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + return; + } + if (is_huge_zero_pmd(*pmd)) { + __split_huge_zero_page_pmd(vma, haddr, pmd); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } page = pmd_page(*pmd); 
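The zero-page split just above also shows the pgtable deposit/withdraw protocol this series leans on throughout: a pte page is deposited when a huge pmd is established (see pgtable_trans_huge_deposit() in the collapse path earlier), so a later split can withdraw it and repopulate without allocating memory, which means the split itself cannot fail on allocation. A condensed sketch of that sequence, illustrative only (the function name is hypothetical; the helpers are the ones used in this diff):

	static void sketch_split_pmd_to_ptes(struct vm_area_struct *vma,
					     unsigned long haddr, pmd_t *pmd)
	{
		struct mm_struct *mm = vma->vm_mm;
		pgtable_t pgtable;
		pmd_t _pmd;
		int i;

		pmdp_clear_flush(vma, haddr, pmd);	/* leave the pmd empty for now */

		/* take back the pte page stashed at huge-pmd creation time */
		pgtable = pgtable_trans_huge_withdraw(mm, pmd);
		pmd_populate(mm, &_pmd, pgtable);	/* build ptes against a detached pmd */

		for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
			/* pte_offset_map(&_pmd, haddr) and set_pte_at() one
			 * small pte per subpage, as in the functions above */
		}

		smp_wmb();			/* make ptes visible before the pmd */
		pmd_populate(mm, pmd, pgtable);
	}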
- VM_BUG_ON(!page_count(page)); + VM_BUG_ON_PAGE(!page_count(page), page); get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); split_huge_page(page); put_page(page); - BUG_ON(pmd_trans_huge(*pmd)); + + /* + * We don't always have down_write of mmap_sem here: a racing + * do_huge_pmd_wp_page() might have copied-on-write to another + * huge page before our split_huge_page() got the anon_vma lock. + */ + if (unlikely(pmd_trans_huge(*pmd))) + goto again; +} + +void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, + pmd_t *pmd) +{ + struct vm_area_struct *vma; + + vma = find_vma(mm, address); + BUG_ON(vma == NULL); + split_huge_page_pmd(vma, address, pmd); } static void split_huge_page_address(struct mm_struct *mm, unsigned long address) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. */ - split_huge_page_pmd(mm, pmd); + split_huge_page_pmd_mm(mm, address, pmd); } void __vma_adjust_trans_huge(struct vm_area_struct *vma, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ae8f708e3d75..c82290b9c1fc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1,6 +1,6 @@ /* * Generic hugetlb support. - * (C) William Irwin, April 2004 + * (C) Nadia Yvette Chambers, April 2004 */ #include <linux/list.h> #include <linux/init.h> @@ -13,6 +13,7 @@ #include <linux/nodemask.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> +#include <linux/compiler.h> #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/bootmem.h> @@ -21,20 +22,23 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/page-isolation.h> +#include <linux/jhash.h> #include <asm/page.h> #include <asm/pgtable.h> -#include <linux/io.h> +#include <asm/tlb.h> +#include <linux/io.h> #include <linux/hugetlb.h> +#include <linux/hugetlb_cgroup.h> #include <linux/node.h> #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int max_hstate; +int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -45,13 +49,18 @@ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; -#define for_each_hstate(h) \ - for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) +/* + * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, + * free_huge_pages, and surplus_huge_pages. + */ +DEFINE_SPINLOCK(hugetlb_lock); /* - * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + * Serializes faults on the same logical page. This is used to + * prevent spurious OOMs when the hugepage pool is fully utilized. 
*/ -static DEFINE_SPINLOCK(hugetlb_lock); +static int num_fault_mutexes; +static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { @@ -128,22 +137,15 @@ static inline struct hugepage_subpool *subpool_inode(struct inode *inode) static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) { - return subpool_inode(vma->vm_file->f_dentry->d_inode); + return subpool_inode(file_inode(vma->vm_file)); } /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * - * The region data structures are protected by a combination of the mmap_sem - * and the hugetlb_instantion_mutex. To access or modify a region the caller - * must either hold the mmap_sem for write, or the mmap_sem for read and - * the hugetlb_instantiation mutex: - * - * down_write(&mm->mmap_sem); - * or - * down_read(&mm->mmap_sem); - * mutex_lock(&hugetlb_instantiation_mutex); + * The region data structures are embedded into a resv_map and + * protected by a resv_map's lock */ struct file_region { struct list_head link; @@ -151,10 +153,12 @@ struct file_region { long to; }; -static long region_add(struct list_head *head, long f, long t) +static long region_add(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg, *nrg, *trg; + spin_lock(&resv->lock); /* Locate the region we are either in or before. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -184,14 +188,18 @@ static long region_add(struct list_head *head, long f, long t) } nrg->from = f; nrg->to = t; + spin_unlock(&resv->lock); return 0; } -static long region_chg(struct list_head *head, long f, long t) +static long region_chg(struct resv_map *resv, long f, long t) { - struct file_region *rg, *nrg; + struct list_head *head = &resv->regions; + struct file_region *rg, *nrg = NULL; long chg = 0; +retry: + spin_lock(&resv->lock); /* Locate the region we are before or in. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -201,15 +209,21 @@ static long region_chg(struct list_head *head, long f, long t) * Subtle, allocate a new region at the position but make it zero * size such that we can guarantee to record the reservation. */ if (&rg->link == head || t < rg->from) { - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - list_add(&nrg->link, rg->link.prev); + if (!nrg) { + spin_unlock(&resv->lock); + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + goto retry; + } - return t - f; + list_add(&nrg->link, rg->link.prev); + chg = t - f; + goto out_nrg; } /* Round our left edge to the current segment if it encloses us. */ @@ -222,7 +236,7 @@ static long region_chg(struct list_head *head, long f, long t) if (&rg->link == head) break; if (rg->from > t) - return chg; + goto out; /* We overlap with this area, if it extends further than * us then we must extend ourselves. 
Account for its @@ -233,20 +247,30 @@ static long region_chg(struct list_head *head, long f, long t) } chg -= rg->to - rg->from; } + +out: + spin_unlock(&resv->lock); + /* We already know we raced and no longer need the new region */ + kfree(nrg); + return chg; +out_nrg: + spin_unlock(&resv->lock); return chg; } -static long region_truncate(struct list_head *head, long end) +static long region_truncate(struct resv_map *resv, long end) { + struct list_head *head = &resv->regions; struct file_region *rg, *trg; long chg = 0; + spin_lock(&resv->lock); /* Locate the region we are either in or before. */ list_for_each_entry(rg, head, link) if (end <= rg->to) break; if (&rg->link == head) - return 0; + goto out; /* If we are in the middle of a region then adjust it. */ if (end > rg->from) { @@ -263,18 +287,23 @@ static long region_truncate(struct list_head *head, long end) list_del(&rg->link); kfree(rg); } + +out: + spin_unlock(&resv->lock); return chg; } -static long region_count(struct list_head *head, long f, long t) +static long region_count(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg; long chg = 0; + spin_lock(&resv->lock); /* Locate each segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { - int seg_from; - int seg_to; + long seg_from; + long seg_to; if (rg->to <= f) continue; @@ -286,6 +315,7 @@ static long region_count(struct list_head *head, long f, long t) chg += seg_to - seg_from; } + spin_unlock(&resv->lock); return chg; } @@ -320,7 +350,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) hstate = hstate_vma(vma); - return 1UL << (hstate->order + PAGE_SHIFT); + return 1UL << huge_page_shift(hstate); } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); @@ -376,39 +406,46 @@ static void set_vma_private_data(struct vm_area_struct *vma, vma->vm_private_data = (void *)value; } -struct resv_map { - struct kref refs; - struct list_head regions; -}; - -static struct resv_map *resv_map_alloc(void) +struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); if (!resv_map) return NULL; kref_init(&resv_map->refs); + spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); return resv_map; } -static void resv_map_release(struct kref *ref) +void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); /* Clear out any active regions before we release the map. 
*/ - region_truncate(&resv_map->regions, 0); + region_truncate(resv_map, 0); kfree(resv_map); } +static inline struct resv_map *inode_resv_map(struct inode *inode) +{ + return inode->i_mapping->private_data; +} + static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_MAYSHARE)) + if (vma->vm_flags & VM_MAYSHARE) { + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + return inode_resv_map(inode); + + } else { return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); - return NULL; + } } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) @@ -435,25 +472,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -/* Decrement the reserved pages in the hugepage pool by one */ -static void decrement_hugepage_resv_vma(struct hstate *h, - struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_NORESERVE) - return; - - if (vma->vm_flags & VM_MAYSHARE) { - /* Shared mappings always use reserves */ - h->resv_huge_pages--; - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - h->resv_huge_pages--; - } -} - /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { @@ -463,53 +481,42 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_reserves(struct vm_area_struct *vma) +static int vma_has_reserves(struct vm_area_struct *vma, long chg) { + if (vma->vm_flags & VM_NORESERVE) { + /* + * This address is already reserved by another process (chg == 0), + * so we should decrement the reserved count. Without decrementing, + * the reserve count remains after releasing the inode, because this + * allocated page will go into the page cache and is regarded as + * coming from the reserved pool in the releasing step. Currently, we + * don't have any other solution to deal with this situation + * properly, so add a work-around here. + */ + if (vma->vm_flags & VM_MAYSHARE && chg == 0) + return 1; + else + return 0; + } + + /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) return 1; + + /* + * Only the process that called mmap() has reserves for + * private mappings.
+ */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; - return 0; -} -static void copy_gigantic_page(struct page *dst, struct page *src) -{ - int i; - struct hstate *h = page_hstate(src); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_highpage(dst, src); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -void copy_huge_page(struct page *dst, struct page *src) -{ - int i; - struct hstate *h = page_hstate(src); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_gigantic_page(dst, src); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_highpage(dst + i, src + i); - } + return 0; } static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); - list_add(&page->lru, &h->hugepage_freelists[nid]); + list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; } @@ -518,19 +525,35 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (list_empty(&h->hugepage_freelists[nid])) + list_for_each_entry(page, &h->hugepage_freelists[nid], lru) + if (!is_migrate_isolate_page(page)) + break; + /* + * If a 'non-isolated free hugepage' is not found on the list, + * the allocation fails. + */ + if (&h->hugepage_freelists[nid] == &page->lru) return NULL; - page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); - list_del(&page->lru); + list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return page; } +/* Movability of hugepages depends on migration support. */ +static inline gfp_t htlb_alloc_mask(struct hstate *h) +{ + if (hugepages_treat_as_movable || hugepage_migration_support(h)) + return GFP_HIGHUSER_MOVABLE; + else + return GFP_HIGHUSER; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve) + unsigned long address, int avoid_reserve, + long chg) { struct page *page = NULL; struct mempolicy *mpol; @@ -540,16 +563,12 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct zoneref *z; unsigned int cpuset_mems_cookie; -retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); - zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are * not "stolen".
The child may still get SIGKILLed */ - if (!vma_has_reserves(vma) && + if (!vma_has_reserves(vma, chg) && h->free_huge_pages - h->resv_huge_pages == 0) goto err; @@ -557,25 +576,34 @@ retry_cpuset: if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) goto err; +retry_cpuset: + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask(h), &mpol, &nodemask); + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); + if (avoid_reserve) + break; + if (!vma_has_reserves(vma, chg)) + break; + + SetPagePrivate(page); + h->resv_huge_pages--; break; } } } mpol_cond_put(mpol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; err: - mpol_cond_put(mpol); return NULL; } @@ -593,6 +621,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1 << PG_writeback); } + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); set_compound_page_dtor(page, NULL); set_page_refcounted(page); arch_release_hugepage(page); @@ -620,19 +649,29 @@ static void free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = (struct hugepage_subpool *)page_private(page); + bool restore_reserve; set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); - INIT_LIST_HEAD(&page->lru); + restore_reserve = PagePrivate(page); + ClearPagePrivate(page); spin_lock(&hugetlb_lock); + hugetlb_cgroup_uncharge_page(hstate_index(h), + pages_per_huge_page(h), page); + if (restore_reserve) + h->resv_huge_pages++; + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { + /* remove the page from active list */ + list_del(&page->lru); update_and_free_page(h, page); h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } else { + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); @@ -641,15 +680,18 @@ static void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { + INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); + set_hugetlb_cgroup(page, NULL); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); put_page(page); /* free it into the hugepage allocator */ } -static void prep_compound_gigantic_page(struct page *page, unsigned long order) +static void __init prep_compound_gigantic_page(struct page *page, + unsigned long order) { int i; int nr_pages = 1 << order; @@ -658,27 +700,71 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); __SetPageHead(page); + __ClearPageReserved(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + /* + * For gigantic hugepages allocated through bootmem at + * boot, it's safer to be consistent with the non-gigantic + * hugepages and clear the PG_reserved bit from all tail pages + * too.
Otherwise drivers using get_user_pages() to access tail + pages may get the reference counting wrong if they see + PG_reserved set on a tail page (despite the head page not + having PG_reserved set). Enforcing this consistency between + head and tail pages allows drivers to optimize away a check + on the head page when they need to know if put_page() is needed + after get_user_pages(). + */ + __ClearPageReserved(p); set_page_count(p, 0); p->first_page = page; } } +/* + * PageHuge() only returns true for hugetlbfs pages, but not for normal or + * transparent huge pages. See the PageTransHuge() documentation for more + * details. + */ int PageHuge(struct page *page) { - compound_page_dtor *dtor; - if (!PageCompound(page)) return 0; page = compound_head(page); - dtor = get_compound_page_dtor(page); - - return dtor == free_huge_page; + return get_compound_page_dtor(page) == free_huge_page; } EXPORT_SYMBOL_GPL(PageHuge); +/* + * PageHeadHuge() only returns true for a hugetlbfs head page, but not for + * normal or transparent huge pages. + */ +int PageHeadHuge(struct page *page_head) +{ + if (!PageHead(page_head)) + return 0; + + return get_compound_page_dtor(page_head) == free_huge_page; +} + +pgoff_t __basepage_index(struct page *page) +{ + struct page *page_head = compound_head(page); + pgoff_t index = page_index(page_head); + unsigned long compound_idx; + + if (!PageHuge(page_head)) + return page_index(page); + + if (compound_order(page_head) >= MAX_ORDER) + compound_idx = page_to_pfn(page) - page_to_pfn(page_head); + else + compound_idx = page - page_head; + + return (index << compound_order(page_head)) + compound_idx; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -687,7 +773,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return NULL; page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { @@ -744,33 +830,6 @@ static int hstate_next_node_to_alloc(struct hstate *h, return nid; } -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) -{ - struct page *page; - int start_nid; - int next_nid; - int ret = 0; - - start_nid = hstate_next_node_to_alloc(h, nodes_allowed); - next_nid = start_nid; - - do { - page = alloc_fresh_huge_page_node(h, next_nid); - if (page) { - ret = 1; - break; - } - next_nid = hstate_next_node_to_alloc(h, nodes_allowed); - } while (next_nid != start_nid); - - if (ret) - count_vm_event(HTLB_BUDDY_PGALLOC); - else - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); - - return ret; -} - /* * helper for free_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page.
Advance the @@ -789,6 +848,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) return nid; } +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +{ + struct page *page; + int nr_nodes, node; + int ret = 0; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_huge_page_node(h, node); + if (page) { + ret = 1; + break; + } + } + + if (ret) + count_vm_event(HTLB_BUDDY_PGALLOC); + else + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + return ret; +} + /* * Free huge page from pool from next node to free. * Attempt to keep persistent huge pages more or less @@ -798,40 +891,73 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, bool acct_surplus) { - int start_nid; - int next_nid; + int nr_nodes, node; int ret = 0; - start_nid = hstate_next_node_to_free(h, nodes_allowed); - next_nid = start_nid; - - do { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine * nodes with surplus pages. */ - if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && - !list_empty(&h->hugepage_freelists[next_nid])) { + if ((!acct_surplus || h->surplus_huge_pages_node[node]) && + !list_empty(&h->hugepage_freelists[node])) { struct page *page = - list_entry(h->hugepage_freelists[next_nid].next, + list_entry(h->hugepage_freelists[node].next, struct page, lru); list_del(&page->lru); h->free_huge_pages--; - h->free_huge_pages_node[next_nid]--; + h->free_huge_pages_node[node]--; if (acct_surplus) { h->surplus_huge_pages--; - h->surplus_huge_pages_node[next_nid]--; + h->surplus_huge_pages_node[node]--; } update_and_free_page(h, page); ret = 1; break; } - next_nid = hstate_next_node_to_free(h, nodes_allowed); - } while (next_nid != start_nid); + } return ret; } +/* + * Dissolve a given free hugepage into free buddy pages. This function does + * nothing for in-use (including surplus) hugepages. + */ +static void dissolve_free_huge_page(struct page *page) +{ + spin_lock(&hugetlb_lock); + if (PageHuge(page) && !page_count(page)) { + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + update_and_free_page(h, page); + } + spin_unlock(&hugetlb_lock); +} + +/* + * Dissolve free hugepages in a given pfn range. Used by memory hotplug to + * make specified memory blocks removable from the system. + * Note that start_pfn should be aligned with (minimum) hugepage size.
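+ * For example, on x86_64 with 2MB hugepages as the smallest hstate + * (order 9), the loop below steps through the range 512 pfns at a time, + * so start_pfn must be a multiple of 512.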
+ */ +void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned int order = 8 * sizeof(void *); + unsigned long pfn; + struct hstate *h; + + /* Set scan step to minimum hugepage size */ + for_each_hstate(h) + if (order > huge_page_order(h)) + order = huge_page_order(h); + VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) + dissolve_free_huge_page(pfn_to_page(pfn)); +} + static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; @@ -874,12 +1000,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); else page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { @@ -889,8 +1015,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_lock(&hugetlb_lock); if (page) { + INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); + set_hugetlb_cgroup(page, NULL); /* * We incremented the global counters already */ @@ -914,10 +1042,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { - struct page *page; + struct page *page = NULL; spin_lock(&hugetlb_lock); - page = dequeue_huge_page_node(h, nid); + if (h->free_huge_pages - h->resv_huge_pages > 0) + page = dequeue_huge_page_node(h, nid); spin_unlock(&hugetlb_lock); if (!page) @@ -993,25 +1122,20 @@ retry: list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) break; - list_del(&page->lru); /* * This page is now managed by the hugetlb allocator and has * no users -- drop the buddy allocator's reference. */ put_page_testzero(page); - VM_BUG_ON(page_count(page)); + VM_BUG_ON_PAGE(page_count(page), page); enqueue_huge_page(h, page); } free: spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ - if (!list_empty(&surplus_list)) { - list_for_each_entry_safe(page, tmp, &surplus_list, lru) { - list_del(&page->lru); - put_page(page); - } - } + list_for_each_entry_safe(page, tmp, &surplus_list, lru) + put_page(page); spin_lock(&hugetlb_lock); return ret; @@ -1046,8 +1170,9 @@ static void return_unused_surplus_pages(struct hstate *h, * on-line nodes with memory and will handle the hstate accounting. 
*/ while (nr_pages--) { - if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) + if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) break; + cond_resched_lock(&hugetlb_lock); } } @@ -1064,45 +1189,34 @@ static void return_unused_surplus_pages(struct hstate *h, static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; - - if (vma->vm_flags & VM_MAYSHARE) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - return region_chg(&inode->i_mapping->private_list, - idx, idx + 1); + struct resv_map *resv; + pgoff_t idx; + long chg; - } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv = vma_resv_map(vma); + if (!resv) return 1; - } else { - long err; - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + idx = vma_hugecache_offset(h, vma, addr); + chg = region_chg(resv, idx, idx + 1); - err = region_chg(&reservations->regions, idx, idx + 1); - if (err < 0) - return err; - return 0; - } + if (vma->vm_flags & VM_MAYSHARE) + return chg; + else + return chg < 0 ? chg : 0; } static void vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; - - if (vma->vm_flags & VM_MAYSHARE) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - region_add(&inode->i_mapping->private_list, idx, idx + 1); + struct resv_map *resv; + pgoff_t idx; - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + resv = vma_resv_map(vma); + if (!resv) + return; - /* Mark this page used in the map. 
*/ - region_add(&reservations->regions, idx, idx + 1); - } + idx = vma_hugecache_offset(h, vma, addr); + region_add(resv, idx, idx + 1); } static struct page *alloc_huge_page(struct vm_area_struct *vma, @@ -1112,7 +1226,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct page *page; long chg; + int ret, idx; + struct hugetlb_cgroup *h_cg; + idx = hstate_index(h); /* * Processes that did not create the mapping will have no * reserves and will not have accounted against subpool @@ -1123,43 +1240,68 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, */ chg = vma_needs_reservation(h, vma, addr); if (chg < 0) - return ERR_PTR(-VM_FAULT_OOM); - if (chg) - if (hugepage_subpool_get_pages(spool, chg)) - return ERR_PTR(-VM_FAULT_SIGBUS); + return ERR_PTR(-ENOMEM); + if (chg || avoid_reserve) + if (hugepage_subpool_get_pages(spool, 1)) + return ERR_PTR(-ENOSPC); + ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); + if (ret) { + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); + return ERR_PTR(-ENOSPC); + } spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); - spin_unlock(&hugetlb_lock); - + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); if (!page) { + spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { - hugepage_subpool_put_pages(spool, chg); - return ERR_PTR(-VM_FAULT_SIGBUS); + hugetlb_cgroup_uncharge_cgroup(idx, + pages_per_huge_page(h), + h_cg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); + return ERR_PTR(-ENOSPC); } + spin_lock(&hugetlb_lock); + list_move(&page->lru, &h->hugepage_activelist); + /* Fall through */ } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + spin_unlock(&hugetlb_lock); set_page_private(page, (unsigned long)spool); vma_commit_reservation(h, vma, addr); + return page; +} +/* + * alloc_huge_page()'s wrapper which simply returns the page if allocation + * succeeds, otherwise NULL. This function is called from new_vma_page(), + * where no ERR_VALUE is expected to be returned. 
+ */ +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +{ + struct page *page = alloc_huge_page(vma, addr, avoid_reserve); + if (IS_ERR(page)) + page = NULL; return page; } int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; - int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); + int nr_nodes, node; - while (nr_nodes) { + for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = __alloc_bootmem_node_nopanic( - NODE_DATA(hstate_next_node_to_alloc(h, - &node_states[N_HIGH_MEMORY])), - huge_page_size(h), huge_page_size(h), 0); - + addr = memblock_virt_alloc_try_nid_nopanic( + huge_page_size(h), huge_page_size(h), + 0, BOOTMEM_ALLOC_ACCESSIBLE, node); if (addr) { /* * Use the beginning of the huge page to store the @@ -1169,7 +1311,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) m = addr; goto found; } - nr_nodes--; } return 0; @@ -1181,7 +1322,7 @@ found: return 1; } -static void prep_compound_huge_page(struct page *page, int order) +static void __init prep_compound_huge_page(struct page *page, int order) { if (unlikely(order > (MAX_ORDER - 1))) prep_compound_gigantic_page(page, order); @@ -1200,14 +1341,14 @@ static void __init gather_bootmem_prealloc(void) #ifdef CONFIG_HIGHMEM page = pfn_to_page(m->phys >> PAGE_SHIFT); - free_bootmem_late((unsigned long)m, - sizeof(struct huge_bootmem_page)); + memblock_free_late(__pa(m), + sizeof(struct huge_bootmem_page)); #else page = virt_to_page(m); #endif - __ClearPageReserved(page); WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); + WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); /* * If we had gigantic hugepages allocated at boot time, we need @@ -1216,7 +1357,7 @@ static void __init gather_bootmem_prealloc(void) * side-effects, like CommitLimit going negative. 
*/ if (h->order > (MAX_ORDER - 1)) - totalram_pages += 1 << h->order; + adjust_managed_page_count(page, 1 << h->order); } } @@ -1229,7 +1370,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (!alloc_bootmem_huge_page(h)) break; } else if (!alloc_fresh_huge_page(h, - &node_states[N_HIGH_MEMORY])) + &node_states[N_MEMORY])) break; } h->max_huge_pages = i; @@ -1263,8 +1404,7 @@ static void __init report_hugepages(void) for_each_hstate(h) { char buf[32]; - printk(KERN_INFO "HugeTLB registered %s page size, " - "pre-allocated %ld pages\n", + pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", memfmt(buf, huge_page_size(h)), h->free_huge_pages); } @@ -1309,48 +1449,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count, static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, int delta) { - int start_nid, next_nid; - int ret = 0; + int nr_nodes, node; VM_BUG_ON(delta != -1 && delta != 1); - if (delta < 0) - start_nid = hstate_next_node_to_alloc(h, nodes_allowed); - else - start_nid = hstate_next_node_to_free(h, nodes_allowed); - next_nid = start_nid; - - do { - int nid = next_nid; - if (delta < 0) { - /* - * To shrink on this node, there must be a surplus page - */ - if (!h->surplus_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_alloc(h, - nodes_allowed); - continue; - } + if (delta < 0) { + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node]) + goto found; } - if (delta > 0) { - /* - * Surplus cannot exceed the total number of pages - */ - if (h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_free(h, - nodes_allowed); - continue; - } + } else { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node] < + h->nr_huge_pages_node[node]) + goto found; } + } + return 0; - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (next_nid != start_nid); - - return ret; +found: + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[node] += delta; + return 1; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) @@ -1417,6 +1537,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, while (min_count < persistent_huge_pages(h)) { if (!free_pool_huge_page(h, nodes_allowed, 0)) break; + cond_resched_lock(&hugetlb_lock); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) @@ -1480,7 +1601,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = strict_strtoul(buf, 10, &count); + err = kstrtoul(buf, 10, &count); if (err) goto out; @@ -1497,7 +1618,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, if (!(obey_mempolicy && init_nodemask_of_mempolicy(nodes_allowed))) { NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_HIGH_MEMORY]; + nodes_allowed = &node_states[N_MEMORY]; } } else if (nodes_allowed) { /* @@ -1507,11 +1628,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; init_nodemask_of_node(nodes_allowed, nid); } else - nodes_allowed = &node_states[N_HIGH_MEMORY]; + nodes_allowed = &node_states[N_MEMORY]; h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); - if (nodes_allowed != 
&node_states[N_HIGH_MEMORY]) + if (nodes_allowed != &node_states[N_MEMORY]) NODEMASK_FREE(nodes_allowed); return len; @@ -1571,7 +1692,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (h->order >= MAX_ORDER) return -EINVAL; - err = strict_strtoul(buf, 10, &input); + err = kstrtoul(buf, 10, &input); if (err) return err; @@ -1646,7 +1767,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct attribute_group *hstate_attr_group) { int retval; - int hi = h - hstates; + int hi = hstate_index(h); hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); if (!hstate_kobjs[hi]) @@ -1672,8 +1793,7 @@ static void __init hugetlb_sysfs_init(void) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) - printk(KERN_ERR "Hugetlb: Unable to add hstate %s", - h->name); + pr_err("Hugetlb: Unable to add hstate %s", h->name); } } @@ -1733,7 +1853,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ -void hugetlb_unregister_node(struct node *node) +static void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -1741,11 +1861,13 @@ void hugetlb_unregister_node(struct node *node) if (!nhs->hugepages_kobj) return; /* no hstate attributes */ - for_each_hstate(h) - if (nhs->hstate_kobjs[h - hstates]) { - kobject_put(nhs->hstate_kobjs[h - hstates]); - nhs->hstate_kobjs[h - hstates] = NULL; + for_each_hstate(h) { + int idx = hstate_index(h); + if (nhs->hstate_kobjs[idx]) { + kobject_put(nhs->hstate_kobjs[idx]); + nhs->hstate_kobjs[idx] = NULL; } + } kobject_put(nhs->hugepages_kobj); nhs->hugepages_kobj = NULL; @@ -1768,14 +1890,14 @@ static void hugetlb_unregister_all_nodes(void) * remove hstate attributes from any nodes that have them. */ for (nid = 0; nid < nr_node_ids; nid++) - hugetlb_unregister_node(&node_devices[nid]); + hugetlb_unregister_node(node_devices[nid]); } /* * Register hstate attributes for a single node device. * No-op if attributes already registered. */ -void hugetlb_register_node(struct node *node) +static void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -1794,9 +1916,8 @@ void hugetlb_register_node(struct node *node) nhs->hstate_kobjs, &per_node_hstate_attr_group); if (err) { - printk(KERN_ERR "Hugetlb: Unable to add hstate %s" - " for node %d\n", - h->name, node->dev.id); + pr_err("Hugetlb: Unable to add hstate %s for node %d\n", + h->name, node->dev.id); hugetlb_unregister_node(node); break; } @@ -1812,8 +1933,8 @@ static void hugetlb_register_all_nodes(void) { int nid; - for_each_node_state(nid, N_HIGH_MEMORY) { - struct node *node = &node_devices[nid]; + for_each_node_state(nid, N_MEMORY) { + struct node *node = node_devices[nid]; if (node->dev.id == nid) hugetlb_register_node(node); } @@ -1848,20 +1969,19 @@ static void __exit hugetlb_exit(void) hugetlb_unregister_all_nodes(); for_each_hstate(h) { - kobject_put(hstate_kobjs[h - hstates]); + kobject_put(hstate_kobjs[hstate_index(h)]); } kobject_put(hugepages_kobj); + kfree(htlb_fault_mutex_table); } module_exit(hugetlb_exit); static int __init hugetlb_init(void) { - /* Some platform decide whether they support huge pages at boot - * time. 
On these, such as powerpc, HPAGE_SHIFT is set to 0 when - * there is no such support - */ - if (HPAGE_SHIFT == 0) + int i; + + if (!hugepages_supported()) return 0; if (!size_to_hstate(default_hstate_size)) { @@ -1869,20 +1989,29 @@ static int __init hugetlb_init(void) if (!size_to_hstate(default_hstate_size)) hugetlb_add_hstate(HUGETLB_PAGE_ORDER); } - default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; + default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); if (default_hstate_max_huge_pages) default_hstate.max_huge_pages = default_hstate_max_huge_pages; hugetlb_init_hstates(); - gather_bootmem_prealloc(); - report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); + hugetlb_cgroup_file_init(); +#ifdef CONFIG_SMP + num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); +#else + num_fault_mutexes = 1; +#endif + htlb_fault_mutex_table = + kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); + BUG_ON(!htlb_fault_mutex_table); + + for (i = 0; i < num_fault_mutexes; i++) + mutex_init(&htlb_fault_mutex_table[i]); return 0; } module_init(hugetlb_init); @@ -1894,20 +2023,21 @@ void __init hugetlb_add_hstate(unsigned order) unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { - printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); + pr_warning("hugepagesz= specified twice, ignoring\n"); return; } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order == 0); - h = &hstates[max_hstate++]; + h = &hstates[hugetlb_max_hstate++]; h->order = order; h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); h->nr_huge_pages = 0; h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); - h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); - h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); + INIT_LIST_HEAD(&h->hugepage_activelist); + h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); + h->next_nid_to_free = first_node(node_states[N_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -1920,17 +2050,17 @@ static int __init hugetlb_nrpages_setup(char *s) static unsigned long *last_mhp; /* - * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, * so this hugepages= parameter goes to the "default hstate". */ - if (!max_hstate) + if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { - printk(KERN_WARNING "hugepages= specified twice without " - "interleaving hugepagesz=, ignoring\n"); + pr_warning("hugepages= specified twice without " + "interleaving hugepagesz=, ignoring\n"); return 1; } @@ -1942,7 +2072,7 @@ static int __init hugetlb_nrpages_setup(char *s) * But we need to allocate >= MAX_ORDER hstates here early to still * use the bootmem allocator. 
*/ - if (max_hstate && parsed_hstate->order >= MAX_ORDER) + if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) hugetlb_hstate_alloc_pages(parsed_hstate); last_mhp = mhp; @@ -1978,6 +2108,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->max_huge_pages; if (write && h->order >= MAX_ORDER) @@ -1995,11 +2128,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (!(obey_mempolicy && init_nodemask_of_mempolicy(nodes_allowed))) { NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_HIGH_MEMORY]; + nodes_allowed = &node_states[N_MEMORY]; } h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); - if (nodes_allowed != &node_states[N_HIGH_MEMORY]) + if (nodes_allowed != &node_states[N_MEMORY]) NODEMASK_FREE(nodes_allowed); } out: @@ -2023,18 +2156,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, } #endif /* CONFIG_NUMA */ -int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - proc_dointvec(table, write, buffer, length, ppos); - if (hugepages_treat_as_movable) - htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; - else - htlb_alloc_mask = GFP_HIGHUSER; - return 0; -} - int hugetlb_overcommit_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -2043,6 +2164,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->nr_overcommit_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2068,6 +2192,8 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return; seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" @@ -2084,6 +2210,8 @@ void hugetlb_report_meminfo(struct seq_file *m) int hugetlb_report_node_meminfo(int nid, char *buf) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return 0; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" @@ -2093,11 +2221,33 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, h->surplus_huge_pages_node[nid]); } +void hugetlb_show_meminfo(void) +{ + struct hstate *h; + int nid; + + if (!hugepages_supported()) + return; + + for_each_node_state(nid, N_MEMORY) + for_each_hstate(h) + pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", + nid, + h->nr_huge_pages_node[nid], + h->free_huge_pages_node[nid], + h->surplus_huge_pages_node[nid], + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); +} + /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { - struct hstate *h = &default_hstate; - return h->nr_huge_pages * pages_per_huge_page(h); + struct hstate *h; + unsigned long nr_total_pages = 0; + + for_each_hstate(h) + nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); + return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) @@ -2143,7 +2293,7 @@ out: static void hugetlb_vm_op_open(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* * This new VMA should share its siblings reservation map if present. @@ -2153,32 +2303,30 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes.
It is therefore safe to take a * new reference here without additional locking. */ - if (reservations) - kref_get(&reservations->refs); + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_get(&resv->refs); } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); - unsigned long reserve; - unsigned long start; - unsigned long end; + unsigned long reserve, start, end; - if (reservations) { - start = vma_hugecache_offset(h, vma, vma->vm_start); - end = vma_hugecache_offset(h, vma, vma->vm_end); + if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return; - reserve = (end - start) - - region_count(&reservations->regions, start, end); + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); - kref_put(&reservations->refs, resv_map_release); + reserve = (end - start) - region_count(resv, start, end); - if (reserve) { - hugetlb_acct_memory(h, -reserve); - hugepage_subpool_put_pages(spool, reserve); - } + kref_put(&resv->refs, resv_map_release); + + if (reserve) { + hugetlb_acct_memory(h, -reserve); + hugepage_subpool_put_pages(spool, reserve); } } @@ -2206,13 +2354,15 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, pte_t entry; if (writable) { - entry = - pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, + vma->vm_page_prot))); } else { - entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + entry = huge_pte_wrprotect(mk_huge_pte(page, + vma->vm_page_prot)); } entry = pte_mkyoung(entry); entry = pte_mkhuge(entry); + entry = arch_make_huge_pte(entry, vma, page, writable); return entry; } @@ -2222,7 +2372,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, { pte_t entry; - entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); + entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } @@ -2237,23 +2387,35 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int cow; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + int ret = 0; cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + mmun_start = vma->vm_start; + mmun_end = vma->vm_end; + if (cow) + mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { + spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; dst_pte = huge_pte_alloc(dst, addr, sz); - if (!dst_pte) - goto nomem; + if (!dst_pte) { + ret = -ENOMEM; + break; + } /* If the pagetables are shared don't copy or take references */ if (dst_pte == src_pte) continue; - spin_lock(&dst->page_table_lock); - spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); if (!huge_pte_none(huge_ptep_get(src_pte))) { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); @@ -2263,13 +2425,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, page_dup_rmap(ptepage); set_huge_pte_at(dst, 
addr, dst_pte, entry); } - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); } - return 0; -nomem: - return -ENOMEM; + if (cow) + mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); + + return ret; } static int is_hugetlb_entry_migration(pte_t pte) @@ -2298,48 +2461,49 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) return 0; } -void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) +void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page) { + int force_flush = 0; struct mm_struct *mm = vma->vm_mm; unsigned long address; pte_t *ptep; pte_t pte; + spinlock_t *ptl; struct page *page; - struct page *tmp; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - - /* - * A page gathering list, protected by per file i_mmap_mutex. The - * lock is used to avoid list corruption from multiple unmapping - * of the same page since we are using page->lru. - */ - LIST_HEAD(page_list); + const unsigned long mmun_start = start; /* For mmu_notifiers */ + const unsigned long mmun_end = end; /* For mmu_notifiers */ WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); - mmu_notifier_invalidate_range_start(mm, start, end); - spin_lock(&mm->page_table_lock); + tlb_start_vma(tlb, vma); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); +again: for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; + ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) - continue; + goto unlock; pte = huge_ptep_get(ptep); if (huge_pte_none(pte)) - continue; + goto unlock; /* * HWPoisoned hugepage is already unmapped and dropped reference */ - if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) - continue; + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { + huge_pte_clear(mm, address, ptep); + goto unlock; + } page = pte_page(pte); /* @@ -2349,7 +2513,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, */ if (ref_page) { if (page != ref_page) - continue; + goto unlock; /* * Mark the VMA as having unmapped its page so that @@ -2360,30 +2524,69 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } pte = huge_ptep_get_and_clear(mm, address, ptep); - if (pte_dirty(pte)) + tlb_remove_tlb_entry(tlb, ptep, address); + if (huge_pte_dirty(pte)) set_page_dirty(page); - list_add(&page->lru, &page_list); + page_remove_rmap(page); + force_flush = !__tlb_remove_page(tlb, page); + if (force_flush) { + spin_unlock(ptl); + break; + } /* Bail out after unmapping reference page if supplied */ - if (ref_page) + if (ref_page) { + spin_unlock(ptl); break; + } +unlock: + spin_unlock(ptl); } - flush_tlb_range(vma, start, end); - spin_unlock(&mm->page_table_lock); - mmu_notifier_invalidate_range_end(mm, start, end); - list_for_each_entry_safe(page, tmp, &page_list, lru) { - page_remove_rmap(page); - list_del(&page->lru); - put_page(page); + /* + * mmu_gather ran out of room to batch pages, we break out of + * the PTE lock to avoid doing the potentially expensive TLB invalidate + * and page-free while holding it.
+ */ + if (force_flush) { + force_flush = 0; + tlb_flush_mmu(tlb); + if (address < end && !ref_page) + goto again; } + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + tlb_end_vma(tlb, vma); +} + +void __unmap_hugepage_range_final(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) +{ + __unmap_hugepage_range(tlb, vma, start, end, ref_page); + + /* + * Clear this flag so that x86's huge_pmd_share page_table_shareable + * test will fail on a vma being torn down, and not grab a page table + * on its way out. We're lucky that the flag has such an appropriate + * name, and can in fact be safely cleared here. We could clear it + * before the __unmap_hugepage_range above, but all that's necessary + * is to clear it before releasing the i_mmap_mutex. This works + * because in the context this is called, the VMA is about to be + * destroyed and the i_mmap_mutex is held. + */ + vma->vm_flags &= ~VM_MAYSHARE; } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); - __unmap_hugepage_range(vma, start, end, ref_page); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + struct mm_struct *mm; + struct mmu_gather tlb; + + mm = vma->vm_mm; + + tlb_gather_mmu(&tlb, mm, start, end); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page); + tlb_finish_mmu(&tlb, start, end); } /* @@ -2398,7 +2601,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -2406,8 +2608,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); - pgoff = vma_hugecache_offset(h, vma, address); - mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + mapping = file_inode(vma->vm_file)->i_mapping; /* * Take the mapping lock for the duration of the table walk. As @@ -2415,7 +2618,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * __unmap_hugepage_range() is called as the lock is already held */ mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; @@ -2428,9 +2631,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from the time of fork. 
This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) - __unmap_hugepage_range(iter_vma, - address, address + huge_page_size(h), - page); + unmap_hugepage_range(iter_vma, address, + address + huge_page_size(h), page); } mutex_unlock(&mapping->i_mmap_mutex); @@ -2445,22 +2647,21 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page) + struct page *pagecache_page, spinlock_t *ptl) { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int avoidcopy; int outside_reserve = 0; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_page = pte_page(pte); retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_mapcount(old_page) == 1); - if (avoidcopy) { - if (PageAnon(old_page)) - page_move_anon_rmap(old_page, vma, address); + if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2474,18 +2675,18 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range. */ - if (!(vma->vm_flags & VM_MAYSHARE) && - is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1; page_cache_get(old_page); - /* Drop page_table_lock as buddy allocator may be called */ - spin_unlock(&mm->page_table_lock); + /* Drop page table lock as buddy allocator may be called */ + spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { + long err = PTR_ERR(new_page); page_cache_release(old_page); /* @@ -2499,13 +2700,14 @@ retry_avoidcopy: BUG_ON(huge_pte_none(pte)); if (unmap_ref_private(mm, vma, old_page, address)) { BUG_ON(huge_pte_none(pte)); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(pte_same(huge_ptep_get(ptep), pte))) + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; /* - * race occurs while re-acquiring page_table_lock, and - * our job is done. + * race occurs while re-acquiring page table + * lock, and our job is done. 
*/ return 0; } @@ -2513,8 +2715,11 @@ retry_avoidcopy: } /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); - return -PTR_ERR(new_page); + spin_lock(ptl); + if (err == -ENOMEM) + return VM_FAULT_OOM; + else + return VM_FAULT_SIGBUS; } /* @@ -2525,7 +2730,7 @@ retry_avoidcopy: page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); return VM_FAULT_OOM; } @@ -2533,17 +2738,19 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); + mmun_start = address & huge_page_mask(h); + mmun_end = mmun_start + huge_page_size(h); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* - * Retake the page_table_lock to check for racing updates + * Retake the page table lock to check for racing updates * before the page tables are altered */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(pte_same(huge_ptep_get(ptep), pte))) { + if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { + ClearPagePrivate(new_page); + /* Break COW */ - mmu_notifier_invalidate_range_start(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); @@ -2551,12 +2758,14 @@ retry_avoidcopy: hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; - mmu_notifier_invalidate_range_end(mm, - address & huge_page_mask(h), - (address & huge_page_mask(h)) + huge_page_size(h)); } + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); page_cache_release(new_page); page_cache_release(old_page); + + /* Caller expects lock to be held */ + spin_lock(ptl); return 0; } @@ -2594,16 +2803,16 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int flags) + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; int anon_rmap = 0; - pgoff_t idx; unsigned long size; struct page *page; - struct address_space *mapping; pte_t new_pte; + spinlock_t *ptl; /* * Currently, we are forced to kill the process in the event the @@ -2611,15 +2820,11 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * COW. Warn that such a situation has occurred as it may not be obvious */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { - printk(KERN_WARNING - "PID %d killed due to inadequate hugepage pool\n", - current->pid); + pr_warning("PID %d killed due to inadequate hugepage pool\n", + current->pid); return ret; } - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); - /* * Use page lock to guard against racing truncation * before we get page_table_lock. 
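[For illustration: the private-mapping fault paths above can be exercised from user space. A minimal sketch, assuming an x86_64 kernel with MAP_HUGETLB support and at least two 2MB hugepages reserved in the pool: the parent's first touch goes through hugetlb_no_page(), and the child's post-fork write goes through hugetlb_cow():

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000	/* x86 value, for older libc headers */
#endif

#define HUGE_SZ (2UL << 20)	/* assumes a 2MB default hugepage size */

int main(void)
{
	/* first write fault populates the hugepage via hugetlb_no_page() */
	char *p = mmap(NULL, HUGE_SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");		/* e.g. empty hugepage pool */
		return 1;
	}
	memset(p, 0xaa, HUGE_SZ);

	if (fork() == 0) {
		/*
		 * copy_hugetlb_page_range() write-protected both ptes at
		 * fork, so this write faults into hugetlb_cow() and needs
		 * a second hugepage; with none available the child is
		 * killed, as the comments above describe.
		 */
		p[0] = 0x55;
		_exit(0);
	}
	wait(NULL);
	printf("parent still sees 0x%02x\n", (unsigned char)p[0]);	/* 0xaa */
	return 0;
}

]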
@@ -2632,7 +2837,11 @@ retry: goto out; page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { - ret = -PTR_ERR(page); + ret = PTR_ERR(page); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); @@ -2649,6 +2858,7 @@ retry: goto retry; goto out; } + ClearPagePrivate(page); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); @@ -2669,7 +2879,7 @@ retry: */ if (unlikely(PageHWPoison(page))) { ret = VM_FAULT_HWPOISON | - VM_FAULT_SET_HINDEX(h - hstates); + VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } } @@ -2686,7 +2896,8 @@ retry: goto backout_unlocked; } - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -2695,9 +2906,10 @@ retry: if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; - if (anon_rmap) + if (anon_rmap) { + ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); - else + } else page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); @@ -2705,32 +2917,69 @@ retry: if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); unlock_page(page); out: return ret; backout: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); backout_unlocked: unlock_page(page); put_page(page); goto out; } +#ifdef CONFIG_SMP +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + unsigned long key[2]; + u32 hash; + + if (vma->vm_flags & VM_SHARED) { + key[0] = (unsigned long) mapping; + key[1] = idx; + } else { + key[0] = (unsigned long) mm; + key[1] = address >> huge_page_shift(h); + } + + hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + + return hash & (num_fault_mutexes - 1); +} +#else +/* + * For uniprocessor systems we always use a single mutex, so just + * return 0 and avoid the hashing overhead.
+ */ +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + return 0; +} +#endif + int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep; - pte_t entry; + pte_t *ptep, entry; + spinlock_t *ptl; int ret; + u32 hash; + pgoff_t idx; struct page *page = NULL; struct page *pagecache_page = NULL; - static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + struct address_space *mapping; address &= huge_page_mask(h); @@ -2738,26 +2987,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait(mm, (pmd_t *)ptep, address); + migration_entry_wait_huge(vma, mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | - VM_FAULT_SET_HINDEX(h - hstates); + VM_FAULT_SET_HINDEX(hstate_index(h)); } ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - mutex_lock(&hugetlb_instantiation_mutex); + hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); + mutex_lock(&htlb_fault_mutex_table[hash]); + entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, flags); + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); goto out_mutex; } @@ -2771,7 +3025,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * page now as it is used to determine if a reservation has been * consumed. 
*/ - if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { + if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; @@ -2794,27 +3048,28 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (page != pagecache_page) lock_page(page); - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) - goto out_page_table_lock; + goto out_ptl; if (flags & FAULT_FLAG_WRITE) { - if (!pte_write(entry)) { + if (!huge_pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page); - goto out_page_table_lock; + pagecache_page, ptl); + goto out_ptl; } - entry = pte_mkdirty(entry); + entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (huge_ptep_set_access_flags(vma, address, ptep, entry, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, ptep); -out_page_table_lock: - spin_unlock(&mm->page_table_lock); +out_ptl: + spin_unlock(ptl); if (pagecache_page) { unlock_page(pagecache_page); @@ -2825,33 +3080,23 @@ out_page_table_lock: put_page(page); out_mutex: - mutex_unlock(&hugetlb_instantiation_mutex); - + mutex_unlock(&htlb_fault_mutex_table[hash]); return ret; } -/* Can be overriden by architectures */ -__attribute__((weak)) struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) -{ - BUG(); - return NULL; -} - -int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i, - unsigned int flags) +long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, unsigned long *nr_pages, + long i, unsigned int flags) { unsigned long pfn_offset; unsigned long vaddr = *position; - int remainder = *length; + unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); - spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + spinlock_t *ptl = NULL; int absent; struct page *page; @@ -2859,8 +3104,12 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the * first, for the page indexing below to work. + * + * Note that page table lock is not held when pte is null. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (pte) + ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); /* @@ -2872,18 +3121,31 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if (absent && (flags & FOLL_DUMP) && !hugetlbfs_pagecache_present(h, vma, vaddr)) { + if (pte) + spin_unlock(ptl); remainder = 0; break; } - if (absent || - ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { + /* + * We need to call hugetlb_fault for both hugepages under migration + * (in which case hugetlb_fault waits for the migration) and + * hwpoisoned hugepages (in which case we need to prevent the + * caller from accessing them). In order to do this, we use + * is_swap_pte here instead of is_hugetlb_entry_migration and + * is_hugetlb_entry_hwpoisoned. This is because it simply covers + * both cases, and because we can't follow correct pages + * directly from any kind of swap entries.
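+ * (Both hugepage migration entries and hwpoison entries are encoded as + * non-present swap-format ptes, which is why the single is_swap_pte() + * check below covers the two cases at once.)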
+ */ + if (absent || is_swap_pte(huge_ptep_get(pte)) || + ((flags & FOLL_WRITE) && + !huge_pte_write(huge_ptep_get(pte)))) { int ret; - spin_unlock(&mm->page_table_lock); + if (pte) + spin_unlock(ptl); ret = hugetlb_fault(mm, vma, vaddr, (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); - spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; @@ -2896,7 +3158,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); - get_page(pages[i]); + get_page_foll(pages[i]); } if (vmas) @@ -2914,15 +3176,15 @@ same_page: */ goto same_page; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); - *length = remainder; + *nr_pages = remainder; *position = vaddr; return i ? i : -EFAULT; } -void hugetlb_change_protection(struct vm_area_struct *vma, +unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { struct mm_struct *mm = vma->vm_mm; @@ -2930,28 +3192,44 @@ void hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); + unsigned long pages = 0; BUG_ON(address >= end); flush_cache_range(vma, address, end); + mmu_notifier_invalidate_range_start(mm, start, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); - spin_lock(&mm->page_table_lock); for (; address < end; address += huge_page_size(h)) { + spinlock_t *ptl; ptep = huge_pte_offset(mm, address); if (!ptep) continue; - if (huge_pmd_unshare(mm, &address, ptep)) + ptl = huge_pte_lock(h, mm, ptep); + if (huge_pmd_unshare(mm, &address, ptep)) { + pages++; + spin_unlock(ptl); continue; + } if (!huge_pte_none(huge_ptep_get(ptep))) { pte = huge_ptep_get_and_clear(mm, address, ptep); - pte = pte_mkhuge(pte_modify(pte, newprot)); + pte = pte_mkhuge(huge_pte_modify(pte, newprot)); + pte = arch_make_huge_pte(pte, vma, NULL, 0); set_huge_pte_at(mm, address, ptep, pte); + pages++; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); + /* + * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare + * may have cleared our pud entry and done put_page on the page table: + * once we release i_mmap_mutex, another task can do the final put_page + * and that page table be reused and filled with junk. + */ + flush_tlb_range(vma, start, end); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + mmu_notifier_invalidate_range_end(mm, start, end); - flush_tlb_range(vma, start, end); + return pages << h->order; } int hugetlb_reserve_pages(struct inode *inode, @@ -2962,6 +3240,7 @@ int hugetlb_reserve_pages(struct inode *inode, long ret, chg; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); + struct resv_map *resv_map; /* * Only apply hugepage reservation if asked. At fault time, an @@ -2977,10 +3256,13 @@ int hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. 
Assume !vma is a shm mapping */ - if (!vma || vma->vm_flags & VM_MAYSHARE) - chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); + if (!vma || vma->vm_flags & VM_MAYSHARE) { + resv_map = inode_resv_map(inode); + + chg = region_chg(resv_map, from, to); + + } else { + resv_map = resv_map_alloc(); if (!resv_map) return -ENOMEM; @@ -2990,12 +3272,16 @@ int hugetlb_reserve_pages(struct inode *inode, set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } - if (chg < 0) - return chg; + if (chg < 0) { + ret = chg; + goto out_err; + } /* There must be enough pages in the subpool for the mapping */ - if (hugepage_subpool_get_pages(spool, chg)) - return -ENOSPC; + if (hugepage_subpool_get_pages(spool, chg)) { + ret = -ENOSPC; + goto out_err; + } /* * Check enough hugepages are available for the reservation. @@ -3004,7 +3290,7 @@ int hugetlb_reserve_pages(struct inode *inode, ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugepage_subpool_put_pages(spool, chg); - return ret; + goto out_err; } /* @@ -3019,16 +3305,23 @@ int hugetlb_reserve_pages(struct inode *inode, * else has to be done for private mappings here */ if (!vma || vma->vm_flags & VM_MAYSHARE) - region_add(&inode->i_mapping->private_list, from, to); + region_add(resv_map, from, to); return 0; +out_err: + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_put(&resv_map->refs, resv_map_release); + return ret; } void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { struct hstate *h = hstate_inode(inode); - long chg = region_truncate(&inode->i_mapping->private_list, offset); + struct resv_map *resv_map = inode_resv_map(inode); + long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); + if (resv_map) + chg = region_truncate(resv_map, offset); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); @@ -3037,6 +3330,218 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_acct_memory(h, -(chg - freed)); } +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +static unsigned long page_table_shareable(struct vm_area_struct *svma, + struct vm_area_struct *vma, + unsigned long addr, pgoff_t idx) +{ + unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + + svma->vm_start; + unsigned long sbase = saddr & PUD_MASK; + unsigned long s_end = sbase + PUD_SIZE; + + /* Allow segments to share if only one is marked locked */ + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; + + /* + * match the virtual addresses, permission and the alignment of the + * page table page. + */ + if (pmd_index(addr) != pmd_index(saddr) || + vm_flags != svm_flags || + sbase < svma->vm_start || svma->vm_end < s_end) + return 0; + + return saddr; +} + +static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long base = addr & PUD_MASK; + unsigned long end = base + PUD_SIZE; + + /* + * check on proper vm_flags and page table alignment + */ + if (vma->vm_flags & VM_MAYSHARE && + vma->vm_start <= base && end <= vma->vm_end) + return 1; + return 0; +} + +/* + * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() + * and returns the corresponding pte. While this is not necessary for the + * !shared pmd case because we can allocate the pmd later as well, it makes the + * code much cleaner. 
pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_mutex section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. + */ +pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +{ + struct vm_area_struct *vma = find_vma(mm, addr); + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + struct vm_area_struct *svma; + unsigned long saddr; + pte_t *spte = NULL; + pte_t *pte; + spinlock_t *ptl; + + if (!vma_shareable(vma, addr)) + return (pte_t *)pmd_alloc(mm, pud, addr); + + mutex_lock(&mapping->i_mmap_mutex); + vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { + if (svma == vma) + continue; + + saddr = page_table_shareable(svma, vma, addr, idx); + if (saddr) { + spte = huge_pte_offset(svma->vm_mm, saddr); + if (spte) { + get_page(virt_to_page(spte)); + break; + } + } + } + + if (!spte) + goto out; + + ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); + spin_lock(ptl); + if (pud_none(*pud)) + pud_populate(mm, pud, + (pmd_t *)((unsigned long)spte & PAGE_MASK)); + else + put_page(virt_to_page(spte)); + spin_unlock(ptl); +out: + pte = (pte_t *)pmd_alloc(mm, pud, addr); + mutex_unlock(&mapping->i_mmap_mutex); + return pte; +} + +/* + * unmap huge page backed by shared pte. + * + * Hugetlb pte page is ref counted at the time of mapping. If pte is shared + * indicated by page_count > 1, unmap is achieved by clearing pud and + * decrementing the ref count. If count == 1, the pte page is not shared. + * + * called with page table lock held. + * + * returns: 1 successfully unmapped a shared pte page + * 0 the underlying pte page is not shared, or it is the last user + */ +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + pgd_t *pgd = pgd_offset(mm, *addr); + pud_t *pud = pud_offset(pgd, *addr); + + BUG_ON(page_count(virt_to_page(ptep)) == 0); + if (page_count(virt_to_page(ptep)) == 1) + return 0; + + pud_clear(pud); + put_page(virt_to_page(ptep)); + *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; + return 1; +} +#define want_pmd_share() (1) +#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +{ + return NULL; +} +#define want_pmd_share() (0) +#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ + +#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + pud_t *pud; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); + if (pud) { + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else { + BUG_ON(sz != PMD_SIZE); + if (want_pmd_share() && pud_none(*pud)) + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); + } + } + BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); + + return pte; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (pud_present(*pud)) { + if (pud_huge(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, addr); + } + } + return (pte_t *) pmd; +} + +struct page * +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write) +{ + struct page *page; + + page = pte_page(*(pte_t *)pmd); + if (page) + page += 
((address & ~PMD_MASK) >> PAGE_SHIFT); + return page; +} + +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + struct page *page; + + page = pte_page(*(pte_t *)pud); + if (page) + page += ((address & ~PUD_MASK) >> PAGE_SHIFT); + return page; +} + +#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ + +/* Can be overridden by architectures */ +struct page * __weak +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + BUG(); + return NULL; +} + +#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ + #ifdef CONFIG_MEMORY_FAILURE /* Should be called in hugetlb_lock */ @@ -3065,7 +3570,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) spin_lock(&hugetlb_lock); if (is_hugepage_on_freelist(hpage)) { - list_del(&hpage->lru); + /* + * Hwpoisoned hugepage isn't linked to activelist or freelist, + * but dangling hpage->lru can trigger list-debug warnings + * (this happens when we call unpoison_memory() on it), + * so let it point to itself with list_del_init(). + */ + list_del_init(&hpage->lru); set_page_refcounted(hpage); h->free_huge_pages--; h->free_huge_pages_node[nid]--; @@ -3075,3 +3586,45 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) return ret; } #endif + +bool isolate_huge_page(struct page *page, struct list_head *list) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + if (!get_page_unless_zero(page)) + return false; + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, list); + spin_unlock(&hugetlb_lock); + return true; +} + +void putback_active_hugepage(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + spin_unlock(&hugetlb_lock); + put_page(page); +} + +bool is_hugepage_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHuge(page), page); + /* + * This function can be called for a tail page because the caller, + * scan_movable_pages, scans through a given pfn-range which typically + * covers one memory block. In systems using gigantic hugepages (1GB + * on x86_64), a hugepage is larger than a memory block, and we don't + * support migrating such large hugepages for now, so return false + * when called for tail pages. + */ + if (PageTail(page)) + return false; + /* + * The refcount of a hwpoisoned hugepage is 1, but it is not active, + * so we should return false for it. + */ + if (unlikely(PageHWPoison(page))) + return false; + return page_count(page) > 0; +} diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c new file mode 100644 index 000000000000..595d7fd795e1 --- /dev/null +++ b/mm/hugetlb_cgroup.c @@ -0,0 +1,408 @@ +/* + * + * Copyright IBM Corporation, 2012 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> +#include <linux/hugetlb_cgroup.h> + +struct hugetlb_cgroup { + struct cgroup_subsys_state css; + /* + * the counter to account for hugepages from hugetlb.
+ */ + struct res_counter hugepage[HUGE_MAX_HSTATE]; +}; + +#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) +#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) +#define MEMFILE_ATTR(val) ((val) & 0xffff) + +static struct hugetlb_cgroup *root_h_cgroup __read_mostly; + +static inline +struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) +{ + return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; +} + +static inline +struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) +{ + return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); +} + +static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) +{ + return (h_cg == root_h_cgroup); +} + +static inline struct hugetlb_cgroup * +parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) +{ + return hugetlb_cgroup_from_css(css_parent(&h_cg->css)); +} + +static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) +{ + int idx; + + for (idx = 0; idx < hugetlb_max_hstate; idx++) { + if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) + return true; + } + return false; +} + +static struct cgroup_subsys_state * +hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); + struct hugetlb_cgroup *h_cgroup; + int idx; + + h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); + if (!h_cgroup) + return ERR_PTR(-ENOMEM); + + if (parent_h_cgroup) { + for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) + res_counter_init(&h_cgroup->hugepage[idx], + &parent_h_cgroup->hugepage[idx]); + } else { + root_h_cgroup = h_cgroup; + for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) + res_counter_init(&h_cgroup->hugepage[idx], NULL); + } + return &h_cgroup->css; +} + +static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) +{ + struct hugetlb_cgroup *h_cgroup; + + h_cgroup = hugetlb_cgroup_from_css(css); + kfree(h_cgroup); +} + + +/* + * Should be called with hugetlb_lock held. + * Since we are holding hugetlb_lock, pages cannot get moved from + * the active list or uncharged from the cgroup, so there is no need to get + * a page reference and test for page active here. This function + * cannot fail. + */ +static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, + struct page *page) +{ + int csize; + struct res_counter *counter; + struct res_counter *fail_res; + struct hugetlb_cgroup *page_hcg; + struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); + + page_hcg = hugetlb_cgroup_from_page(page); + /* + * We can have pages on the active list that belong to no cgroup, + * i.e. hugepages with fewer than 3 pages. We can safely + * ignore those pages. + */ + if (!page_hcg || page_hcg != h_cg) + goto out; + + csize = PAGE_SIZE << compound_order(page); + if (!parent) { + parent = root_h_cgroup; + /* root has no limit */ + res_counter_charge_nofail(&parent->hugepage[idx], + csize, &fail_res); + } + counter = &h_cg->hugepage[idx]; + res_counter_uncharge_until(counter, counter->parent, csize); + + set_hugetlb_cgroup(page, parent); +out: + return; +} + +/* + * Force the hugetlb cgroup to empty the hugetlb resources by moving them to + * the parent cgroup.
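The MEMFILE_* macros above pack an hstate index and a resource attribute into a single integer private field: index in the high 16 bits, attribute in the low 16. A standalone round-trip of that encoding (plain integers stand in for the RES_* enum values):

#include <stdio.h>

#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

int main(void)
{
	int priv = MEMFILE_PRIVATE(1, 2);	/* hstate index 1, attribute 2 */

	/* prints: packed=0x10002 idx=1 attr=2 */
	printf("packed=%#x idx=%d attr=%d\n",
	       priv, MEMFILE_IDX(priv), MEMFILE_ATTR(priv));
	return 0;
}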
+ */ +static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hstate *h; + struct page *page; + int idx = 0; + + do { + for_each_hstate(h) { + spin_lock(&hugetlb_lock); + list_for_each_entry(page, &h->hugepage_activelist, lru) + hugetlb_cgroup_move_parent(idx, h_cg, page); + + spin_unlock(&hugetlb_lock); + idx++; + } + cond_resched(); + } while (hugetlb_cgroup_have_usage(h_cg)); +} + +int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr) +{ + int ret = 0; + struct res_counter *fail_res; + struct hugetlb_cgroup *h_cg = NULL; + unsigned long csize = nr_pages * PAGE_SIZE; + + if (hugetlb_cgroup_disabled()) + goto done; + /* + * We don't charge any cgroup if the compound page have less + * than 3 pages. + */ + if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) + goto done; +again: + rcu_read_lock(); + h_cg = hugetlb_cgroup_from_task(current); + if (!css_tryget(&h_cg->css)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); + css_put(&h_cg->css); +done: + *ptr = h_cg; + return ret; +} + +/* Should be called with hugetlb_lock held */ +void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page) +{ + if (hugetlb_cgroup_disabled() || !h_cg) + return; + + set_hugetlb_cgroup(page, h_cg); + return; +} + +/* + * Should be called with hugetlb_lock held + */ +void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, + struct page *page) +{ + struct hugetlb_cgroup *h_cg; + unsigned long csize = nr_pages * PAGE_SIZE; + + if (hugetlb_cgroup_disabled()) + return; + VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); + h_cg = hugetlb_cgroup_from_page(page); + if (unlikely(!h_cg)) + return; + set_hugetlb_cgroup(page, NULL); + res_counter_uncharge(&h_cg->hugepage[idx], csize); + return; +} + +void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg) +{ + unsigned long csize = nr_pages * PAGE_SIZE; + + if (hugetlb_cgroup_disabled() || !h_cg) + return; + + if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) + return; + + res_counter_uncharge(&h_cg->hugepage[idx], csize); + return; +} + +static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + int idx, name; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + + idx = MEMFILE_IDX(cft->private); + name = MEMFILE_ATTR(cft->private); + + return res_counter_read_u64(&h_cg->hugepage[idx], name); +} + +static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, + struct cftype *cft, char *buffer) +{ + int idx, name, ret; + unsigned long long val; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + + idx = MEMFILE_IDX(cft->private); + name = MEMFILE_ATTR(cft->private); + + switch (name) { + case RES_LIMIT: + if (hugetlb_cgroup_is_root(h_cg)) { + /* Can't set limit on root */ + ret = -EINVAL; + break; + } + /* This function does all necessary parse...reuse it */ + ret = res_counter_memparse_write_strategy(buffer, &val); + if (ret) + break; + ret = res_counter_set_limit(&h_cg->hugepage[idx], val); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, + unsigned int event) +{ + int idx, name, ret = 0; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + + idx = MEMFILE_IDX(event); + name 
= MEMFILE_ATTR(event); + + switch (name) { + case RES_MAX_USAGE: + res_counter_reset_max(&h_cg->hugepage[idx]); + break; + case RES_FAILCNT: + res_counter_reset_failcnt(&h_cg->hugepage[idx]); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static char *mem_fmt(char *buf, int size, unsigned long hsize) +{ + if (hsize >= (1UL << 30)) + snprintf(buf, size, "%luGB", hsize >> 30); + else if (hsize >= (1UL << 20)) + snprintf(buf, size, "%luMB", hsize >> 20); + else + snprintf(buf, size, "%luKB", hsize >> 10); + return buf; +} + +static void __init __hugetlb_cgroup_file_init(int idx) +{ + char buf[32]; + struct cftype *cft; + struct hstate *h = &hstates[idx]; + + /* format the size */ + mem_fmt(buf, 32, huge_page_size(h)); + + /* Add the limit file */ + cft = &h->cgroup_files[0]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); + cft->read_u64 = hugetlb_cgroup_read_u64; + cft->write_string = hugetlb_cgroup_write; + + /* Add the usage file */ + cft = &h->cgroup_files[1]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the MAX usage file */ + cft = &h->cgroup_files[2]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); + cft->trigger = hugetlb_cgroup_reset; + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the failcntfile */ + cft = &h->cgroup_files[3]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); + cft->trigger = hugetlb_cgroup_reset; + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files[4]; + memset(cft, 0, sizeof(*cft)); + + WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files)); + + return; +} + +void __init hugetlb_cgroup_file_init(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. This is because we use + * page[2].lru.next for storing cgroup details. 
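mem_fmt() above turns a huge page size into the human-readable token used in the control file names (hugetlb.2MB.limit_in_bytes, hugetlb.1GB.usage_in_bytes, and so on). The same formatting rule as a standalone check:

#include <stdio.h>

/* identical size-bucket rule to the kernel's mem_fmt() above */
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= (1UL << 30))
		snprintf(buf, size, "%luGB", hsize >> 30);
	else if (hsize >= (1UL << 20))
		snprintf(buf, size, "%luMB", hsize >> 20);
	else
		snprintf(buf, size, "%luKB", hsize >> 10);
	return buf;
}

int main(void)
{
	char buf[32];

	printf("%s\n", mem_fmt(buf, sizeof(buf), 2UL << 20));	/* "2MB" */
	printf("%s\n", mem_fmt(buf, sizeof(buf), 1UL << 30));	/* "1GB" */
	return 0;
}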
+ */ + if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) + __hugetlb_cgroup_file_init(hstate_index(h)); + } +} + +/* + * hugetlb_lock will make sure a parallel cgroup rmdir won't happen + * when we migrate hugepages + */ +void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) +{ + struct hugetlb_cgroup *h_cg; + struct hstate *h = page_hstate(oldhpage); + + if (hugetlb_cgroup_disabled()) + return; + + VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage); + spin_lock(&hugetlb_lock); + h_cg = hugetlb_cgroup_from_page(oldhpage); + set_hugetlb_cgroup(oldhpage, NULL); + + /* move the h_cg details to new cgroup */ + set_hugetlb_cgroup(newhpage, h_cg); + list_move(&newhpage->lru, &h->hugepage_activelist); + spin_unlock(&hugetlb_lock); + return; +} + +struct cgroup_subsys hugetlb_cgrp_subsys = { + .css_alloc = hugetlb_cgroup_css_alloc, + .css_offline = hugetlb_cgroup_css_offline, + .css_free = hugetlb_cgroup_css_free, +}; diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index cc448bb983ba..95487c71cad5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -20,8 +20,6 @@ static int hwpoison_inject(void *data, u64 val) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!hwpoison_filter_enable) - goto inject; if (!pfn_valid(pfn)) return -ENXIO; @@ -33,6 +31,9 @@ static int hwpoison_inject(void *data, u64 val) if (!get_page_unless_zero(hpage)) return 0; + if (!hwpoison_filter_enable) + goto inject; + if (!PageLRU(p) && !PageHuge(p)) shake_page(p, 0); /* @@ -54,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val) return 0; inject: - printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); + pr_info("Injecting memory failure at pfn %#lx\n", pfn); return memory_failure(pfn, 18, MF_COUNT_INCREASED); } @@ -88,12 +89,12 @@ static int pfn_inject_init(void) * hardware status change, hence do not require hardware support. * They are mainly for testing hwpoison in software level. */ - dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir, NULL, &hwpoison_fops); if (!dentry) goto fail; - dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, + dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir, NULL, &unpoison_fops); if (!dentry) goto fail; @@ -123,7 +124,7 @@ static int pfn_inject_init(void) if (!dentry) goto fail; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, hwpoison_dir, &hwpoison_filter_memcg); if (!dentry) diff --git a/mm/internal.h b/mm/internal.h index 2189af491783..07b67361a40a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,7 @@ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H +#include <linux/fs.h> #include <linux/mm.h> void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, @@ -21,22 +22,31 @@ static inline void set_page_count(struct page *page, int v) atomic_set(&page->_count, v); } +extern int __do_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size); + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static inline unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + return __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); +} + /* * Turn a non-refcounted page (->_count == 0) into refcounted with * a count of one. 
*/ static inline void set_page_refcounted(struct page *page) { - VM_BUG_ON(PageTail(page)); - VM_BUG_ON(atomic_read(&page->_count)); + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(atomic_read(&page->_count), page); set_page_count(page, 1); } -static inline void __put_page(struct page *page) -{ - atomic_dec(&page->_count); -} - static inline void __get_page_tail_foll(struct page *page, bool get_page_head) { @@ -51,12 +61,10 @@ static inline void __get_page_tail_foll(struct page *page, * speculative page access (like in * page_cache_get_speculative()) on tail pages. */ - VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); if (get_page_head) atomic_inc(&page->first_page->_count); - atomic_inc(&page->_mapcount); + get_huge_page_tail(page); } /* @@ -78,7 +86,7 @@ static inline void get_page_foll(struct page *page) * Getting a normal page or the head of a compound page * requires to already have an elevated page->_count. */ - VM_BUG_ON(atomic_read(&page->_count) <= 0); + VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); atomic_inc(&page->_count); } } @@ -90,6 +98,12 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); +extern bool zone_reclaimable(struct zone *zone); + +/* + * in mm/rmap.c: + */ +extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c @@ -99,12 +113,55 @@ extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); #endif +extern int user_min_free_kbytes; +#if defined CONFIG_COMPACTION || defined CONFIG_CMA /* - * function for dealing with page's order in buddy system. - * zone->lock is already acquired when we use these. - * So, we don't need atomic page->flags operations here. + * in mm/compaction.c + */ +/* + * compact_control is used to track pages being migrated and the free pages + * they are being migrated to during memory compaction. The free_pfn starts + * at the end of a zone and migrate_pfn begins at the start. 
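The compact_control comment here describes two scanners converging from opposite ends of a zone; the text continues below with the completion condition (free_pfn <= migrate_pfn). A toy model of that convergence, with an arbitrary block size standing in for a pageblock:

#include <stdio.h>

int main(void)
{
	unsigned long migrate_pfn = 0;		/* scans up from the zone start */
	unsigned long free_pfn = 1024;		/* scans down from the zone end */
	unsigned long block = 32;		/* stand-in for a pageblock */

	while (free_pfn > migrate_pfn) {
		migrate_pfn += block;	/* migrate scanner advances upward */
		free_pfn -= block;	/* free scanner advances downward */
	}
	printf("compaction run complete near pfn %lu\n", migrate_pfn);
	return 0;
}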
Movable pages + * are moved to the end of a zone during a compaction run and the run + * completes when free_pfn <= migrate_pfn + */ +struct compact_control { + struct list_head freepages; /* List of free pages to migrate to */ + struct list_head migratepages; /* List of pages being migrated */ + unsigned long nr_freepages; /* Number of isolated free pages */ + unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ + bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool finished_update_free; /* True when the zone cached pfns are + * no longer being updated + */ + bool finished_update_migrate; + + int order; /* order a direct compactor needs */ + int migratetype; /* MOVABLE, RECLAIMABLE etc */ + struct zone *zone; + bool contended; /* True if a lock was contended */ +}; + +unsigned long +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn); +unsigned long +isolate_migratepages_range(struct zone *zone, struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn, bool unevictable); + +#endif + +/* + * This function returns the order of a free page in the buddy system. In + * general, page_zone(page)->lock must be held by the caller to prevent the + * page from being allocated in parallel and returning garbage as the order. + * If a caller does not hold page_zone(page)->lock, it must guarantee that the + * page cannot be allocated or merged in parallel. */ static inline unsigned long page_order(struct page *page) { @@ -117,8 +174,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); #ifdef CONFIG_MMU -extern long mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); +extern long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); static inline void munlock_vma_pages_all(struct vm_area_struct *vma) @@ -127,19 +184,20 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } /* - * Called only in fault path via page_evictable() for a new page - * to determine if it's being mapped into a LOCKED vma. - * If so, mark page as mlocked. + * Called only in fault path, to determine if a new page is being + * mapped into a LOCKED vma. If it is, mark page as mlocked. */ -static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) +static inline int mlocked_vma_newpage(struct vm_area_struct *vma, + struct page *page) { - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) return 0; if (!TestSetPageMlocked(page)) { - inc_zone_page_state(page, NR_MLOCK); + mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); } return 1; @@ -149,7 +207,7 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) * must be called with vma's mmap_sem held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); -extern void munlock_vma_page(struct page *page); +extern unsigned int munlock_vma_page(struct page *page); /* * Clear the page's PageMlocked(). 
This can be useful in a situation where @@ -160,12 +218,7 @@ extern void munlock_vma_page(struct page *page); * If called for a page that is still mapped by mlocked vmas, all we do * is revert to lazy LRU behaviour -- semantics are not broken. */ -extern void __clear_page_mlock(struct page *page); -static inline void clear_page_mlock(struct page *page) -{ - if (unlikely(TestClearPageMlocked(page))) - __clear_page_mlock(page); -} +extern void clear_page_mlock(struct page *page); /* * mlock_migrate_page - called only from migrate_page_copy() to @@ -175,21 +228,24 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { unsigned long flags; + int nr_pages = hpage_nr_pages(page); local_irq_save(flags); - __dec_zone_page_state(page, NR_MLOCK); + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); SetPageMlocked(newpage); - __inc_zone_page_state(newpage, NR_MLOCK); + __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); local_irq_restore(flags); } } +extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); #endif #else /* !CONFIG_MMU */ -static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) +static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) { return 0; } @@ -299,7 +355,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_SOME 0 #define ZONE_RECLAIM_SUCCESS 1 -#endif extern int hwpoison_filter(struct page *p); @@ -309,3 +364,27 @@ extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; + +extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); + +extern void set_pageblock_order(void); +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list); +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#define ALLOC_FAIR 0x100 /* fair zone allocation */ + +#endif /* __MM_INTERNAL_H */ diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 000000000000..4a5822a586e6 --- /dev/null +++ b/mm/interval_tree.c @@ -0,0 +1,112 @@ +/* + * mm/interval_tree.c - interval tree for mapping->i_mmap + * + * Copyright (C) 2012, Michel Lespinasse <walken@google.com> + * + * This file is released under the GPL v2. 
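The interval tree this new file implements keys each VMA by the page-offset range it maps in the file; the vma_start_pgoff()/vma_last_pgoff() helpers that follow compute that inclusive interval. A quick standalone check of the arithmetic, assuming 4K pages (PAGE_SHIFT = 12):

#include <stdio.h>

#define PAGE_SHIFT 12

struct vma { unsigned long vm_start, vm_end, vm_pgoff; };

static unsigned long vma_start_pgoff(const struct vma *v)
{
	return v->vm_pgoff;
}

static unsigned long vma_last_pgoff(const struct vma *v)
{
	/* last page offset mapped, inclusive */
	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

int main(void)
{
	/* 64K mapped at file offset page 10 -> covers pgoffs [10, 25] */
	struct vma v = { 0x700000000000, 0x700000010000, 10 };

	printf("[%lu, %lu]\n", vma_start_pgoff(&v), vma_last_pgoff(&v));
	return 0;
}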
+ */ + +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/rmap.h> +#include <linux/interval_tree_generic.h> + +static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff; +} + +static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; +} + +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, + unsigned long, shared.linear.rb_subtree_last, + vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) + +/* Insert node immediately after prev in the interval tree */ +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root *root) +{ + struct rb_node **link; + struct vm_area_struct *parent; + unsigned long last = vma_last_pgoff(node); + + VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); + + if (!prev->shared.linear.rb.rb_right) { + parent = prev; + link = &prev->shared.linear.rb.rb_right; + } else { + parent = rb_entry(prev->shared.linear.rb.rb_right, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + while (parent->shared.linear.rb.rb_left) { + parent = rb_entry(parent->shared.linear.rb.rb_left, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + } + link = &parent->shared.linear.rb.rb_left; + } + + node->shared.linear.rb_subtree_last = last; + rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&node->shared.linear.rb, root, + &vma_interval_tree_augment); +} + +static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) +{ + return vma_start_pgoff(avc->vma); +} + +static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) +{ + return vma_last_pgoff(avc->vma); +} + +INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, + avc_start_pgoff, avc_last_pgoff, + static inline, __anon_vma_interval_tree) + +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root *root) +{ +#ifdef CONFIG_DEBUG_VM_RB + node->cached_vma_start = avc_start_pgoff(node); + node->cached_vma_last = avc_last_pgoff(node); +#endif + __anon_vma_interval_tree_insert(node, root); +} + +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root *root) +{ + __anon_vma_interval_tree_remove(node, root); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root *root, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_first(root, first, last); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_next(node, first, last); +} + +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node) +{ + WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); + WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); +} +#endif diff --git a/mm/iov_iter.c b/mm/iov_iter.c new file mode 100644 index 000000000000..10e46cd721de --- /dev/null +++ b/mm/iov_iter.c @@ -0,0 +1,224 @@ +#include <linux/export.h> +#include <linux/uio.h> +#include <linux/pagemap.h> + +size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + 
char __user *buf; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_writeable(buf, copy)) { + kaddr = kmap_atomic(page); + from = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = from - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + from = kaddr + offset; + left = __copy_to_user(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + kunmap(page); +done: + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} +EXPORT_SYMBOL(copy_page_to_iter); + +static size_t __iovec_copy_from_user_inatomic(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) +{ + size_t copied = 0, left = 0; + + while (bytes) { + char __user *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + + base = 0; + left = __copy_from_user_inatomic(vaddr, buf, copy); + copied += copy; + bytes -= copy; + vaddr += copy; + iov++; + + if (unlikely(left)) + break; + } + return copied - left; +} + +/* + * Copy as much as we can into the page and return the number of bytes which + * were successfully copied. If a fault is encountered then return the number of + * bytes which were copied. + */ +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap_atomic(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr); + + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +/* + * This has the same sideeffects and return value as + * iov_iter_copy_from_user_atomic(). + * The difference is that it attempts to resolve faults. + * Page must not be locked. 
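The multi-segment loops above all follow the same shape: copy min(bytes, remaining-in-segment), then move to the next iovec with the intra-segment offset reset to zero. A userspace sketch of that walk; memcpy stands in for the fault-tolerant __copy_from_user variants, and iovec_gather() is a hypothetical name:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

static size_t iovec_gather(char *dst, const struct iovec *iov,
			   size_t base, size_t bytes)
{
	size_t copied = 0;

	while (bytes) {
		size_t copy = bytes < iov->iov_len - base ?
			      bytes : iov->iov_len - base;

		memcpy(dst + copied, (char *)iov->iov_base + base, copy);
		base = 0;	/* only the first segment starts at an offset */
		copied += copy;
		bytes -= copy;
		iov++;
	}
	return copied;
}

int main(void)
{
	char a[] = "hello ", b[] = "world", out[16] = "";
	struct iovec iov[2] = { { a, 6 }, { b, 5 } };

	iovec_gather(out, iov, 0, 11);
	printf("%s\n", out);	/* "hello world" */
	return 0;
}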
+ */ +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user); + +void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + unsigned long nr_segs = i->nr_segs; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !iov->iov_len)) { + int copy; + + copy = min(bytes, iov->iov_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + i->nr_segs = nr_segs; + } +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(const struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 45eb6217bf38..8d2fcdfeff7f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -29,7 +29,7 @@ * - kmemleak_lock (rwlock): protects the object_list modifications and * accesses to the object_tree_root. The object_list is the main list * holding the metadata (struct kmemleak_object) for the allocated memory - * blocks. The object_tree_root is a priority search tree used to look-up + * blocks. The object_tree_root is a red black tree used to look-up * metadata based on a pointer to the corresponding memory block. The * kmemleak_object structures are added to the object_list and * object_tree_root in the create_object() function called from the @@ -71,7 +71,7 @@ #include <linux/delay.h> #include <linux/export.h> #include <linux/kthread.h> -#include <linux/prio_tree.h> +#include <linux/rbtree.h> #include <linux/fs.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -132,7 +132,7 @@ struct kmemleak_scan_area { * Structure holding the metadata for each allocated memory block. * Modifications to such objects should be made while holding the * object->lock. 
Insertions or deletions from object_list, gray_list or - * tree_node are already protected by the corresponding locks or mutex (see + * rb_node are already protected by the corresponding locks or mutex (see * the notes on locking above). These objects are reference-counted * (use_count) and freed using the RCU mechanism. */ @@ -141,7 +141,7 @@ struct kmemleak_object { unsigned long flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; - struct prio_tree_node tree_node; + struct rb_node rb_node; struct rcu_head rcu; /* object_list lockless traversal */ /* object usage count; object freed when use_count == 0 */ atomic_t use_count; @@ -182,9 +182,9 @@ struct kmemleak_object { static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ static LIST_HEAD(gray_list); -/* prio search tree for object boundaries */ -static struct prio_tree_root object_tree_root; -/* rw_lock protecting the access to object_list and prio_tree_root */ +/* search tree for object boundaries */ +static struct rb_root object_tree_root = RB_ROOT; +/* rw_lock protecting the access to object_list and object_tree_root */ static DEFINE_RWLOCK(kmemleak_lock); /* allocation caches for kmemleak internal data */ @@ -192,15 +192,15 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static atomic_t kmemleak_enabled = ATOMIC_INIT(0); +static int kmemleak_enabled; /* set in the late_initcall if there were no errors */ -static atomic_t kmemleak_initialized = ATOMIC_INIT(0); +static int kmemleak_initialized; /* enables or disables early logging of the memory operations */ -static atomic_t kmemleak_early_log = ATOMIC_INIT(1); +static int kmemleak_early_log = 1; /* set if a kmemleak warning was issued */ -static atomic_t kmemleak_warning = ATOMIC_INIT(0); +static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ -static atomic_t kmemleak_error = ATOMIC_INIT(0); +static int kmemleak_error; /* minimum and maximum address that may be valid pointers */ static unsigned long min_addr = ULONG_MAX; @@ -218,7 +218,8 @@ static int kmemleak_stack_scan = 1; static DEFINE_MUTEX(scan_mutex); /* setting kmemleak=on, will set this var, skipping the disable */ static int kmemleak_skip_disable; - +/* If there are leaks that can be reported */ +static bool kmemleak_found_leaks; /* * Early object allocation/freeing logging. Kmemleak is initialized after the @@ -267,7 +268,7 @@ static void kmemleak_disable(void); #define kmemleak_warn(x...) do { \ pr_warning(x); \ dump_stack(); \ - atomic_set(&kmemleak_warning, 1); \ + kmemleak_warning = 1; \ } while (0) /* @@ -380,7 +381,7 @@ static void dump_object_info(struct kmemleak_object *object) trace.entries = object->trace; pr_notice("Object 0x%08lx (size %zu):\n", - object->tree_node.start, object->size); + object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); @@ -392,32 +393,32 @@ static void dump_object_info(struct kmemleak_object *object) } /* - * Look-up a memory block metadata (kmemleak_object) in the priority search + * Look-up a memory block metadata (kmemleak_object) in the object search * tree based on a pointer value. If alias is 0, only values pointing to the * beginning of the memory block are allowed. The kmemleak_lock must be held * when calling this function. 
*/ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { - struct prio_tree_node *node; - struct prio_tree_iter iter; - struct kmemleak_object *object; - - prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); - node = prio_tree_next(&iter); - if (node) { - object = prio_tree_entry(node, struct kmemleak_object, - tree_node); - if (!alias && object->pointer != ptr) { + struct rb_node *rb = object_tree_root.rb_node; + + while (rb) { + struct kmemleak_object *object = + rb_entry(rb, struct kmemleak_object, rb_node); + if (ptr < object->pointer) + rb = object->rb_node.rb_left; + else if (object->pointer + object->size <= ptr) + rb = object->rb_node.rb_right; + else if (object->pointer == ptr || alias) + return object; + else { kmemleak_warn("Found object by alias at 0x%08lx\n", ptr); dump_object_info(object); - object = NULL; + break; } - } else - object = NULL; - - return object; + } + return NULL; } /* @@ -436,7 +437,7 @@ static int get_object(struct kmemleak_object *object) */ static void free_object_rcu(struct rcu_head *rcu) { - struct hlist_node *elem, *tmp; + struct hlist_node *tmp; struct kmemleak_scan_area *area; struct kmemleak_object *object = container_of(rcu, struct kmemleak_object, rcu); @@ -445,8 +446,8 @@ static void free_object_rcu(struct rcu_head *rcu) * Once use_count is 0 (guaranteed by put_object), there is no other * code accessing this object, hence no need for locking. */ - hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { - hlist_del(elem); + hlist_for_each_entry_safe(area, tmp, &object->area_list, node) { + hlist_del(&area->node); kmem_cache_free(scan_area_cache, area); } kmem_cache_free(object_cache, object); @@ -471,7 +472,7 @@ static void put_object(struct kmemleak_object *object) } /* - * Look up an object in the prio search tree and increase its use_count. + * Look up an object in the object search tree and increase its use_count. */ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) { @@ -516,8 +517,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, int min_count, gfp_t gfp) { unsigned long flags; - struct kmemleak_object *object; - struct prio_tree_node *node; + struct kmemleak_object *object, *parent; + struct rb_node **link, *rb_parent; object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { @@ -560,31 +561,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, /* kernel backtrace */ object->trace_len = __save_stack_trace(object->trace); - INIT_PRIO_TREE_NODE(&object->tree_node); - object->tree_node.start = ptr; - object->tree_node.last = ptr + size - 1; - write_lock_irqsave(&kmemleak_lock, flags); min_addr = min(min_addr, ptr); max_addr = max(max_addr, ptr + size); - node = prio_tree_insert(&object_tree_root, &object->tree_node); - /* - * The code calling the kernel does not yet have the pointer to the - * memory block to be able to free it. However, we still hold the - * kmemleak_lock here in case parts of the kernel started freeing - * random memory blocks. 
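The new lookup_object() above orders objects by their [pointer, pointer + size) range, so a pointer anywhere inside a tracked block finds that block (an "alias" hit when it is not the start address). A userspace sketch of the same comparison, with a sorted array standing in for the rb-tree:

#include <stdio.h>
#include <stddef.h>

struct obj { unsigned long pointer; size_t size; };

static const struct obj *lookup(const struct obj *objs, int n, unsigned long ptr)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = (lo + hi) / 2;

		if (ptr < objs[mid].pointer)
			hi = mid - 1;				/* go left */
		else if (objs[mid].pointer + objs[mid].size <= ptr)
			lo = mid + 1;				/* go right */
		else
			return &objs[mid];			/* ptr inside block */
	}
	return NULL;
}

int main(void)
{
	static const struct obj objs[] = {
		{ 0x1000, 0x100 }, { 0x2000, 0x40 }, { 0x5000, 0x1000 },
	};
	const struct obj *o = lookup(objs, 3, 0x2010);	/* alias into 2nd block */

	if (o)
		printf("found block at %#lx (alias=%d)\n",
		       o->pointer, o->pointer != 0x2010);
	return 0;
}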
- */ - if (node != &object->tree_node) { - kmemleak_stop("Cannot insert 0x%lx into the object search tree " - "(already existing)\n", ptr); - object = lookup_object(ptr, 1); - spin_lock(&object->lock); - dump_object_info(object); - spin_unlock(&object->lock); - - goto out; + link = &object_tree_root.rb_node; + rb_parent = NULL; + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); + if (ptr + size <= parent->pointer) + link = &parent->rb_node.rb_left; + else if (parent->pointer + parent->size <= ptr) + link = &parent->rb_node.rb_right; + else { + kmemleak_stop("Cannot insert 0x%lx into the object " + "search tree (overlaps existing)\n", + ptr); + kmem_cache_free(object_cache, object); + object = parent; + spin_lock(&object->lock); + dump_object_info(object); + spin_unlock(&object->lock); + goto out; + } } + rb_link_node(&object->rb_node, rb_parent, link); + rb_insert_color(&object->rb_node, &object_tree_root); + list_add_tail_rcu(&object->object_list, &object_list); out: write_unlock_irqrestore(&kmemleak_lock, flags); @@ -600,7 +604,7 @@ static void __delete_object(struct kmemleak_object *object) unsigned long flags; write_lock_irqsave(&kmemleak_lock, flags); - prio_tree_remove(&object_tree_root, &object->tree_node); + rb_erase(&object->rb_node, &object_tree_root); list_del_rcu(&object->object_list); write_unlock_irqrestore(&kmemleak_lock, flags); @@ -750,7 +754,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) } spin_lock_irqsave(&object->lock, flags); - if (ptr + size > object->pointer + object->size) { + if (size == SIZE_MAX) { + size = object->pointer + object->size - ptr; + } else if (ptr + size > object->pointer + object->size) { kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); @@ -800,7 +806,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size, unsigned long flags; struct early_log *log; - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { /* kmemleak stopped recording, just count the requests */ crt_early_log++; return; @@ -835,7 +841,7 @@ static void early_alloc(struct early_log *log) unsigned long flags; int i; - if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) + if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) return; /* @@ -888,9 +894,9 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, { pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -914,11 +920,11 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) * Percpu allocations are only scanned and not reported as leaks * (min_count is set to 0). 
*/ - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) create_object((unsigned long)per_cpu_ptr(ptr, cpu), size, 0, GFP_KERNEL); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -934,9 +940,9 @@ void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -954,9 +960,9 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -974,11 +980,11 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu)); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); @@ -994,9 +1000,9 @@ void __ref kmemleak_not_leak(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -1014,9 +1020,9 @@ void __ref kmemleak_ignore(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); @@ -1036,9 +1042,9 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -1056,9 +1062,9 @@ void __ref kmemleak_no_scan(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); @@ -1083,7 +1089,7 @@ static bool update_checksum(struct kmemleak_object *object) */ static int scan_should_stop(void) { - if (!atomic_read(&kmemleak_enabled)) + if (!kmemleak_enabled) 
return 1; /* @@ -1174,7 +1180,6 @@ static void scan_block(void *_start, void *_end, static void scan_object(struct kmemleak_object *object) { struct kmemleak_scan_area *area; - struct hlist_node *elem; unsigned long flags; /* @@ -1202,7 +1207,7 @@ static void scan_object(struct kmemleak_object *object) spin_lock_irqsave(&object->lock, flags); } } else - hlist_for_each_entry(area, elem, &object->area_list, node) + hlist_for_each_entry(area, &object->area_list, node) scan_block((void *)area->start, (void *)(area->start + area->size), object, 0); @@ -1297,9 +1302,8 @@ static void kmemleak_scan(void) */ lock_memory_hotplug(); for_each_online_node(i) { - pg_data_t *pgdat = NODE_DATA(i); - unsigned long start_pfn = pgdat->node_start_pfn; - unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; + unsigned long start_pfn = node_start_pfn(i); + unsigned long end_pfn = node_end_pfn(i); unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++) { @@ -1379,9 +1383,12 @@ static void kmemleak_scan(void) } rcu_read_unlock(); - if (new_leaks) + if (new_leaks) { + kmemleak_found_leaks = true; + pr_info("%d new suspected memory leaks (see " "/sys/kernel/debug/kmemleak)\n", new_leaks); + } } @@ -1483,13 +1490,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct kmemleak_object *prev_obj = v; struct kmemleak_object *next_obj = NULL; - struct list_head *n = &prev_obj->object_list; + struct kmemleak_object *obj = prev_obj; ++(*pos); - list_for_each_continue_rcu(n, &object_list) { - struct kmemleak_object *obj = - list_entry(n, struct kmemleak_object, object_list); + list_for_each_entry_continue_rcu(obj, &object_list, object_list) { if (get_object(obj)) { next_obj = obj; break; @@ -1544,18 +1549,14 @@ static int kmemleak_open(struct inode *inode, struct file *file) return seq_open(file, &kmemleak_seq_ops); } -static int kmemleak_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - static int dump_str_object_info(const char *str) { unsigned long flags; struct kmemleak_object *object; unsigned long addr; - addr= simple_strtoul(str, NULL, 0); + if (kstrtoul(str, 0, &addr)) + return -EINVAL; object = find_and_get_object(addr, 0); if (!object) { pr_info("Unknown object at 0x%08lx\n", addr); @@ -1590,8 +1591,12 @@ static void kmemleak_clear(void) spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); + + kmemleak_found_leaks = false; } +static void __kmemleak_do_cleanup(void); + /* * File write operation to configure kmemleak at run-time. The following * commands can be written to the /sys/kernel/debug/kmemleak file: @@ -1604,7 +1609,8 @@ static void kmemleak_clear(void) * disable it) * scan - trigger a memory scan * clear - mark all current reported unreferenced kmemleak objects as - * grey to ignore printing them + * grey to ignore printing them, or free all kmemleak objects + * if kmemleak has been disabled. * dump=... 
- dump information about the object found at the given address */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, @@ -1614,9 +1620,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, int buf_size; int ret; - if (!atomic_read(&kmemleak_enabled)) - return -EBUSY; - buf_size = min(size, (sizeof(buf) - 1)); if (strncpy_from_user(buf, user_buf, buf_size) < 0) return -EFAULT; @@ -1626,6 +1629,19 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, if (ret < 0) return ret; + if (strncmp(buf, "clear", 5) == 0) { + if (kmemleak_enabled) + kmemleak_clear(); + else + __kmemleak_do_cleanup(); + goto out; + } + + if (!kmemleak_enabled) { + ret = -EBUSY; + goto out; + } + if (strncmp(buf, "off", 3) == 0) kmemleak_disable(); else if (strncmp(buf, "stack=on", 8) == 0) @@ -1639,7 +1655,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, else if (strncmp(buf, "scan=", 5) == 0) { unsigned long secs; - ret = strict_strtoul(buf + 5, 0, &secs); + ret = kstrtoul(buf + 5, 0, &secs); if (ret < 0) goto out; stop_scan_thread(); @@ -1649,8 +1665,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, } } else if (strncmp(buf, "scan", 4) == 0) kmemleak_scan(); - else if (strncmp(buf, "clear", 5) == 0) - kmemleak_clear(); else if (strncmp(buf, "dump=", 5) == 0) ret = dump_str_object_info(buf + 5); else @@ -1672,9 +1686,19 @@ static const struct file_operations kmemleak_fops = { .read = seq_read, .write = kmemleak_write, .llseek = seq_lseek, - .release = kmemleak_release, + .release = seq_release, }; +static void __kmemleak_do_cleanup(void) +{ + struct kmemleak_object *object; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) + delete_object_full(object->pointer); + rcu_read_unlock(); +} + /* * Stop the memory scanning thread and free the kmemleak internal objects if * no previous scan thread (otherwise, kmemleak may still have some useful @@ -1682,18 +1706,14 @@ static const struct file_operations kmemleak_fops = { */ static void kmemleak_do_cleanup(struct work_struct *work) { - struct kmemleak_object *object; - bool cleanup = scan_thread == NULL; - mutex_lock(&scan_mutex); stop_scan_thread(); - if (cleanup) { - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) - delete_object_full(object->pointer); - rcu_read_unlock(); - } + if (!kmemleak_found_leaks) + __kmemleak_do_cleanup(); + else + pr_info("Kmemleak disabled without freeing internal data. 
" + "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); mutex_unlock(&scan_mutex); } @@ -1706,14 +1726,14 @@ static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); static void kmemleak_disable(void) { /* atomically check whether it was already invoked */ - if (atomic_cmpxchg(&kmemleak_error, 0, 1)) + if (cmpxchg(&kmemleak_error, 0, 1)) return; /* stop any memory operation tracing */ - atomic_set(&kmemleak_enabled, 0); + kmemleak_enabled = 0; /* check whether it is too early for a kernel thread */ - if (atomic_read(&kmemleak_initialized)) + if (kmemleak_initialized) schedule_work(&cleanup_work); pr_info("Kernel memory leak detector disabled\n"); @@ -1757,7 +1777,7 @@ void __init kmemleak_init(void) #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { - atomic_set(&kmemleak_early_log, 0); + kmemleak_early_log = 0; kmemleak_disable(); return; } @@ -1768,7 +1788,6 @@ void __init kmemleak_init(void) object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); - INIT_PRIO_TREE_ROOT(&object_tree_root); if (crt_early_log >= ARRAY_SIZE(early_log)) pr_warning("Early log buffer exceeded (%d), please increase " @@ -1776,12 +1795,12 @@ void __init kmemleak_init(void) /* the kernel is still in UP mode, so disabling the IRQs is enough */ local_irq_save(flags); - atomic_set(&kmemleak_early_log, 0); - if (atomic_read(&kmemleak_error)) { + kmemleak_early_log = 0; + if (kmemleak_error) { local_irq_restore(flags); return; } else - atomic_set(&kmemleak_enabled, 1); + kmemleak_enabled = 1; local_irq_restore(flags); /* @@ -1825,9 +1844,9 @@ void __init kmemleak_init(void) log->op_type); } - if (atomic_read(&kmemleak_warning)) { + if (kmemleak_warning) { print_log_trace(log); - atomic_set(&kmemleak_warning, 0); + kmemleak_warning = 0; } } } @@ -1839,9 +1858,9 @@ static int __init kmemleak_late_init(void) { struct dentry *dentry; - atomic_set(&kmemleak_initialized, 1); + kmemleak_initialized = 1; - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { /* * Some error occurred and kmemleak was disabled. There is a * small chance that kmemleak_disable() was called immediately @@ -33,13 +33,22 @@ #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> -#include <linux/hash.h> +#include <linux/hashtable.h> #include <linux/freezer.h> #include <linux/oom.h> +#include <linux/numa.h> #include <asm/tlbflush.h> #include "internal.h" +#ifdef CONFIG_NUMA +#define NUMA(x) (x) +#define DO_NUMA(x) do { (x); } while (0) +#else +#define NUMA(x) (0) +#define DO_NUMA(x) do { } while (0) +#endif + /* * A few notes about the KSM scanning process, * to make it easier to understand the data structures below: @@ -78,6 +87,9 @@ * take 10 attempts to find a page in the unstable tree, once it is found, * it is secured in the stable tree. (When we scan a new page, we first * compare it against the stable tree, and then against the unstable tree.) + * + * If the merge_across_nodes tunable is unset, then KSM maintains multiple + * stable trees and multiple unstable trees: one of each for each NUMA node. 
*/ /** @@ -113,19 +125,32 @@ struct ksm_scan { /** * struct stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree + * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list + * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page - * @kpfn: page frame number of this ksm page + * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) + * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { - struct rb_node node; + union { + struct rb_node node; /* when node of stable tree */ + struct { /* when listed for migration */ + struct list_head *head; + struct list_head list; + }; + }; struct hlist_head hlist; unsigned long kpfn; +#ifdef CONFIG_NUMA + int nid; +#endif }; /** * struct rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree + * @nid: NUMA node id of unstable tree in which linked (may not match page) * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address @@ -135,7 +160,12 @@ struct stable_node { */ struct rmap_item { struct rmap_item *rmap_list; - struct anon_vma *anon_vma; /* when stable */ + union { + struct anon_vma *anon_vma; /* when stable */ +#ifdef CONFIG_NUMA + int nid; /* when node of unstable tree */ +#endif + }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ @@ -153,12 +183,16 @@ struct rmap_item { #define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ -static struct rb_root root_stable_tree = RB_ROOT; -static struct rb_root root_unstable_tree = RB_ROOT; +static struct rb_root one_stable_tree[1] = { RB_ROOT }; +static struct rb_root one_unstable_tree[1] = { RB_ROOT }; +static struct rb_root *root_stable_tree = one_stable_tree; +static struct rb_root *root_unstable_tree = one_unstable_tree; + +/* Recently migrated nodes of stable tree, pending proper placement */ +static LIST_HEAD(migrate_nodes); -#define MM_SLOTS_HASH_SHIFT 10 -#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) -static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; +#define MM_SLOTS_HASH_BITS 10 +static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct mm_slot ksm_mm_head = { .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), @@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; +#ifdef CONFIG_NUMA +/* Zeroed when merging across nodes is not allowed */ +static unsigned int ksm_merge_across_nodes = 1; +static int ksm_nr_node_ids = 1; +#else +#define ksm_merge_across_nodes 1U +#define ksm_nr_node_ids 1 +#endif + #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 -static unsigned int ksm_run = KSM_RUN_STOP; +#define KSM_RUN_OFFLINE 4 +static unsigned long ksm_run = KSM_RUN_STOP; +static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DEFINE_MUTEX(ksm_thread_mutex); @@ -275,31 +320,20 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) static struct mm_slot *get_mm_slot(struct 
mm_struct *mm) { - struct mm_slot *mm_slot; - struct hlist_head *bucket; - struct hlist_node *node; + struct mm_slot *slot; + + hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) + if (slot->mm == mm) + return slot; - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; - hlist_for_each_entry(mm_slot, node, bucket, link) { - if (mm == mm_slot->mm) - return mm_slot; - } return NULL; } static void insert_to_mm_slots_hash(struct mm_struct *mm, struct mm_slot *mm_slot) { - struct hlist_head *bucket; - - bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; mm_slot->mm = mm; - hlist_add_head(&mm_slot->link, bucket); -} - -static inline int in_stable_tree(struct rmap_item *rmap_item) -{ - return rmap_item->address & STABLE_FLAG; + hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); } /* @@ -333,7 +367,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); - page = follow_page(vma, addr, FOLL_GET); + page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) @@ -410,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item) static struct page *page_trans_compound_anon(struct page *page) { if (PageTransCompound(page)) { - struct page *head = compound_trans_head(page); + struct page *head = compound_head(page); /* * head may actually be splitted and freed from under * us but it's ok here. @@ -447,12 +481,22 @@ out: page = NULL; return page; } +/* + * This helper is used for getting right index into array of tree roots. + * When merge_across_nodes knob is set to 1, there are only two rb-trees for + * stable and unstable pages from all nodes with roots in index 0. Otherwise, + * every node has its own stable and unstable tree. + */ +static inline int get_kpfn_nid(unsigned long kpfn) +{ + return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); +} + static void remove_node_from_stable_tree(struct stable_node *stable_node) { struct rmap_item *rmap_item; - struct hlist_node *hlist; - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) ksm_pages_sharing--; else @@ -462,7 +506,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) cond_resched(); } - rb_erase(&stable_node->node, &root_stable_tree); + if (stable_node->head == &migrate_nodes) + list_del(&stable_node->list); + else + rb_erase(&stable_node->node, + root_stable_tree + NUMA(stable_node->nid)); free_stable_node(stable_node); } @@ -472,6 +520,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, * remove the stale node from the stable tree and return NULL. + * But beware, the stable node's page might be being migrated. * * You would expect the stable_node to hold a reference to the ksm page. * But if it increments the page's count, swapping out has to wait for @@ -482,40 +531,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * pointing back to this stable node. This relies on freeing a PageAnon * page to reset its page->mapping to NULL, and relies on no other use of * a page to put something that might look like our key in page->mapping. 
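
The mm_slots conversion in the hunk above (and the matching three-argument hlist_for_each_entry() change in the kmemleak hunks earlier) moves to the generic <linux/hashtable.h> API, which hides the bucket arithmetic the old hash_ptr() code did by hand and drops the separate iterator node. A minimal sketch of the same pattern, with a hypothetical slot type standing in for mm_slot:

        #include <linux/hashtable.h>

        #define SLOTS_HASH_BITS 10
        static DEFINE_HASHTABLE(slots_hash, SLOTS_HASH_BITS);  /* 1024 buckets */

        struct slot {
                struct hlist_node link;
                struct mm_struct *mm;
        };

        static struct slot *slot_lookup(struct mm_struct *mm)
        {
                struct slot *s;

                /* walks only the bucket that (unsigned long)mm hashes to */
                hash_for_each_possible(slots_hash, s, link, (unsigned long)mm)
                        if (s->mm == mm)
                                return s;
                return NULL;
        }

        static void slot_insert(struct mm_struct *mm, struct slot *s)
        {
                s->mm = mm;
                hash_add(slots_hash, &s->link, (unsigned long)mm);
        }

Removal pairs with hash_del(&s->link), as the later __ksm_exit() hunk shows.
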
- * - * include/linux/pagemap.h page_cache_get_speculative() is a good reference, - * but this is different - made simpler by ksm_thread_mutex being held, but - * interesting for assuming that no other use of the struct page could ever - * put our expected_mapping into page->mapping (or a field of the union which - * coincides with page->mapping). The RCU calls are not for KSM at all, but - * to keep the page_count protocol described with page_cache_get_speculative. - * - * Note: it is possible that get_ksm_page() will return NULL one moment, - * then page the next, if the page is in between page_freeze_refs() and - * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct stable_node *stable_node) +static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) { struct page *page; void *expected_mapping; + unsigned long kpfn; - page = pfn_to_page(stable_node->kpfn); expected_mapping = (void *)stable_node + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); - rcu_read_lock(); - if (page->mapping != expected_mapping) - goto stale; - if (!get_page_unless_zero(page)) +again: + kpfn = ACCESS_ONCE(stable_node->kpfn); + page = pfn_to_page(kpfn); + + /* + * page is computed from kpfn, so on most architectures reading + * page->mapping is naturally ordered after reading node->kpfn, + * but on Alpha we need to be more careful. + */ + smp_read_barrier_depends(); + if (ACCESS_ONCE(page->mapping) != expected_mapping) goto stale; - if (page->mapping != expected_mapping) { + + /* + * We cannot do anything with the page while its refcount is 0. + * Usually 0 means free, or tail of a higher-order page: in which + * case this node is no longer referenced, and should be freed; + * however, it might mean that the page is under page_freeze_refs(). + * The __remove_mapping() case is easy, again the node is now stale; + * but if page is swapcache in migrate_page_move_mapping(), it might + * still be our page, in which case it's essential to keep the node. + */ + while (!get_page_unless_zero(page)) { + /* + * Another check for page->mapping != expected_mapping would + * work here too. We have chosen the !PageSwapCache test to + * optimize the common case, when the page is or is about to + * be freed: PageSwapCache is cleared (under spin_lock_irq) + * in the freeze_refs section of __remove_mapping(); but Anon + * page->mapping reset to NULL later, in free_pages_prepare(). + */ + if (!PageSwapCache(page)) + goto stale; + cpu_relax(); + } + + if (ACCESS_ONCE(page->mapping) != expected_mapping) { put_page(page); goto stale; } - rcu_read_unlock(); + + if (lock_it) { + lock_page(page); + if (ACCESS_ONCE(page->mapping) != expected_mapping) { + unlock_page(page); + put_page(page); + goto stale; + } + } return page; + stale: - rcu_read_unlock(); + /* + * We come here from above when page->mapping or !PageSwapCache + * suggests that the node is stale; but it might be under migration. + * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), + * before checking whether node->kpfn has been changed. 
+ */ + smp_rmb(); + if (ACCESS_ONCE(stable_node->kpfn) != kpfn) + goto again; remove_node_from_stable_tree(stable_node); return NULL; } @@ -531,11 +617,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node); + page = get_ksm_page(stable_node, true); if (!page) goto out; - lock_page(page); hlist_del(&rmap_item->hlist); unlock_page(page); put_page(page); @@ -560,8 +645,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); BUG_ON(age > 1); if (!age) - rb_erase(&rmap_item->node, &root_unstable_tree); - + rb_erase(&rmap_item->node, + root_unstable_tree + NUMA(rmap_item->nid)); ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } @@ -581,7 +666,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot, } /* - * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather + * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing @@ -614,6 +699,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, /* * Only called through the sysfs control interface: */ +static int remove_stable_node(struct stable_node *stable_node) +{ + struct page *page; + int err; + + page = get_ksm_page(stable_node, true); + if (!page) { + /* + * get_ksm_page did remove_node_from_stable_tree itself. + */ + return 0; + } + + if (WARN_ON_ONCE(page_mapped(page))) { + /* + * This should not happen: but if it does, just refuse to let + * merge_across_nodes be switched - there is no need to panic. + */ + err = -EBUSY; + } else { + /* + * The stable node did not yet appear stale to get_ksm_page(), + * since that allows for an unmapped ksm page to be recognized + * right up until it is freed; but the node is safe to remove. + * This page might be in a pagevec waiting to be freed, + * or it might be PageSwapCache (perhaps under writeback), + * or it might have been removed from swapcache a moment ago. 
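
With the new lock_it argument, get_ksm_page() hands callers a page that is referenced (and, if requested, locked) only after the stale/migration races above have been ruled out; a NULL return means the node was stale and has already been pruned. A minimal caller sketch, shaped like remove_rmap_item_from_tree() below (do_something_with() is a placeholder):

        static void demo_use(struct stable_node *stable_node)
        {
                struct page *page;

                page = get_ksm_page(stable_node, true);
                if (!page)
                        return; /* stale: node already removed from the tree */

                /* page is locked and referenced: stable against migration */
                do_something_with(page);

                unlock_page(page);
                put_page(page);
        }
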
+ */ + set_page_stable_node(page, NULL); + remove_node_from_stable_tree(stable_node); + err = 0; + } + + unlock_page(page); + put_page(page); + return err; +} + +static int remove_all_stable_nodes(void) +{ + struct stable_node *stable_node; + struct list_head *this, *next; + int nid; + int err = 0; + + for (nid = 0; nid < ksm_nr_node_ids; nid++) { + while (root_stable_tree[nid].rb_node) { + stable_node = rb_entry(root_stable_tree[nid].rb_node, + struct stable_node, node); + if (remove_stable_node(stable_node)) { + err = -EBUSY; + break; /* proceed to next nid */ + } + cond_resched(); + } + } + list_for_each_safe(this, next, &migrate_nodes) { + stable_node = list_entry(this, struct stable_node, list); + if (remove_stable_node(stable_node)) + err = -EBUSY; + cond_resched(); + } + return err; +} + static int unmerge_and_remove_all_rmap_items(void) { struct mm_slot *mm_slot; @@ -647,7 +797,7 @@ static int unmerge_and_remove_all_rmap_items(void) ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, struct mm_slot, mm_list); if (ksm_test_exit(mm)) { - hlist_del(&mm_slot->link); + hash_del(&mm_slot->link); list_del(&mm_slot->mm_list); spin_unlock(&ksm_mmlist_lock); @@ -661,6 +811,8 @@ static int unmerge_and_remove_all_rmap_items(void) } } + /* Clean up stable nodes, but don't worry if some are still busy */ + remove_all_stable_nodes(); ksm_scan.seqnr = 0; return 0; @@ -709,15 +861,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, spinlock_t *ptl; int swapped; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) goto out; BUG_ON(PageTransCompound(page)); + + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + goto out_mn; if (pte_write(*ptep) || pte_dirty(*ptep)) { pte_t entry; @@ -752,6 +911,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, out_unlock: pte_unmap_unlock(ptep, ptl); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } @@ -769,35 +930,31 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *ptep; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ addr = page_address_in_vma(page, vma); if (addr == -EFAULT) goto out; - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, addr); + if (!pmd) goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, addr); BUG_ON(pmd_trans_huge(*pmd)); - if (!pmd_present(*pmd)) - goto out; + + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte_same(*ptep, orig_pte)) { pte_unmap_unlock(ptep, ptl); - goto out; + goto out_mn; } get_page(kpage); @@ -814,6 +971,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pte_unmap_unlock(ptep, ptl); err = 0; +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return err; } @@ -939,6 +1098,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, if (err) 
goto out; + /* Unstable nid is in union with stable anon_vma: remove first */ + remove_rmap_item_from_tree(rmap_item); + /* Must get reference to anon_vma while still holding mmap_sem */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); @@ -989,42 +1151,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, */ static struct page *stable_tree_search(struct page *page) { - struct rb_node *node = root_stable_tree.rb_node; + int nid; + struct rb_root *root; + struct rb_node **new; + struct rb_node *parent; struct stable_node *stable_node; + struct stable_node *page_node; - stable_node = page_stable_node(page); - if (stable_node) { /* ksm page forked */ + page_node = page_stable_node(page); + if (page_node && page_node->head != &migrate_nodes) { + /* ksm page forked */ get_page(page); return page; } - while (node) { + nid = get_kpfn_nid(page_to_pfn(page)); + root = root_stable_tree + nid; +again: + new = &root->rb_node; + parent = NULL; + + while (*new) { struct page *tree_page; int ret; cond_resched(); - stable_node = rb_entry(node, struct stable_node, node); - tree_page = get_ksm_page(stable_node); + stable_node = rb_entry(*new, struct stable_node, node); + tree_page = get_ksm_page(stable_node, false); if (!tree_page) return NULL; ret = memcmp_pages(page, tree_page); + put_page(tree_page); - if (ret < 0) { - put_page(tree_page); - node = node->rb_left; - } else if (ret > 0) { - put_page(tree_page); - node = node->rb_right; - } else - return tree_page; + parent = *new; + if (ret < 0) + new = &parent->rb_left; + else if (ret > 0) + new = &parent->rb_right; + else { + /* + * Lock and unlock the stable_node's page (which + * might already have been migrated) so that page + * migration is sure to notice its raised count. + * It would be more elegant to return stable_node + * than kpage, but that involves more changes. + */ + tree_page = get_ksm_page(stable_node, true); + if (tree_page) { + unlock_page(tree_page); + if (get_kpfn_nid(stable_node->kpfn) != + NUMA(stable_node->nid)) { + put_page(tree_page); + goto replace; + } + return tree_page; + } + /* + * There is now a place for page_node, but the tree may + * have been rebalanced, so re-evaluate parent and new. + */ + if (page_node) + goto again; + return NULL; + } } - return NULL; + if (!page_node) + return NULL; + + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_link_node(&page_node->node, parent, new); + rb_insert_color(&page_node->node, root); + get_page(page); + return page; + +replace: + if (page_node) { + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_replace_node(&stable_node->node, &page_node->node, root); + get_page(page); + } else { + rb_erase(&stable_node->node, root); + page = NULL; + } + stable_node->head = &migrate_nodes; + list_add(&stable_node->list, stable_node->head); + return page; } /* - * stable_tree_insert - insert rmap_item pointing to new ksm page + * stable_tree_insert - insert stable tree node pointing to new ksm page * into the stable tree. 
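
stable_tree_search() above now follows the canonical rbtree insert-position walk: descend via a struct rb_node **new cursor while remembering the parent, and if the key is absent, the cursor already points at the link where a new node goes. The bare idiom, with a hypothetical item type and a memcmp-style cmp():

        #include <linux/rbtree.h>

        struct item {
                struct rb_node node;
                /* key payload */
        };

        static struct item *search_or_insert(struct rb_root *root,
                                             struct item *new_item)
        {
                struct rb_node **new = &root->rb_node, *parent = NULL;

                while (*new) {
                        struct item *this = rb_entry(*new, struct item, node);
                        int ret = cmp(new_item, this);  /* hypothetical comparator */

                        parent = *new;
                        if (ret < 0)
                                new = &parent->rb_left;
                        else if (ret > 0)
                                new = &parent->rb_right;
                        else
                                return this;            /* already present */
                }

                rb_link_node(&new_item->node, parent, new);
                rb_insert_color(&new_item->node, root);
                return NULL;                            /* inserted */
        }
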
* * This function returns the stable tree node just allocated on success, @@ -1032,17 +1251,25 @@ static struct page *stable_tree_search(struct page *page) */ static struct stable_node *stable_tree_insert(struct page *kpage) { - struct rb_node **new = &root_stable_tree.rb_node; + int nid; + unsigned long kpfn; + struct rb_root *root; + struct rb_node **new; struct rb_node *parent = NULL; struct stable_node *stable_node; + kpfn = page_to_pfn(kpage); + nid = get_kpfn_nid(kpfn); + root = root_stable_tree + nid; + new = &root->rb_node; + while (*new) { struct page *tree_page; int ret; cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node); + tree_page = get_ksm_page(stable_node, false); if (!tree_page) return NULL; @@ -1068,13 +1295,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage) if (!stable_node) return NULL; - rb_link_node(&stable_node->node, parent, new); - rb_insert_color(&stable_node->node, &root_stable_tree); - INIT_HLIST_HEAD(&stable_node->hlist); - - stable_node->kpfn = page_to_pfn(kpage); + stable_node->kpfn = kpfn; set_page_stable_node(kpage, stable_node); + DO_NUMA(stable_node->nid = nid); + rb_link_node(&stable_node->node, parent, new); + rb_insert_color(&stable_node->node, root); return stable_node; } @@ -1097,10 +1323,15 @@ static struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, struct page *page, struct page **tree_pagep) - { - struct rb_node **new = &root_unstable_tree.rb_node; + struct rb_node **new; + struct rb_root *root; struct rb_node *parent = NULL; + int nid; + + nid = get_kpfn_nid(page_to_pfn(page)); + root = root_unstable_tree + nid; + new = &root->rb_node; while (*new) { struct rmap_item *tree_rmap_item; @@ -1130,6 +1361,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, } else if (ret > 0) { put_page(tree_page); new = &parent->rb_right; + } else if (!ksm_merge_across_nodes && + page_to_nid(tree_page) != nid) { + /* + * If tree_page has been migrated to another NUMA node, + * it will be flushed out and put in the right unstable + * tree next time: only merge with it when across_nodes. 
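
Since root_stable_tree and root_unstable_tree are now arrays indexed by get_kpfn_nid(), the single-tree behaviour is preserved when merge_across_nodes is 1 (everything maps to index 0), and each NUMA node gets a private pair of trees when it is 0. The selection step, isolated into a sketch helper (stable_root_of() is hypothetical):

        static inline struct rb_root *stable_root_of(struct page *page)
        {
                /* index 0 when merging across nodes, else the page's node id */
                return root_stable_tree + get_kpfn_nid(page_to_pfn(page));
        }
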
+ */ + put_page(tree_page); + return NULL; } else { *tree_pagep = tree_page; return tree_rmap_item; @@ -1138,8 +1378,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); + DO_NUMA(rmap_item->nid = nid); rb_link_node(&rmap_item->node, parent, new); - rb_insert_color(&rmap_item->node, &root_unstable_tree); + rb_insert_color(&rmap_item->node, root); ksm_pages_unshared++; return NULL; @@ -1181,10 +1422,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) unsigned int checksum; int err; - remove_rmap_item_from_tree(rmap_item); + stable_node = page_stable_node(page); + if (stable_node) { + if (stable_node->head != &migrate_nodes && + get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { + rb_erase(&stable_node->node, + root_stable_tree + NUMA(stable_node->nid)); + stable_node->head = &migrate_nodes; + list_add(&stable_node->list, stable_node->head); + } + if (stable_node->head != &migrate_nodes && + rmap_item->head == stable_node) + return; + } /* We first start with searching the page inside the stable tree */ kpage = stable_tree_search(page); + if (kpage == page && rmap_item->head == stable_node) { + put_page(kpage); + return; + } + + remove_rmap_item_from_tree(rmap_item); + if (kpage) { err = try_to_merge_with_ksm_page(rmap_item, page, kpage); if (!err) { @@ -1218,14 +1478,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) kpage = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); put_page(tree_page); - /* - * As soon as we merge this page, we want to remove the - * rmap_item of the page we have merged with from the unstable - * tree, and insert it instead as new node in the stable tree. - */ if (kpage) { - remove_rmap_item_from_tree(tree_rmap_item); - + /* + * The pages were successfully merged: insert new + * node in the stable tree and add both rmap_items. + */ lock_page(kpage); stable_node = stable_tree_insert(kpage); if (stable_node) { @@ -1282,6 +1539,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) struct mm_slot *slot; struct vm_area_struct *vma; struct rmap_item *rmap_item; + int nid; if (list_empty(&ksm_mm_head.mm_list)) return NULL; @@ -1300,7 +1558,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) */ lru_add_drain_all(); - root_unstable_tree = RB_ROOT; + /* + * Whereas stale stable_nodes on the stable_tree itself + * get pruned in the regular course of stable_tree_search(), + * those moved out to the migrate_nodes list can accumulate: + * so prune them once before each full scan. + */ + if (!ksm_merge_across_nodes) { + struct stable_node *stable_node; + struct list_head *this, *next; + struct page *page; + + list_for_each_safe(this, next, &migrate_nodes) { + stable_node = list_entry(this, + struct stable_node, list); + page = get_ksm_page(stable_node, false); + if (page) + put_page(page); + cond_resched(); + } + } + + for (nid = 0; nid < ksm_nr_node_ids; nid++) + root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); @@ -1385,7 +1665,7 @@ next_mm: * or when all VM_MERGEABLE areas have been unmapped (and * mmap_sem then protects against race with MADV_MERGEABLE). 
*/ - hlist_del(&slot->link); + hash_del(&slot->link); list_del(&slot->mm_list); spin_unlock(&ksm_mmlist_lock); @@ -1421,8 +1701,7 @@ static void ksm_do_scan(unsigned int scan_npages) rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) return; - if (!PageKsm(page) || !in_stable_tree(rmap_item)) - cmp_and_merge_page(page, rmap_item); + cmp_and_merge_page(page, rmap_item); put_page(page); } } @@ -1439,6 +1718,7 @@ static int ksm_scan_thread(void *nothing) while (!kthread_should_stop()) { mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); if (ksmd_should_run()) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); @@ -1469,10 +1749,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) + VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ +#ifdef VM_SAO + if (*vm_flags & VM_SAO) + return 0; +#endif + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) @@ -1514,11 +1798,19 @@ int __ksm_enter(struct mm_struct *mm) spin_lock(&ksm_mmlist_lock); insert_to_mm_slots_hash(mm, mm_slot); /* - * Insert just behind the scanning cursor, to let the area settle + * When KSM_RUN_MERGE (or KSM_RUN_STOP), + * insert just behind the scanning cursor, to let the area settle * down a little; when fork is followed by immediate exec, we don't * want ksmd to waste time setting up and tearing down an rmap_list. + * + * But when KSM_RUN_UNMERGE, it's important to insert ahead of its + * scanning cursor, otherwise KSM pages in newly forked mms will be + * missed: then we might as well insert at the end of the list. 
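
ksm_scan_thread() above is the stock shape for a freezable scanning kthread: take the subsystem mutex, yield to any pending hotplug (wait_while_offlining()), do one batch, then either doze for the tunable interval or park on a waitqueue until re-enabled. A condensed sketch of that loop (demo_* names are placeholders; the real tunables are ksm_thread_pages_to_scan and ksm_thread_sleep_millisecs):

        static int demo_scan_thread(void *nothing)
        {
                set_freezable();

                while (!kthread_should_stop()) {
                        mutex_lock(&demo_mutex);
                        wait_while_offlining();         /* as in the hunk above */
                        if (demo_should_run())
                                demo_scan(pages_per_batch);
                        mutex_unlock(&demo_mutex);

                        try_to_freeze();
                        if (demo_should_run())
                                schedule_timeout_interruptible(
                                        msecs_to_jiffies(sleep_millisecs));
                        else
                                wait_event_freezable(demo_wait,
                                        demo_should_run() || kthread_should_stop());
                }
                return 0;
        }
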
*/ - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + if (ksm_run & KSM_RUN_UNMERGE) + list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); + else + list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); @@ -1548,7 +1840,7 @@ void __ksm_exit(struct mm_struct *mm) mm_slot = get_mm_slot(mm); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { - hlist_del(&mm_slot->link); + hash_del(&mm_slot->link); list_del(&mm_slot->mm_list); easy_to_free = 1; } else { @@ -1568,158 +1860,64 @@ void __ksm_exit(struct mm_struct *mm) } } -struct page *ksm_does_need_to_copy(struct page *page, +struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { + struct anon_vma *anon_vma = page_anon_vma(page); struct page *new_page; + if (PageKsm(page)) { + if (page_stable_node(page) && + !(ksm_run & KSM_RUN_UNMERGE)) + return page; /* no need to copy it */ + } else if (!anon_vma) { + return page; /* no need to copy it */ + } else if (anon_vma->root == vma->anon_vma->root && + page->index == linear_page_index(vma, address)) { + return page; /* still no need to copy it */ + } + if (!PageUptodate(page)) + return page; /* let do_swap_page report the error */ + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { copy_user_highpage(new_page, page, address, vma); SetPageDirty(new_page); __SetPageUptodate(new_page); - SetPageSwapBacked(new_page); __set_page_locked(new_page); - - if (page_evictable(new_page, vma)) - lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); - else - add_page_to_unevictable_list(new_page); } return new_page; } -int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, - unsigned long *vm_flags) +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; - struct hlist_node *hlist; - unsigned int mapcount = page_mapcount(page); - int referenced = 0; - int search_new_forks = 0; - - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); - - stable_node = page_stable_node(page); - if (!stable_node) - return 0; -again: - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { - vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) - continue; - /* - * Initially we examine only the vma which covers this - * rmap_item; but later, if there is still work to do, - * we examine covering vmas in other mms: in case they - * were forked from the original since ksmd passed. 
- */ - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) - continue; - - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - - referenced += page_referenced_one(page, vma, - rmap_item->address, &mapcount, vm_flags); - if (!search_new_forks || !mapcount) - break; - } - anon_vma_unlock(anon_vma); - if (!mapcount) - goto out; - } - if (!search_new_forks++) - goto again; -out: - return referenced; -} - -int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) -{ - struct stable_node *stable_node; - struct hlist_node *hlist; - struct rmap_item *rmap_item; int ret = SWAP_AGAIN; int search_new_forks = 0; - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); - - stable_node = page_stable_node(page); - if (!stable_node) - return SWAP_FAIL; -again: - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { - vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) - continue; - /* - * Initially we examine only the vma which covers this - * rmap_item; but later, if there is still work to do, - * we examine covering vmas in other mms: in case they - * were forked from the original since ksmd passed. - */ - if ((rmap_item->mm == vma->vm_mm) == search_new_forks) - continue; - - ret = try_to_unmap_one(page, vma, - rmap_item->address, flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) { - anon_vma_unlock(anon_vma); - goto out; - } - } - anon_vma_unlock(anon_vma); - } - if (!search_new_forks++) - goto again; -out: - return ret; -} - -#ifdef CONFIG_MIGRATION -int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) -{ - struct stable_node *stable_node; - struct hlist_node *hlist; - struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; - int search_new_forks = 0; + VM_BUG_ON_PAGE(!PageKsm(page), page); - VM_BUG_ON(!PageKsm(page)); - VM_BUG_ON(!PageLocked(page)); + /* + * Rely on the page lock to protect against concurrent modifications + * to that page's node of the stable tree. 
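
The rewrite above collapses page_referenced_ksm() and try_to_unmap_ksm() into one walker parameterized by struct rmap_walk_control, so each rmap consumer supplies callbacks rather than a bespoke KSM entry point. A minimal consumer sketch (callback bodies are placeholders):

        static int demo_rmap_one(struct page *page, struct vm_area_struct *vma,
                                 unsigned long address, void *arg)
        {
                /* inspect or modify one mapping of @page */
                return SWAP_AGAIN;              /* keep walking */
        }

        static int demo_done(struct page *page)
        {
                return !page_mapped(page);      /* stop once fully unmapped */
        }

        static void demo_walk(struct page *page)
        {
                struct rmap_walk_control rwc = {
                        .rmap_one = demo_rmap_one,
                        .done = demo_done,
                        /* .invalid_vma and .arg are optional, as the hunk shows */
                };

                rmap_walk_ksm(page, &rwc);
        }
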
+ */ + VM_BUG_ON_PAGE(!PageLocked(page), page); stable_node = page_stable_node(page); if (!stable_node) return ret; again: - hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); - list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + anon_vma_lock_read(anon_vma); + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) @@ -1733,13 +1931,21 @@ again: if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; - ret = rmap_one(page, vma, rmap_item->address, arg); + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + + ret = rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg); if (ret != SWAP_AGAIN) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); + goto out; + } + if (rwc->done && rwc->done(page)) { + anon_vma_unlock_read(anon_vma); goto out; } } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; @@ -1747,76 +1953,128 @@ out: return ret; } +#ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { struct stable_node *stable_node; - VM_BUG_ON(!PageLocked(oldpage)); - VM_BUG_ON(!PageLocked(newpage)); - VM_BUG_ON(newpage->mapping != oldpage->mapping); + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage); stable_node = page_stable_node(newpage); if (stable_node) { - VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); + VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage); stable_node->kpfn = page_to_pfn(newpage); + /* + * newpage->mapping was set in advance; now we need smp_wmb() + * to make sure that the new stable_node->kpfn is visible + * to get_ksm_page() before it can see that oldpage->mapping + * has gone stale (or that PageSwapCache has been cleared). 
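
The smp_wmb() in ksm_migrate_page() above pairs with the smp_rmb() in get_ksm_page()'s stale path: the migration side publishes the new kpfn before staling the old page, and the reader re-reads kpfn after observing the stale mapping, retrying instead of wrongly pruning a node that merely moved. The two sides, condensed from the hunks:

        /* writer (migration) */
        stable_node->kpfn = page_to_pfn(newpage);
        smp_wmb();                              /* publish kpfn first ...      */
        set_page_stable_node(oldpage, NULL);    /* ... then stale the old page */

        /* reader (get_ksm_page() stale path) */
        smp_rmb();                              /* pairs with the smp_wmb()    */
        if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
                goto again;                     /* migrated, not stale: retry  */
        remove_node_from_stable_tree(stable_node);
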
+ */ + smp_wmb(); + set_page_stable_node(oldpage, NULL); } } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE -static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, - unsigned long end_pfn) +static int just_wait(void *word) { - struct rb_node *node; + schedule(); + return 0; +} - for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { - struct stable_node *stable_node; +static void wait_while_offlining(void) +{ + while (ksm_run & KSM_RUN_OFFLINE) { + mutex_unlock(&ksm_thread_mutex); + wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), + just_wait, TASK_UNINTERRUPTIBLE); + mutex_lock(&ksm_thread_mutex); + } +} - stable_node = rb_entry(node, struct stable_node, node); +static void ksm_check_stable_tree(unsigned long start_pfn, + unsigned long end_pfn) +{ + struct stable_node *stable_node; + struct list_head *this, *next; + struct rb_node *node; + int nid; + + for (nid = 0; nid < ksm_nr_node_ids; nid++) { + node = rb_first(root_stable_tree + nid); + while (node) { + stable_node = rb_entry(node, struct stable_node, node); + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) { + /* + * Don't get_ksm_page, page has already gone: + * which is why we keep kpfn instead of page* + */ + remove_node_from_stable_tree(stable_node); + node = rb_first(root_stable_tree + nid); + } else + node = rb_next(node); + cond_resched(); + } + } + list_for_each_safe(this, next, &migrate_nodes) { + stable_node = list_entry(this, struct stable_node, list); if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) - return stable_node; + remove_node_from_stable_tree(stable_node); + cond_resched(); } - return NULL; } static int ksm_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; - struct stable_node *stable_node; switch (action) { case MEM_GOING_OFFLINE: /* - * Keep it very simple for now: just lock out ksmd and - * MADV_UNMERGEABLE while any memory is going offline. - * mutex_lock_nested() is necessary because lockdep was alarmed - * that here we take ksm_thread_mutex inside notifier chain - * mutex, and later take notifier chain mutex inside - * ksm_thread_mutex to unlock it. But that's safe because both - * are inside mem_hotplug_mutex. + * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() + * and remove_all_stable_nodes() while memory is going offline: + * it is unsafe for them to touch the stable tree at this time. + * But unmerge_ksm_pages(), rmap lookups and other entry points + * which do not need the ksm_thread_mutex are all safe. */ - mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&ksm_thread_mutex); + ksm_run |= KSM_RUN_OFFLINE; + mutex_unlock(&ksm_thread_mutex); break; case MEM_OFFLINE: /* * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct - * pages which have been offlined: prune those from the tree. + * pages which have been offlined: prune those from the tree, + * otherwise get_ksm_page() might later try to access a + * non-existent struct page. 
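
KSM_RUN_OFFLINE turns ksm_run into a sleep/wake rendezvous for memory hotremove: MEM_GOING_OFFLINE sets the bit under ksm_thread_mutex, waiters drop the mutex and block in wait_on_bit(), and MEM_OFFLINE/MEM_CANCEL_OFFLINE clears it and issues wake_up_bit(). Condensed from the hunks above (note the 3.x wait_on_bit() signature, which still takes an action callback such as just_wait()):

        /* waiter: wait_while_offlining() */
        while (ksm_run & KSM_RUN_OFFLINE) {
                mutex_unlock(&ksm_thread_mutex);
                wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
                            just_wait, TASK_UNINTERRUPTIBLE);
                mutex_lock(&ksm_thread_mutex);
        }

        /* waker: memory notifier, once offlining finishes or is cancelled */
        ksm_run &= ~KSM_RUN_OFFLINE;
        smp_mb();       /* make the cleared bit visible before the wakeup */
        wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
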
*/ - while ((stable_node = ksm_check_stable_tree(mn->start_pfn, - mn->start_pfn + mn->nr_pages)) != NULL) - remove_node_from_stable_tree(stable_node); + ksm_check_stable_tree(mn->start_pfn, + mn->start_pfn + mn->nr_pages); /* fallthrough */ case MEM_CANCEL_OFFLINE: + mutex_lock(&ksm_thread_mutex); + ksm_run &= ~KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); + + smp_mb(); /* wake_up_bit advises this */ + wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); break; } return NOTIFY_OK; } +#else +static void wait_while_offlining(void) +{ +} #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_SYSFS @@ -1843,7 +2101,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -1866,7 +2124,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, int err; unsigned long nr_pages; - err = strict_strtoul(buf, 10, &nr_pages); + err = kstrtoul(buf, 10, &nr_pages); if (err || nr_pages > UINT_MAX) return -EINVAL; @@ -1879,7 +2137,7 @@ KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_run); + return sprintf(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1888,7 +2146,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, int err; unsigned long flags; - err = strict_strtoul(buf, 10, &flags); + err = kstrtoul(buf, 10, &flags); if (err || flags > UINT_MAX) return -EINVAL; if (flags > KSM_RUN_UNMERGE) @@ -1902,15 +2160,13 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, */ mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { - int oom_score_adj; - - oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); + set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); - compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, - oom_score_adj); + clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; @@ -1926,6 +2182,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); +#ifdef CONFIG_NUMA +static ssize_t merge_across_nodes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_merge_across_nodes); +} + +static ssize_t merge_across_nodes_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long knob; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 1) + return -EINVAL; + + mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); + if (ksm_merge_across_nodes != knob) { + if (ksm_pages_shared || remove_all_stable_nodes()) + err = -EBUSY; + else if (root_stable_tree == one_stable_tree) { + struct rb_root *buf; + /* + * This is the first time that we switch away from the + * default of merging across nodes: must now allocate + * a buffer to hold as many roots as may be needed. + * Allocate stable and unstable together: + * MAXSMP NODES_SHIFT 10 will use 16kB. 
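
Throughout these files, strict_strtoul() and simple_strtoul() give way to kstrtoul(), which returns an error on overflow or trailing junk instead of silently accepting it. The resulting sysfs store shape, sketched for a hypothetical 0/1 knob:

        static ssize_t demo_knob_store(struct kobject *kobj,
                                       struct kobj_attribute *attr,
                                       const char *buf, size_t count)
        {
                unsigned long val;
                int err;

                err = kstrtoul(buf, 10, &val);  /* base 10, strict parse */
                if (err)
                        return err;
                if (val > 1)                    /* range-check the knob */
                        return -EINVAL;

                demo_knob = val;                /* hypothetical global */
                return count;
        }
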
+ */ + buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), + GFP_KERNEL); + /* Let us assume that RB_ROOT is NULL is zero */ + if (!buf) + err = -ENOMEM; + else { + root_stable_tree = buf; + root_unstable_tree = buf + nr_node_ids; + /* Stable tree is empty but not the unstable */ + root_unstable_tree[0] = one_unstable_tree[0]; + } + } + if (!err) { + ksm_merge_across_nodes = knob; + ksm_nr_node_ids = knob ? 1 : nr_node_ids; + } + } + mutex_unlock(&ksm_thread_mutex); + + return err ? err : count; +} +KSM_ATTR(merge_across_nodes); +#endif + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1980,6 +2294,9 @@ static struct attribute *ksm_attrs[] = { &pages_unshared_attr.attr, &pages_volatile_attr.attr, &full_scans_attr.attr, +#ifdef CONFIG_NUMA + &merge_across_nodes_attr.attr, +#endif NULL, }; @@ -2018,10 +2335,7 @@ static int __init ksm_init(void) #endif /* CONFIG_SYSFS */ #ifdef CONFIG_MEMORY_HOTREMOVE - /* - * Choose a high priority since the callback takes ksm_thread_mutex: - * later callbacks could only be taking locks which nest within that. - */ + /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, 100); #endif return 0; @@ -2031,4 +2345,4 @@ out_free: out: return err; } -module_init(ksm_init) +subsys_initcall(ksm_init); diff --git a/mm/list_lru.c b/mm/list_lru.c new file mode 100644 index 000000000000..f1a0db194173 --- /dev/null +++ b/mm/list_lru.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. + * Authors: David Chinner and Glauber Costa + * + * Generic LRU infrastructure + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/list_lru.h> +#include <linux/slab.h> + +bool list_lru_add(struct list_lru *lru, struct list_head *item) +{ + int nid = page_to_nid(virt_to_page(item)); + struct list_lru_node *nlru = &lru->node[nid]; + + spin_lock(&nlru->lock); + WARN_ON_ONCE(nlru->nr_items < 0); + if (list_empty(item)) { + list_add_tail(item, &nlru->list); + if (nlru->nr_items++ == 0) + node_set(nid, lru->active_nodes); + spin_unlock(&nlru->lock); + return true; + } + spin_unlock(&nlru->lock); + return false; +} +EXPORT_SYMBOL_GPL(list_lru_add); + +bool list_lru_del(struct list_lru *lru, struct list_head *item) +{ + int nid = page_to_nid(virt_to_page(item)); + struct list_lru_node *nlru = &lru->node[nid]; + + spin_lock(&nlru->lock); + if (!list_empty(item)) { + list_del_init(item); + if (--nlru->nr_items == 0) + node_clear(nid, lru->active_nodes); + WARN_ON_ONCE(nlru->nr_items < 0); + spin_unlock(&nlru->lock); + return true; + } + spin_unlock(&nlru->lock); + return false; +} +EXPORT_SYMBOL_GPL(list_lru_del); + +unsigned long +list_lru_count_node(struct list_lru *lru, int nid) +{ + unsigned long count = 0; + struct list_lru_node *nlru = &lru->node[nid]; + + spin_lock(&nlru->lock); + WARN_ON_ONCE(nlru->nr_items < 0); + count += nlru->nr_items; + spin_unlock(&nlru->lock); + + return count; +} +EXPORT_SYMBOL_GPL(list_lru_count_node); + +unsigned long +list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, + void *cb_arg, unsigned long *nr_to_walk) +{ + + struct list_lru_node *nlru = &lru->node[nid]; + struct list_head *item, *n; + unsigned long isolated = 0; + + spin_lock(&nlru->lock); +restart: + list_for_each_safe(item, n, &nlru->list) { + enum lru_status ret; + + /* + * decrement nr_to_walk first so that we don't livelock if we + * get stuck on large numbesr of LRU_RETRY items + */ + 
if (!*nr_to_walk) + break; + --*nr_to_walk; + + ret = isolate(item, &nlru->lock, cb_arg); + switch (ret) { + case LRU_REMOVED_RETRY: + assert_spin_locked(&nlru->lock); + case LRU_REMOVED: + if (--nlru->nr_items == 0) + node_clear(nid, lru->active_nodes); + WARN_ON_ONCE(nlru->nr_items < 0); + isolated++; + /* + * If the lru lock has been dropped, our list + * traversal is now invalid and so we have to + * restart from scratch. + */ + if (ret == LRU_REMOVED_RETRY) + goto restart; + break; + case LRU_ROTATE: + list_move_tail(item, &nlru->list); + break; + case LRU_SKIP: + break; + case LRU_RETRY: + /* + * The lru lock has been dropped, our list traversal is + * now invalid and so we have to restart from scratch. + */ + assert_spin_locked(&nlru->lock); + goto restart; + default: + BUG(); + } + } + + spin_unlock(&nlru->lock); + return isolated; +} +EXPORT_SYMBOL_GPL(list_lru_walk_node); + +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) +{ + int i; + size_t size = sizeof(*lru->node) * nr_node_ids; + + lru->node = kzalloc(size, GFP_KERNEL); + if (!lru->node) + return -ENOMEM; + + nodes_clear(lru->active_nodes); + for (i = 0; i < nr_node_ids; i++) { + spin_lock_init(&lru->node[i].lock); + if (key) + lockdep_set_class(&lru->node[i].lock, key); + INIT_LIST_HEAD(&lru->node[i].list); + lru->node[i].nr_items = 0; + } + return 0; +} +EXPORT_SYMBOL_GPL(list_lru_init_key); + +void list_lru_destroy(struct list_lru *lru) +{ + kfree(lru->node); +} +EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/madvise.c b/mm/madvise.c index 1ccbba5b6674..a402f8fdc68e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,8 +11,14 @@ #include <linux/mempolicy.h> #include <linux/page-isolation.h> #include <linux/hugetlb.h> +#include <linux/falloc.h> #include <linux/sched.h> #include <linux/ksm.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/swapops.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -36,11 +42,11 @@ static int madvise_need_mmap_write(int behavior) * We can potentially split a vm area into separate * areas, each area with its own behavior. 
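
Stepping back to the new mm/list_lru.c above: consumers embed a list_head in their object, feed it to list_lru_add()/list_lru_del() (which require the head to be empty resp. non-empty, hence the list_empty() checks), and reclaim through list_lru_walk_node(), whose isolate callback must itself unlink the item before returning LRU_REMOVED. A minimal consumer sketch (demo_obj and can_free() are hypothetical):

        #include <linux/list_lru.h>

        struct demo_obj {
                struct list_head lru;   /* INIT_LIST_HEAD before first use */
                /* payload */
        };

        static enum lru_status demo_isolate(struct list_head *item,
                                            spinlock_t *lock, void *cb_arg)
        {
                struct demo_obj *obj = container_of(item, struct demo_obj, lru);

                if (!can_free(obj))
                        return LRU_ROTATE;      /* keep it, move to list tail */

                list_del_init(item);            /* callback unlinks the item */
                return LRU_REMOVED;             /* walker adjusts nr_items */
        }

        static unsigned long demo_shrink(struct list_lru *lru, int nid,
                                         unsigned long nr_to_walk)
        {
                /* returns the number of objects isolated on this node */
                return list_lru_walk_node(lru, nid, demo_isolate,
                                          NULL, &nr_to_walk);
        }
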
*/ -static long madvise_behavior(struct vm_area_struct * vma, +static long madvise_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { - struct mm_struct * mm = vma->vm_mm; + struct mm_struct *mm = vma->vm_mm; int error = 0; pgoff_t pgoff; unsigned long new_flags = vma->vm_flags; @@ -66,10 +72,14 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags &= ~VM_DONTCOPY; break; case MADV_DONTDUMP: - new_flags |= VM_NODUMP; + new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - new_flags &= ~VM_NODUMP; + if (new_flags & VM_SPECIAL) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: @@ -124,15 +134,105 @@ out: return error; } +#ifdef CONFIG_SWAP +static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + pte_t *orig_pte; + struct vm_area_struct *vma = walk->private; + unsigned long index; + + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + return 0; + + for (index = start; index != end; index += PAGE_SIZE) { + pte_t pte; + swp_entry_t entry; + struct page *page; + spinlock_t *ptl; + + orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); + pte = *(orig_pte + ((index - start) / PAGE_SIZE)); + pte_unmap_unlock(orig_pte, ptl); + + if (pte_present(pte) || pte_none(pte) || pte_file(pte)) + continue; + entry = pte_to_swp_entry(pte); + if (unlikely(non_swap_entry(entry))) + continue; + + page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, + vma, index); + if (page) + page_cache_release(page); + } + + return 0; +} + +static void force_swapin_readahead(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_walk walk = { + .mm = vma->vm_mm, + .pmd_entry = swapin_walk_pmd_entry, + .private = vma, + }; + + walk_page_range(start, end, &walk); + + lru_add_drain(); /* Push any new pages onto the LRU now */ +} + +static void force_shm_swapin_readahead(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct address_space *mapping) +{ + pgoff_t index; + struct page *page; + swp_entry_t swap; + + for (; start < end; start += PAGE_SIZE) { + index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + page = find_get_entry(mapping, index); + if (!radix_tree_exceptional_entry(page)) { + if (page) + page_cache_release(page); + continue; + } + swap = radix_to_swp_entry(page); + page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, + NULL, 0); + if (page) + page_cache_release(page); + } + + lru_add_drain(); /* Push any new pages onto the LRU now */ +} +#endif /* CONFIG_SWAP */ + /* * Schedule all required I/O operations. Do not wait for completion. */ -static long madvise_willneed(struct vm_area_struct * vma, - struct vm_area_struct ** prev, +static long madvise_willneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct file *file = vma->vm_file; +#ifdef CONFIG_SWAP + if (!file || mapping_cap_swap_backed(file->f_mapping)) { + *prev = vma; + if (!file) + force_swapin_readahead(vma, start, end); + else + force_shm_swapin_readahead(vma, start, end, + file->f_mapping); + return 0; + } +#endif + if (!file) return -EBADF; @@ -170,8 +270,8 @@ static long madvise_willneed(struct vm_area_struct * vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). 
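
With the CONFIG_SWAP branch above, MADV_WILLNEED stops being a file-backed-only hint: anonymous and shmem ranges now get asynchronous swap-in readahead. The userspace side is unchanged; a minimal sketch of the call:

        #include <sys/mman.h>
        #include <stdio.h>

        /* hint that [addr, addr+len) will be needed soon; addr must be
         * page-aligned; the kernel queues readahead and returns at once */
        static int prefault_hint(void *addr, size_t len)
        {
                if (madvise(addr, len, MADV_WILLNEED) != 0) {
                        perror("madvise(MADV_WILLNEED)");
                        return -1;
                }
                return 0;
        }
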
*/ -static long madvise_dontneed(struct vm_area_struct * vma, - struct vm_area_struct ** prev, +static long madvise_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, unsigned long end) { *prev = vma; @@ -200,33 +300,39 @@ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { - struct address_space *mapping; - loff_t offset, endoff; + loff_t offset; int error; + struct file *f; *prev = NULL; /* tell sys_madvise we drop mmap_sem */ if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) return -EINVAL; - if (!vma->vm_file || !vma->vm_file->f_mapping - || !vma->vm_file->f_mapping->host) { + f = vma->vm_file; + + if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) return -EACCES; - mapping = vma->vm_file->f_mapping; - offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - endoff = (loff_t)(end - vma->vm_start - 1) - + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* vmtruncate_range needs to take i_mutex */ + /* + * Filesystem's fallocate may need to take i_mutex. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_sem. + */ + get_file(f); up_read(&current->mm->mmap_sem); - error = vmtruncate_range(mapping->host, offset, endoff); + error = do_fallocate(f, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, end - start); + fput(f); down_read(&current->mm->mmap_sem); return error; } @@ -237,29 +343,35 @@ static long madvise_remove(struct vm_area_struct *vma, */ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) { - int ret = 0; - + struct page *p; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - for (; start < end; start += PAGE_SIZE) { - struct page *p; - int ret = get_user_pages_fast(start, 1, 0, &p); + for (; start < end; start += PAGE_SIZE << + compound_order(compound_head(p))) { + int ret; + + ret = get_user_pages_fast(start, 1, 0, &p); if (ret != 1) return ret; + + if (PageHWPoison(p)) { + put_page(p); + continue; + } if (bhv == MADV_SOFT_OFFLINE) { - printk(KERN_INFO "Soft offlining page %lx at %lx\n", + pr_info("Soft offlining page %#lx at %#lx\n", page_to_pfn(p), start); ret = soft_offline_page(p, MF_COUNT_INCREASED); if (ret) - break; + return ret; continue; } - printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + pr_info("Injecting memory failure for page %#lx at %#lx\n", page_to_pfn(p), start); /* Ignore return value for now */ memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); } - return ret; + return 0; } #endif @@ -353,11 +465,12 @@ madvise_behavior_valid(int behavior) SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { unsigned long end, tmp; - struct vm_area_struct * vma, *prev; + struct vm_area_struct *vma, *prev; int unmapped_error = 0; int error = -EINVAL; int write; size_t len; + struct blk_plug plug; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) @@ -366,27 +479,27 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) if (!madvise_behavior_valid(behavior)) return error; - write = madvise_need_mmap_write(behavior); - if (write) - down_write(&current->mm->mmap_sem); - else - down_read(&current->mm->mmap_sem); - if (start & ~PAGE_MASK) - goto out; + return error; len = (len_in + ~PAGE_MASK) & PAGE_MASK; /* Check to see whether len was rounded
up from small -ve to zero */ if (len_in && !len) - goto out; + return error; end = start + len; if (end < start) - goto out; + return error; error = 0; if (end == start) - goto out; + return error; + + write = madvise_need_mmap_write(behavior); + if (write) + down_write(&current->mm->mmap_sem); + else + down_read(&current->mm->mmap_sem); /* * If the interval [start,end) covers some unmapped address @@ -397,6 +510,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) if (vma && start > vma->vm_start) prev = vma; + blk_start_plug(&plug); for (;;) { /* Still start < end. */ error = -ENOMEM; @@ -432,6 +546,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) vma = find_vma(current->mm, start); } out: + blk_finish_plug(&plug); if (write) up_write(&current->mm->mmap_sem); else diff --git a/mm/memblock.c b/mm/memblock.c index a44eab3157f8..e9d6ca9a01a9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,6 +20,11 @@ #include <linux/seq_file.h> #include <linux/memblock.h> +#include <asm-generic/sections.h> +#include <linux/io.h> + +#include "internal.h" + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; @@ -32,14 +37,21 @@ struct memblock memblock __initdata_memblock = { .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, + .bottom_up = false, .current_limit = MEMBLOCK_ALLOC_ANYWHERE, }; int memblock_debug __initdata_memblock; +#ifdef CONFIG_MOVABLE_NODE +bool movable_node_enabled __initdata_memblock = false; +#endif static int memblock_can_resize __initdata_memblock; +static int memblock_memory_in_slab __initdata_memblock = 0; +static int memblock_reserved_in_slab __initdata_memblock = 0; /* inline so we don't get a warning when pr_debug is compiled out */ -static inline const char *memblock_type_name(struct memblock_type *type) +static __init_memblock const char * +memblock_type_name(struct memblock_type *type) { if (type == &memblock.memory) return "memory"; @@ -79,33 +91,57 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, return (i < type->cnt) ? i : -1; } -/** - * memblock_find_in_range_node - find free area in given range and node +/* + * __memblock_find_range_bottom_up - find free area utility in bottom-up * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Find @size free area aligned to @align in the specified range and node. + * Utility called from memblock_find_in_range_node(), find free area bottom-up. * * RETURNS: - * Found address on success, %0 on failure. + * Found address on success, 0 on failure.
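
Returning to the madvise() hunk above: the blk_plug wrapped around the per-VMA loop batches whatever I/O the per-range handlers (notably MADV_WILLNEED readahead) submit, letting adjacent requests merge before they reach the block layer. The general shape of the pattern:

        static void demo_batched_io(void)
        {
                struct blk_plug plug;

                blk_start_plug(&plug);  /* queue requests per-task */
                issue_many_reads();     /* hypothetical: submits bios */
                blk_finish_plug(&plug); /* unplug: flush the merged batch */
        }
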
*/ -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, - phys_addr_t end, phys_addr_t size, - phys_addr_t align, int nid) +static phys_addr_t __init_memblock +__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align, int nid) { phys_addr_t this_start, this_end, cand; u64 i; - /* pump up @end */ - if (end == MEMBLOCK_ALLOC_ACCESSIBLE) - end = memblock.current_limit; + for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); - /* avoid allocating the first page */ - start = max_t(phys_addr_t, start, PAGE_SIZE); - end = max(start, end); + cand = round_up(this_start, align); + if (cand < this_end && this_end - cand >= size) + return cand; + } + + return 0; +} + +/** + * __memblock_find_range_top_down - find free area utility, in top-down + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Utility called from memblock_find_in_range_node(), find free area top-down. + * + * RETURNS: + * Found address on success, 0 on failure. + */ +static phys_addr_t __init_memblock +__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align, int nid) +{ + phys_addr_t this_start, this_end, cand; + u64 i; for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { this_start = clamp(this_start, start, end); @@ -118,10 +154,81 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, if (cand >= this_start) return cand; } + return 0; } /** + * memblock_find_in_range_node - find free area in given range and node + * @size: size of free area to find + * @align: alignment of free area to find + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Find @size free area aligned to @align in the specified range and node. + * + * When allocation direction is bottom-up, the @start should be greater + * than the end of the kernel image. Otherwise, it will be trimmed. The + * reason is that we want the bottom-up allocation just near the kernel + * image so it is highly likely that the allocated memory and the kernel + * will reside in the same node. + * + * If the bottom-up allocation fails, allocation falls back to top-down. + * + * RETURNS: + * Found address on success, 0 on failure. + */ +phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) +{ + int ret; + phys_addr_t kernel_end; + + /* pump up @end */ + if (end == MEMBLOCK_ALLOC_ACCESSIBLE) + end = memblock.current_limit; + + /* avoid allocating the first page */ + start = max_t(phys_addr_t, start, PAGE_SIZE); + end = max(start, end); + kernel_end = __pa_symbol(_end); + + /* + * try bottom-up allocation only when bottom-up mode + * is set and @end is above the kernel image.
+ */ + if (memblock_bottom_up() && end > kernel_end) { + phys_addr_t bottom_up_start; + + /* make sure we will allocate above the kernel */ + bottom_up_start = max(start, kernel_end); + + /* ok, try bottom-up allocation first */ + ret = __memblock_find_range_bottom_up(bottom_up_start, end, + size, align, nid); + if (ret) + return ret; + + /* + * we always limit bottom-up allocation above the kernel, + * but top-down allocation doesn't have the limit, so + * retrying top-down allocation may succeed when bottom-up + * allocation failed. + * + * bottom-up allocation is expected to fail very rarely, + * so we use WARN_ONCE() here to see the stack trace if + * a failure happens. + */ + WARN_ONCE(1, "memblock: bottom-up allocation failed, " "memory hotunplug may be affected\n"); + } + + return __memblock_find_range_top_down(start, end, size, align, nid); +} + +/** * memblock_find_in_range - find free area in given range * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} @@ -131,38 +238,14 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, * Find @size free area aligned to @align in the specified range. * * RETURNS: - * Found address on success, %0 on failure. + * Found address on success, 0 on failure. */ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { - return memblock_find_in_range_node(start, end, size, align, - MAX_NUMNODES); -} - -/* - * Free memblock.reserved.regions - */ -int __init_memblock memblock_free_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_free(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - -/* - * Reserve memblock.reserved.regions - */ -int __init_memblock memblock_reserve_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_reserve(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); + return memblock_find_in_range_node(size, align, start, end, + NUMA_NO_NODE); } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -178,15 +261,63 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u type->cnt = 1; type->regions[0].base = 0; type->regions[0].size = 0; + type->regions[0].flags = 0; memblock_set_region_node(&type->regions[0], MAX_NUMNODES); } } -static int __init_memblock memblock_double_array(struct memblock_type *type) +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + +phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( + phys_addr_t *addr) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + *addr = __pa(memblock.reserved.regions); + + return PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); +} + +phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( + phys_addr_t *addr) +{ + if (memblock.memory.regions == memblock_memory_init_regions) + return 0; + + *addr = __pa(memblock.memory.regions); + + return PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.memory.max); +} + +#endif + +/** + * memblock_double_array - double the size of the memblock regions array + * @type: memblock type of the regions array being doubled + * @new_area_start: starting address of memory range to avoid overlap with + * @new_area_size: size of
memory range to avoid overlap with + * + * Double the size of the @type regions array. If memblock is being used to + * allocate memory for a new reserved regions array and there is a previously + * allocated memory range [@new_area_start,@new_area_start+@new_area_size] + * waiting to be reserved, ensure the memory used by the new array does + * not overlap. + * + * RETURNS: + * 0 on success, -1 on failure. + */ +static int __init_memblock memblock_double_array(struct memblock_type *type, + phys_addr_t new_area_start, + phys_addr_t new_area_size) { struct memblock_region *new_array, *old_array; + phys_addr_t old_alloc_size, new_alloc_size; phys_addr_t old_size, new_size, addr; int use_slab = slab_is_available(); + int *in_slab; /* We don't allow resizing until we know about the reserved regions * of memory that aren't suitable for allocation */ @@ -197,36 +328,62 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) /* Calculate new doubled size */ old_size = type->max * sizeof(struct memblock_region); new_size = old_size << 1; + /* + * We need to allocate the new one aligned to PAGE_SIZE, + * so we can free it completely later. + */ + old_alloc_size = PAGE_ALIGN(old_size); + new_alloc_size = PAGE_ALIGN(new_size); + + /* Retrieve the slab flag */ + if (type == &memblock.memory) + in_slab = &memblock_memory_in_slab; + else + in_slab = &memblock_reserved_in_slab; /* Try to find some space for it. * * WARNING: We assume that either slab_is_available() and we use it or - * we use MEMBLOCK for allocations. That means that this is unsafe to use - * when bootmem is currently active (unless bootmem itself is implemented - * on top of MEMBLOCK which isn't the case yet) + * we use MEMBLOCK for allocations. That means that this is unsafe to + * use when bootmem is currently active (unless bootmem itself is + * implemented on top of MEMBLOCK which isn't the case yet) * * This should however not be an issue for now, as we currently only - * call into MEMBLOCK while it's still active, or much later when slab is - * active for memory hotplug operations + * call into MEMBLOCK while it's still active, or much later when slab + * is active for memory hotplug operations */ if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); addr = new_array ? __pa(new_array) : 0; - } else - addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); + } else { + /* only exclude range when trying to double reserved.regions */ + if (type != &memblock.reserved) + new_area_start = new_area_size = 0; + + addr = memblock_find_in_range(new_area_start + new_area_size, + memblock.current_limit, + new_alloc_size, PAGE_SIZE); + if (!addr && new_area_size) + addr = memblock_find_in_range(0, + min(new_area_start, memblock.current_limit), + new_alloc_size, PAGE_SIZE); + + new_array = addr ? __va(addr) : NULL; + } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", memblock_type_name(type), type->max, type->max * 2); return -1; } - new_array = __va(addr); - memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", - memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); + memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", + memblock_type_name(type), type->max * 2, (u64)addr, + (u64)addr + new_size - 1); - /* Found space, we now need to move the array over before - * we add the reserved region since it may be our reserved - * array itself that is full.
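
To make the direction policy of memblock_find_in_range_node() (documented a few hunks above) concrete, here is a hedged, userspace-only sketch: a plain array stands in for the memblock free-range iterators, and all names are illustrative rather than kernel APIs. Bottom-up is tried first, confined above the kernel image, and top-down remains the fallback:

    #include <stdint.h>
    #include <stddef.h>

    struct range { uint64_t start, end; }; /* free ranges, ascending */

    static uint64_t clampu(uint64_t v, uint64_t lo, uint64_t hi)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    /* Lowest suitable address, or 0 (align must be a power of two). */
    static uint64_t scan_up(const struct range *r, size_t n, uint64_t start,
                            uint64_t end, uint64_t size, uint64_t align)
    {
        for (size_t i = 0; i < n; i++) {
            uint64_t lo = clampu(r[i].start, start, end);
            uint64_t hi = clampu(r[i].end, start, end);
            uint64_t cand = (lo + align - 1) & ~(align - 1);

            if (cand < hi && hi - cand >= size)
                return cand;
        }
        return 0;
    }

    /* Highest suitable address, or 0. */
    static uint64_t scan_down(const struct range *r, size_t n, uint64_t start,
                              uint64_t end, uint64_t size, uint64_t align)
    {
        for (size_t i = n; i-- > 0; ) {
            uint64_t lo = clampu(r[i].start, start, end);
            uint64_t hi = clampu(r[i].end, start, end);

            if (hi < size)
                continue;
            uint64_t cand = (hi - size) & ~(align - 1);
            if (cand >= lo)
                return cand;
        }
        return 0;
    }

    /* Bottom-up near the kernel first, then fall back to top-down, so the
     * allocation likely lands on the kernel's node and stays out of memory
     * that may later be hot-removed. */
    static uint64_t find_range(const struct range *r, size_t n, int bottom_up,
                               uint64_t kernel_end, uint64_t start,
                               uint64_t end, uint64_t size, uint64_t align)
    {
        if (bottom_up && end > kernel_end) {
            uint64_t lo = start > kernel_end ? start : kernel_end;
            uint64_t got = scan_up(r, n, lo, end, size, align);

            if (got)
                return got;
        }
        return scan_down(r, n, start, end, size, align);
    }
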
+ /* + * Found space, we now need to move the array over before we add the + * reserved region since it may be our reserved array itself that is + * full. */ memcpy(new_array, type->regions, old_size); memset(new_array + type->max, 0, old_size); @@ -234,21 +391,22 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) type->regions = new_array; type->max <<= 1; - /* If we use SLAB that's it, we are done */ - if (use_slab) - return 0; - - /* Add the new reserved region now. Should not fail ! */ - BUG_ON(memblock_reserve(addr, new_size)); + /* Free old array. We needn't free it if the array is the static one */ + if (*in_slab) + kfree(old_array); + else if (old_array != memblock_memory_init_regions && + old_array != memblock_reserved_init_regions) + memblock_free(__pa(old_array), old_alloc_size); - /* If the array wasn't our static init one, then free it. We only do - * that before SLAB is available as later on, we don't know whether - * to use kfree or free_bootmem_pages(). Shouldn't be a big deal - * anyways + /* + * Reserve the new array if that comes from the memblock. Otherwise, we + * needn't do it */ - if (old_array != memblock_memory_init_regions && - old_array != memblock_reserved_init_regions) - memblock_free(__pa(old_array), old_size); + if (!use_slab) + BUG_ON(memblock_reserve(addr, new_alloc_size)); + + /* Update slab flag */ + *in_slab = use_slab; return 0; } @@ -270,31 +428,36 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type) if (this->base + this->size != next->base || memblock_get_region_node(this) != - memblock_get_region_node(next)) { + memblock_get_region_node(next) || + this->flags != next->flags) { BUG_ON(this->base + this->size > next->base); i++; continue; } this->size += next->size; - memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); + /* move forward from next + 1, index of which is i + 2 */ + memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next)); type->cnt--; } } /** * memblock_insert_region - insert new memblock region - * @type: memblock type to insert into - * @idx: index for the insertion point - * @base: base address of the new region - * @size: size of the new region + * @type: memblock type to insert into + * @idx: index for the insertion point + * @base: base address of the new region + * @size: size of the new region + * @nid: node id of the new region + * @flags: flags of the new region * * Insert new memblock region [@base,@base+@size) into @type at @idx. * @type must already have extra room to accommodate the new region. */ static void __init_memblock memblock_insert_region(struct memblock_type *type, int idx, phys_addr_t base, - phys_addr_t size, int nid) + phys_addr_t size, + int nid, unsigned long flags) { struct memblock_region *rgn = &type->regions[idx]; @@ -302,6 +465,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); rgn->base = base; rgn->size = size; + rgn->flags = flags; memblock_set_region_node(rgn, nid); type->cnt++; type->total_size += size; @@ -313,6 +477,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * @base: base address of the new region * @size: size of the new region * @nid: nid of the new region + * @flags: flags of the new region * * Add new memblock region [@base,@base+@size) into @type.
The new region * is allowed to overlap with existing ones - overlaps don't affect already @@ -323,7 +488,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * 0 on success, -errno on failure. */ static int __init_memblock memblock_add_region(struct memblock_type *type, - phys_addr_t base, phys_addr_t size, int nid) + phys_addr_t base, phys_addr_t size, + int nid, unsigned long flags) { bool insert = false; phys_addr_t obase = base; @@ -338,6 +504,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type, WARN_ON(type->cnt != 1 || type->total_size); type->regions[0].base = base; type->regions[0].size = size; + type->regions[0].flags = flags; memblock_set_region_node(&type->regions[0], nid); type->total_size = size; return 0; @@ -368,7 +535,8 @@ repeat: nr_new++; if (insert) memblock_insert_region(type, i++, base, - rbase - base, nid); + rbase - base, nid, + flags); } /* area below @rend is dealt with, forget about it */ base = min(rend, end); @@ -378,7 +546,8 @@ repeat: if (base < end) { nr_new++; if (insert) - memblock_insert_region(type, i, base, end - base, nid); + memblock_insert_region(type, i, base, end - base, + nid, flags); } /* @@ -387,7 +556,7 @@ repeat: */ if (!insert) { while (type->cnt + nr_new > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, obase, size) < 0) return -ENOMEM; insert = true; goto repeat; @@ -400,12 +569,13 @@ repeat: int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { - return memblock_add_region(&memblock.memory, base, size, nid); + return memblock_add_region(&memblock.memory, base, size, nid, 0); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); + return memblock_add_region(&memblock.memory, base, size, + MAX_NUMNODES, 0); } /** @@ -438,7 +608,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, /* we'll create at most two more regions */ while (type->cnt + 2 > type->max) - if (memblock_double_array(type) < 0) + if (memblock_double_array(type, base, size) < 0) return -ENOMEM; for (i = 0; i < type->cnt; i++) { @@ -460,7 +630,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->size -= base - rbase; type->total_size -= base - rbase; memblock_insert_region(type, i, rbase, base - rbase, - memblock_get_region_node(rgn)); + memblock_get_region_node(rgn), + rgn->flags); } else if (rend > end) { /* * @rgn intersects from above. 
Split and redo the @@ -470,7 +641,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->size -= end - rbase; type->total_size -= end - rbase; memblock_insert_region(type, i--, rbase, end - rbase, - memblock_get_region_node(rgn)); + memblock_get_region_node(rgn), + rgn->flags); } else { /* @rgn is fully contained, record it */ if (!*end_rgn) @@ -506,31 +678,92 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", (unsigned long long)base, - (unsigned long long)base + size, + (unsigned long long)base + size - 1, (void *)_RET_IP_); return __memblock_remove(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +static int __init_memblock memblock_reserve_region(phys_addr_t base, + phys_addr_t size, + int nid, + unsigned long flags) { struct memblock_type *_rgn = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", + memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, - (unsigned long long)base + size, - (void *)_RET_IP_); + (unsigned long long)base + size - 1, + flags, (void *)_RET_IP_); - return memblock_add_region(_rgn, base, size, MAX_NUMNODES); + return memblock_add_region(_rgn, base, size, nid, flags); +} + +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + return memblock_reserve_region(base, size, MAX_NUMNODES, 0); +} + +/** + * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. + * @base: the base phys addr of the region + * @size: the size of the region + * + * This function isolates region [@base, @base + @size), and marks it with flag + * MEMBLOCK_HOTPLUG. + * + * Return 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG); + + memblock_merge_regions(type); + return 0; +} + +/** + * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * This function isolates region [@base, @base + @size), and clears flag + * MEMBLOCK_HOTPLUG for the isolated regions. + * + * Return 0 on success, -errno on failure.
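
Both helpers follow the same isolate-then-flag pattern: split regions at the requested boundaries, set or clear MEMBLOCK_HOTPLUG on the fully contained pieces, then re-merge neighbours with identical nid and flags. A plausible early-boot usage, sketched here for illustration only (values and call sites are hypothetical, not quoted from an actual caller):

    /* While parsing firmware tables, tag ranges the platform declares
     * hot-pluggable so top-down allocations will stay out of them. */
    memblock_mark_hotplug(hotplug_base, hotplug_size);

    /* If the movable_node policy is off, the tag serves no purpose;
     * drop it everywhere so the memory is usable like any other. */
    if (!movable_node_is_enabled())
        memblock_clear_hotplug(0, ULLONG_MAX);
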
+ */ +int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_clear_region_flags(&type->regions[i], + MEMBLOCK_HOTPLUG); + + memblock_merge_regions(type); + return 0; } /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL + * @nid: node selector, %NUMA_NO_NODE for all nodes + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL * * Find the first free area from *@idx which matches @nid, fill the out * parameters, and update *@idx for the next iteration. The lower 32bit of @@ -556,13 +789,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, int mi = *idx & 0xffffffff; int ri = *idx >> 32; + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + for ( ; mi < mem->cnt; mi++) { struct memblock_region *m = &mem->regions[mi]; phys_addr_t m_start = m->base; phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) continue; /* scan areas before each reservation for intersection */ @@ -603,12 +839,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, /** * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * @p_nid: ptr to int for nid of the range, can be %NULL + * @nid: node selector, %NUMA_NO_NODE for all nodes + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL * * Reverse of __next_free_mem_range(). + * + * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't + * be able to hot-remove hotpluggable memory used by the kernel. So this + * function skips hotpluggable regions if needed when allocating memory for the + * kernel. */ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, @@ -619,6 +860,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, int mi = *idx & 0xffffffff; int ri = *idx >> 32; + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated.
Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + if (*idx == (u64)ULLONG_MAX) { mi = mem->cnt - 1; ri = rsv->cnt; @@ -630,7 +874,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t m_end = m->base + m->size; /* only memory regions are associated with nodes, check it */ - if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) + continue; + + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; /* scan areas before each reservation for intersection */ @@ -700,18 +948,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for * @size: size of area to set node ID for + * @type: memblock type to set node ID for * @nid: node ID to set * - * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. + * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. * Regions which cross the area boundaries are split as necessary. * * RETURNS: * 0 on success, -errno on failure. */ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, - int nid) + struct memblock_type *type, int nid) { - struct memblock_type *type = &memblock.memory; int start_rgn, end_rgn; int i, ret; @@ -720,7 +968,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return ret; for (i = start_rgn; i < end_rgn; i++) - type->regions[i].nid = nid; + memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); return 0; @@ -733,10 +981,10 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, { phys_addr_t found; - /* align @size to avoid excessive fragmentation on reserved array */ - size = round_up(size, align); + if (!align) + align = SMP_CACHE_BYTES; - found = memblock_find_in_range_node(0, max_addr, size, align, nid); + found = memblock_find_in_range_node(size, align, 0, max_addr, nid); if (found && !memblock_reserve(found, size)) return found; @@ -750,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); + return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); } phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) @@ -780,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } +/** + * memblock_virt_alloc_internal - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region to allocate (phys address) + * @max_addr: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * The @min_addr limit is dropped if it can not be satisfied and the allocation + * will fall back to memory below @min_addr. Also, allocation may fall back + * to any node in the system if the specified node can not + * hold the requested memory. + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. 
+ * + * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. + * + * The phys address of allocated boot memory block is converted to virtual and + * allocated memory is reset to 0. + * + * In addition, function sets the min_count to 0 using kmemleak_alloc for + * allocated boot memory block, so that it is never reported as leaks. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +static void * __init memblock_virt_alloc_internal( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + phys_addr_t alloc; + void *ptr; + + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + + /* + * Detect any accidental use of these APIs after slab is ready, as at + * this moment memblock may be deinitialized already and its + * internal data may be destroyed (after execution of free_all_bootmem) + */ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, nid); + + if (!align) + align = SMP_CACHE_BYTES; + + if (max_addr > memblock.current_limit) + max_addr = memblock.current_limit; + +again: + alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, + nid); + if (alloc) + goto done; + + if (nid != NUMA_NO_NODE) { + alloc = memblock_find_in_range_node(size, align, min_addr, + max_addr, NUMA_NO_NODE); + if (alloc) + goto done; + } + + if (min_addr) { + min_addr = 0; + goto again; + } else { + goto error; + } + +done: + memblock_reserve(alloc, size); + ptr = phys_to_virt(alloc); + memset(ptr, 0, size); + + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. This is because many of these blocks + * are only referred via the physical address which is not + * looked up by kmemleak. + */ + kmemleak_alloc(ptr, size, 0, 0); + + return ptr; + +error: + return NULL; +} + +/** + * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides + * additional debug information (including caller info), if enabled. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. 
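 *
 * An illustrative call (hypothetical caller, not taken from the patch),
 * showing the nopanic variant's NULL-return contract; per the description
 * above, align == 0 means SMP_CACHE_BYTES and BOOTMEM_ALLOC_ACCESSIBLE
 * caps the search at memblock.current_limit:
 *
 *	table = memblock_virt_alloc_try_nid_nopanic(size, 0, 0,
 *					BOOTMEM_ALLOC_ACCESSIBLE, nid);
 *	if (!table)
 *		return -ENOMEM;	/- the _try_nid variant panics instead -/
 *
 * Internally, memblock_virt_alloc_internal() first tries @nid, then any
 * node, then drops @min_addr, before giving up and returning NULL.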
+ */ +void * __init memblock_virt_alloc_try_nid_nopanic( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + return memblock_virt_alloc_internal(size, align, min_addr, + max_addr, nid); +} + +/** + * memblock_virt_alloc_try_nid - allocate boot memory block with panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() + * which provides debug information (including caller info), if enabled, + * and panics if the request can not be satisfied. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); + if (ptr) + return ptr; + + panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr); + return NULL; +} + +/** + * __memblock_free_early - free boot memory block + * @base: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. + * The freed memory will not be released to the buddy allocator. + */ +void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) +{ + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + __memblock_remove(&memblock.reserved, base, size); +} + +/* + * __memblock_free_late - free bootmem block pages directly to buddy allocator + * @addr: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are released directly + * to the buddy allocator, no bootmem metadata is updated because it is gone.
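 *
 * Worked example of the rounding in the loop below (assuming 4 KiB pages):
 * for base = 0x1234 and size = 0x3000, cursor runs from PFN_UP(0x1234) = 2
 * to PFN_DOWN(0x4234) = 4, so only pfns 2 and 3 - the pages wholly inside
 * the range - are handed to the buddy allocator; the partial head and tail
 * pages are skipped.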
+ */ +void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) +{ + u64 cursor, end; + + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + cursor = PFN_UP(base); + end = PFN_DOWN(base + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} /* * Remaining API functions @@ -790,6 +1239,23 @@ phys_addr_t __init memblock_phys_mem_size(void) return memblock.memory.total_size; } +phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) +{ + unsigned long pages = 0; + struct memblock_region *r; + unsigned long start_pfn, end_pfn; + + for_each_memblock(memory, r) { + start_pfn = memblock_region_memory_base_pfn(r); + end_pfn = memblock_region_memory_end_pfn(r); + start_pfn = min_t(unsigned long, start_pfn, limit_pfn); + end_pfn = min_t(unsigned long, end_pfn, limit_pfn); + pages += end_pfn - start_pfn; + } + + return PFN_PHYS(pages); +} + /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { @@ -805,16 +1271,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - unsigned long i; phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + struct memblock_region *r; if (!limit) return; /* find out max address */ - for (i = 0; i < memblock.memory.cnt; i++) { - struct memblock_region *r = &memblock.memory.regions[i]; - + for_each_memblock(memory, r) { if (limit <= r->size) { max_addr = r->base + limit; break; @@ -855,6 +1319,34 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +int __init_memblock memblock_search_pfn_nid(unsigned long pfn, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + struct memblock_type *type = &memblock.memory; + int mid = memblock_search(type, PFN_PHYS(pfn)); + + if (mid == -1) + return -1; + + *start_pfn = type->regions[mid].base >> PAGE_SHIFT; + *end_pfn = (type->regions[mid].base + type->regions[mid].size) + >> PAGE_SHIFT; + + return type->regions[mid].nid; +} +#endif + +/** + * memblock_is_region_memory - check if a region is a subset of memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) is a subset of a memory block. + * + * RETURNS: + * 0 if false, non-zero if true + */ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { int idx = memblock_search(&memblock.memory, base); @@ -867,21 +1359,61 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size memblock.memory.regions[idx].size) >= end; } +/** + * memblock_is_region_reserved - check if a region intersects reserved memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) intersects a reserved memory block. 
+ * + * RETURNS: + * 0 if false, non-zero if true + */ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { memblock_cap_size(base, &size); return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; } +void __init_memblock memblock_trim_memory(phys_addr_t align) +{ + phys_addr_t start, end, orig_start, orig_end; + struct memblock_region *r; + + for_each_memblock(memory, r) { + orig_start = r->base; + orig_end = r->base + r->size; + start = round_up(orig_start, align); + end = round_down(orig_end, align); + + if (start == orig_start && end == orig_end) + continue; + + if (start < end) { + r->base = start; + r->size = end - start; + } else { + memblock_remove_region(&memblock.memory, + r - memblock.memory.regions); + r--; + } + } +} void __init_memblock memblock_set_current_limit(phys_addr_t limit) { memblock.current_limit = limit; } +phys_addr_t __init_memblock memblock_get_current_limit(void) +{ + return memblock.current_limit; +} + static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { unsigned long long base, size; + unsigned long flags; int i; pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); @@ -892,13 +1424,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name base = rgn->base; size = rgn->size; + flags = rgn->flags; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", - name, i, base, base + size - 1, size, nid_buf); + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", + name, i, base, base + size - 1, size, nid_buf, flags); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7685d4a0b3ce..5177c6d4a2dd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -10,6 +10,10 @@ * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. 
+ * Authors: Glauber Costa and Suleiman Souhlal + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -41,65 +45,81 @@ #include <linux/swapops.h> #include <linux/spinlock.h> #include <linux/eventfd.h> +#include <linux/poll.h> #include <linux/sort.h> #include <linux/fs.h> #include <linux/seq_file.h> -#include <linux/vmalloc.h> +#include <linux/vmpressure.h> #include <linux/mm_inline.h> #include <linux/page_cgroup.h> #include <linux/cpu.h> #include <linux/oom.h> +#include <linux/lockdep.h> +#include <linux/file.h> #include "internal.h" #include <net/sock.h> +#include <net/ip.h> #include <net/tcp_memcontrol.h> +#include "slab.h" #include <asm/uaccess.h> #include <trace/events/vmscan.h> -struct cgroup_subsys mem_cgroup_subsys __read_mostly; +struct cgroup_subsys memory_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(memory_cgrp_subsys); + #define MEM_CGROUP_RECLAIM_RETRIES 5 -struct mem_cgroup *root_mem_cgroup __read_mostly; +static struct mem_cgroup *root_mem_cgroup __read_mostly; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; /* for remember boot option*/ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED +#ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; #else static int really_do_swap_account __initdata = 0; #endif #else -#define do_swap_account (0) +#define do_swap_account 0 #endif -/* - * Statistics for memory cgroup. - */ -enum mem_cgroup_stat_index { - /* - * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. - */ - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ - MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ - MEM_CGROUP_STAT_NSTATS, +static const char * const mem_cgroup_stat_names[] = { + "cache", + "rss", + "rss_huge", + "mapped_file", + "writeback", + "swap", }; enum mem_cgroup_events_index { MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ - MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ MEM_CGROUP_EVENTS_NSTATS, }; + +static const char * const mem_cgroup_events_names[] = { + "pgpgin", + "pgpgout", + "pgfault", + "pgmajfault", +}; + +static const char * const mem_cgroup_lru_names[] = { + "inactive_anon", + "active_anon", + "inactive_file", + "active_file", + "unevictable", +}; + /* * Per memcg event counter is incremented at every pagein/pageout. With THP, * it will be incremented by the number of pages.
This counter is used for @@ -112,19 +132,25 @@ enum mem_cgroup_events_target { MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; -#define THRESHOLDS_EVENTS_TARGET (128) -#define SOFTLIMIT_EVENTS_TARGET (1024) -#define NUMAINFO_EVENTS_TARGET (1024) +#define THRESHOLDS_EVENTS_TARGET 128 +#define SOFTLIMIT_EVENTS_TARGET 1024 +#define NUMAINFO_EVENTS_TARGET 1024 struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; struct mem_cgroup_reclaim_iter { - /* css_id of the last scanned hierarchy member */ - int position; + /* + * last scanned hierarchy member. Valid only if last_dead_count + * matches memcg->dead_count of the hierarchy root group. + */ + struct mem_cgroup *last_visited; + int last_dead_count; + /* scan generation, increased every round-trip */ unsigned int generation; }; @@ -138,7 +164,6 @@ struct mem_cgroup_per_zone { struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; - struct zone_reclaim_stat reclaim_stat; struct rb_node tree_node; /* RB tree node */ unsigned long long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ @@ -151,10 +176,6 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; -struct mem_cgroup_lru_info { - struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; -}; - /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -182,7 +203,7 @@ struct mem_cgroup_threshold { /* For threshold */ struct mem_cgroup_threshold_ary { - /* An array index points to threshold just below usage. */ + /* An array index points to threshold just below or equal to usage. */ int current_threshold; /* Size of entries[] */ unsigned int size; @@ -207,6 +228,46 @@ struct mem_cgroup_eventfd_list { struct eventfd_ctx *eventfd; }; +/* + * cgroup_event represents events which userspace wants to receive. + */ +struct mem_cgroup_event { + /* + * memcg which the event belongs to. + */ + struct mem_cgroup *memcg; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these is stored in a list by the cgroup. + */ + struct list_head list; + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to this event. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args); + /* + * unregister_event() callback will be called when userspace closes + * the eventfd or on cgroup removing. This callback must be set + * if you want to provide notification functionality. + */ + void (*unregister_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd); + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; + static void mem_cgroup_threshold(struct mem_cgroup *memcg); static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); @@ -228,49 +289,27 @@ struct mem_cgroup { */ struct res_counter res; - union { - /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; + /* vmpressure notifications */ + struct vmpressure vmpressure; - /* - * rcu_freeing is used only when freeing struct mem_cgroup, - * so put it into a union to avoid wasting more memory. - * It must be disjoint from the css field. It could be
It could be - * in a union with the res field, but res plays a much - * larger part in mem_cgroup life than memsw, and might - * be of interest, even at time of free, when debugging. - * So share rcu_head with the less interesting memsw. - */ - struct rcu_head rcu_freeing; - /* - * But when using vfree(), that cannot be done at - * interrupt time, so we must then queue the work. - */ - struct work_struct work_freeing; - }; + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; /* - * Per cgroup active and inactive list, similar to the - * per zone LRU lists. + * the counter to account for kernel memory usage. */ - struct mem_cgroup_lru_info info; - int last_scanned_node; -#if MAX_NUMNODES > 1 - nodemask_t scan_nodes; - atomic_t numainfo_events; - atomic_t numainfo_updating; -#endif + struct res_counter kmem; /* * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; + unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ bool oom_lock; atomic_t under_oom; - - atomic_t refcnt; + atomic_t oom_wakeups; int swappiness; /* OOM-Killer disable */ @@ -295,7 +334,7 @@ struct mem_cgroup { * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? */ - unsigned long move_charge_at_immigrate; + unsigned long move_charge_at_immigrate; /* * set > 0 if pages under this cgroup are moving to other cgroup. */ @@ -305,7 +344,7 @@ struct mem_cgroup { /* * percpu counter. */ - struct mem_cgroup_stat_cpu *stat; + struct mem_cgroup_stat_cpu __percpu *stat; /* * used when a cpu is offlined or other synchronizations * See mem_cgroup_read_stat(). @@ -313,15 +352,73 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; -#ifdef CONFIG_INET - struct tcp_memcontrol tcp_mem; + atomic_t dead_count; +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) + struct cg_proto tcp_mem; +#endif +#if defined(CONFIG_MEMCG_KMEM) + /* analogous to slab_common's slab_caches list. per-memcg */ + struct list_head memcg_slab_caches; + /* Not a spinlock, we can take a lot of time walking the list */ + struct mutex slab_caches_mutex; + /* Index in the kmem_cache->memcg_params->memcg_caches array */ + int kmemcg_id; +#endif + + int last_scanned_node; +#if MAX_NUMNODES > 1 + nodemask_t scan_nodes; + atomic_t numainfo_events; + atomic_t numainfo_updating; #endif + + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; + + struct mem_cgroup_per_node *nodeinfo[0]; + /* WARNING: nodeinfo must be the last member here */ +}; + +/* internal only representation about the status of kmem accounting. */ +enum { + KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ + KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ }; +#ifdef CONFIG_MEMCG_KMEM +static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) +{ + set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); +} + +static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) +{ + /* + * Our caller must use css_get() first, because memcg_uncharge_kmem() + * will call css_put() if it sees the memcg is dead. 
+ */ + smp_wmb(); + if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) + set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); +} + +static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) +{ + return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, + &memcg->kmem_account_flags); +} +#endif + /* Stuffs for move charges at task migration. */ /* - * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a - * left-shifted bitmap of these types. + * Types of charges to be moved. "move_charge_at_immigrate" and + * "immigrate_flags" are treated as a left-shifted bitmap of these types. */ enum move_type { MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ @@ -334,6 +431,7 @@ static struct move_charge_struct { spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; + unsigned long immigrate_flags; unsigned long precharge; unsigned long moved_charge; unsigned long moved_swap; @@ -346,39 +444,39 @@ static struct move_charge_struct { static bool move_anon(void) { - return test_bit(MOVE_CHARGE_TYPE_ANON, - &mc.to->move_charge_at_immigrate); + return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); } static bool move_file(void) { - return test_bit(MOVE_CHARGE_TYPE_FILE, - &mc.to->move_charge_at_immigrate); + return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); } /* * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft * limit reclaim to prevent infinite loops, if they ever occur. */ -#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) -#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) +#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, - MEM_CGROUP_CHARGE_TYPE_MAPPED, - MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ - MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ + MEM_CGROUP_CHARGE_TYPE_ANON, MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ NR_CHARGE_TYPE, }; /* for encoding cft->private value on file */ -#define _MEM (0) -#define _MEMSWAP (1) -#define _OOM_TYPE (2) -#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) -#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) +enum res_type { + _MEM, + _MEMSWAP, + _OOM_TYPE, + _KMEM, +}; + +#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) +#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) /* Used for OOM notifier */ #define OOM_CONTROL (0) @@ -391,19 +489,67 @@ enum charge_type { #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) -static void mem_cgroup_get(struct mem_cgroup *memcg); -static void mem_cgroup_put(struct mem_cgroup *memcg); +/* + * The memcg_create_mutex will be held whenever a new cgroup is created. + * As a consequence, any change that needs to protect against new child cgroups + * appearing has to hold it as well. + */ +static DEFINE_MUTEX(memcg_create_mutex); + +struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) +{ + return s ? container_of(s, struct mem_cgroup, css) : NULL; +} + +/* Some nice accessors for the vmpressure.
*/ +struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) +{ + if (!memcg) + memcg = root_mem_cgroup; + return &memcg->vmpressure; +} + +struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) +{ + return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; +} + +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +{ + return (memcg == root_mem_cgroup); +} + +/* + * We restrict the id in the range of [1, 65535], so it can fit into + * an unsigned short. + */ +#define MEM_CGROUP_ID_MAX USHRT_MAX + +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +{ + /* + * The ID of the root cgroup is 0, but memcg treat 0 as an + * invalid ID, so we return (cgroup_id + 1). + */ + return memcg->css.cgroup->id + 1; +} + +static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + struct cgroup_subsys_state *css; + + css = css_from_id(id - 1, &memory_cgrp_subsys); + return mem_cgroup_from_css(css); +} /* Writing them here to avoid exposing memcg's inner layout */ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM -#include <net/sock.h> -#include <net/ip.h> +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) -static bool mem_cgroup_is_root(struct mem_cgroup *memcg); void sock_update_memcg(struct sock *sk) { if (mem_cgroup_sockets_enabled) { struct mem_cgroup *memcg; + struct cg_proto *cg_proto; BUG_ON(!sk->sk_prot->proto_cgroup); @@ -417,15 +563,16 @@ void sock_update_memcg(struct sock *sk) */ if (sk->sk_cgrp) { BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - mem_cgroup_get(sk->sk_cgrp->memcg); + css_get(&sk->sk_cgrp->memcg->css); return; } rcu_read_lock(); memcg = mem_cgroup_from_task(current); - if (!mem_cgroup_is_root(memcg)) { - mem_cgroup_get(memcg); - sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); + cg_proto = sk->sk_prot->proto_cgroup(memcg); + if (!mem_cgroup_is_root(memcg) && + memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { + sk->sk_cgrp = cg_proto; } rcu_read_unlock(); } @@ -438,28 +585,102 @@ void sock_release_memcg(struct sock *sk) struct mem_cgroup *memcg; WARN_ON(!sk->sk_cgrp->memcg); memcg = sk->sk_cgrp->memcg; - mem_cgroup_put(memcg); + css_put(&sk->sk_cgrp->memcg->css); } } -#ifdef CONFIG_INET struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) { if (!memcg || mem_cgroup_is_root(memcg)) return NULL; - return &memcg->tcp_mem.cg_proto; + return &memcg->tcp_mem; } EXPORT_SYMBOL(tcp_proto_cgroup); -#endif /* CONFIG_INET */ -#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ + +static void disarm_sock_keys(struct mem_cgroup *memcg) +{ + if (!memcg_proto_activated(&memcg->tcp_mem)) + return; + static_key_slow_dec(&memcg_socket_limit_enabled); +} +#else +static void disarm_sock_keys(struct mem_cgroup *memcg) +{ +} +#endif + +#ifdef CONFIG_MEMCG_KMEM +/* + * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * The main reason for not using cgroup id for this: + * this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. + * + * The current size of the caches array is stored in + * memcg_limited_groups_array_size. It will double each time we have to + * increase it. + */ +static DEFINE_IDA(kmem_limited_groups); +int memcg_limited_groups_array_size; + +/* + * MIN_SIZE is different than 1, because we would like to avoid going through + * the alloc/free process all the time. 
In a small machine, 4 kmem-limited + * cgroups is a reasonable guess. In the future, it could be a parameter or + * tunable, but that is strictly not necessary. + * + * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get + * this constant directly from cgroup, but it is understandable that this is + * better kept as an internal representation in cgroup.c. In any case, the + * cgrp_id space is not getting any smaller, and we don't have to necessarily + * increase ours as well if it increases. + */ +#define MEMCG_CACHES_MIN_SIZE 4 +#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX + +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler. Since the calls to memcg_kmem_get_cache are + * conditional to this static branch, we'll have to allow modules that do + * kmem_cache_alloc and the like to see this symbol as well + */ +struct static_key memcg_kmem_enabled_key; +EXPORT_SYMBOL(memcg_kmem_enabled_key); + +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ + if (memcg_kmem_is_active(memcg)) { + static_key_slow_dec(&memcg_kmem_enabled_key); + ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); + } + /* + * This check can't live in kmem destruction function, + * since the charges will outlive the cgroup + */ + WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); +} +#else +static void disarm_kmem_keys(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static void disarm_static_keys(struct mem_cgroup *memcg) +{ + disarm_sock_keys(memcg); + disarm_kmem_keys(memcg); } static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) { - return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; + VM_BUG_ON((unsigned)nid >= nr_node_ids); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) @@ -675,7 +896,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) { int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); } static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, @@ -684,6 +905,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, unsigned long val = 0; int cpu; + get_online_cpus(); for_each_online_cpu(cpu) val += per_cpu(memcg->stat->events[idx], cpu); #ifdef CONFIG_HOTPLUG_CPU @@ -691,14 +913,14 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, val += memcg->nocpu_base.events[idx]; spin_unlock(&memcg->pcp_counter_lock); #endif + put_online_cpus(); return val; } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, + struct page *page, bool anon, int nr_pages) { - preempt_disable(); - /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. */ if (anon) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); + if (PageTransHuge(page)) + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + nr_pages); + /* pagein of a big page is an event.
So, ignore page size */ if (nr_pages > 0) __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); @@ -718,12 +944,19 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); - - preempt_enable(); + __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } unsigned long +mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +{ + struct mem_cgroup_per_zone *mz; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + return mz->lru_size[lru]; +} + +static unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, unsigned int lru_mask) { @@ -760,7 +993,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, int nid; u64 total = 0; - for_each_node_state(nid, N_HIGH_MEMORY) + for_each_node_state(nid, N_MEMORY) total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); return total; } @@ -770,7 +1003,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, { unsigned long val, next; - val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); + val = __this_cpu_read(memcg->stat->nr_page_events); next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ if ((long)next - (long)val < 0) { @@ -825,13 +1058,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) -{ - return container_of(cgroup_subsys_state(cont, - mem_cgroup_subsys_id), struct mem_cgroup, - css); -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @@ -842,31 +1068,136 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return container_of(task_subsys_state(p, mem_cgroup_subsys_id), - struct mem_cgroup, css); + return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } -struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) +static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; - if (!mm) - return NULL; - /* - * Because we have no locks, mm->owner's may be being moved to other - * cgroup. We use css_tryget() here even if this looks - * pessimistic (rather than adding locks here). - */ rcu_read_lock(); do { + /* + * Page cache insertions can happen without an + * actual mm context, e.g. during disk probing + * on boot, loopback IO, acct() writes etc. + */ + if (unlikely(!mm)) + memcg = root_mem_cgroup; + else { + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) + memcg = root_mem_cgroup; + } } while (!css_tryget(&memcg->css)); rcu_read_unlock(); return memcg; } +/* + * Returns a next (in a pre-order walk) alive memcg (with elevated css + * ref. count) or NULL if the whole root's subtree has been visited. + * + * helper function to be used by mem_cgroup_iter + */ +static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, + struct mem_cgroup *last_visited) +{ + struct cgroup_subsys_state *prev_css, *next_css; + + prev_css = last_visited ? &last_visited->css : NULL; +skip_node: + next_css = css_next_descendant_pre(prev_css, &root->css); + + /* + * Even if we found a group we have to make sure it is + * alive. css && !memcg means that the groups should be + * skipped and we should continue the tree walk.
+ * last_visited css is safe to use because it is + * protected by css_get and the tree walk is rcu safe. + * + * We do not take a reference on the root of the tree walk + * because we might race with the root removal when it would + * be the only node in the iterated hierarchy and mem_cgroup_iter + * would end up in an endless loop because it expects that at + * least one valid node will be returned. Root cannot disappear + * because caller of the iterator should hold it already so + * skipping css reference should be safe. + */ + if (next_css) { + if ((next_css == &root->css) || + ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) + return mem_cgroup_from_css(next_css); + + prev_css = next_css; + goto skip_node; + } + + return NULL; +} + +static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) +{ + /* + * When a group in the hierarchy below root is destroyed, the + * hierarchy iterator can no longer be trusted since it might + * have pointed to the destroyed group. Invalidate it. + */ + atomic_inc(&root->dead_count); +} + +static struct mem_cgroup * +mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *root, + int *sequence) +{ + struct mem_cgroup *position = NULL; + /* + * A cgroup destruction happens in two stages: offlining and + * release. They are separated by a RCU grace period. + * + * If the iterator is valid, we may still race with an + * offlining. The RCU lock ensures the object won't be + * released, tryget will fail if we lost the race. + */ + *sequence = atomic_read(&root->dead_count); + if (iter->last_dead_count == *sequence) { + smp_rmb(); + position = iter->last_visited; + + /* + * We cannot take a reference to root because we might race + * with root removal and returning NULL would end up in + * an endless loop on the iterator user level when root + * would be returned all the time. + */ + if (position && position != root && + !css_tryget(&position->css)) + position = NULL; + } + return position; +} + +static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *last_visited, + struct mem_cgroup *new_position, + struct mem_cgroup *root, + int sequence) +{ + /* root reference counting symmetric to mem_cgroup_iter_load */ + if (last_visited && last_visited != root) + css_put(&last_visited->css); + /* + * We store the sequence count from the time @last_visited was + * loaded successfully instead of rereading it here so that we + * don't lose destruction events in between. We could have + * raced with the destruction of @new_position after all. 
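+ *
+ * Rough sketch of the pairing with mem_cgroup_iter_load(): the update
+ * side stores last_visited, then smp_wmb(), then last_dead_count; the
+ * load side reads last_dead_count, then smp_rmb(), then last_visited.
+ * A reader that observes the new last_dead_count therefore also
+ * observes the matching last_visited pointer.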
+ */ + iter->last_visited = new_position; + smp_wmb(); + iter->last_dead_count = sequence; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -889,7 +1220,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup *memcg = NULL; - int id = 0; + struct mem_cgroup *last_visited = NULL; if (mem_cgroup_disabled()) return NULL; @@ -898,20 +1229,18 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; if (prev && !reclaim) - id = css_id(&prev->css); - - if (prev && prev != root) - css_put(&prev->css); + last_visited = prev; if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) - return NULL; + goto out_css_put; return root; } + rcu_read_lock(); while (!memcg) { struct mem_cgroup_reclaim_iter *uninitialized_var(iter); - struct cgroup_subsys_state *css; + int uninitialized_var(seq); if (reclaim) { int nid = zone_to_nid(reclaim->zone); @@ -920,32 +1249,35 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = &mz->reclaim_iter[reclaim->priority]; - if (prev && reclaim->generation != iter->generation) - return NULL; - id = iter->position; + if (prev && reclaim->generation != iter->generation) { + iter->last_visited = NULL; + goto out_unlock; + } + + last_visited = mem_cgroup_iter_load(iter, root, &seq); } - rcu_read_lock(); - css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); - if (css) { - if (css == &root->css || css_tryget(css)) - memcg = container_of(css, - struct mem_cgroup, css); - } else - id = 0; - rcu_read_unlock(); + memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { - iter->position = id; - if (!css) + mem_cgroup_iter_update(iter, last_visited, memcg, root, + seq); + + if (!memcg) iter->generation++; else if (!prev && memcg) reclaim->generation = iter->generation; } - if (prev && !css) - return NULL; + if (prev && !memcg) + goto out_unlock; } +out_unlock: + rcu_read_unlock(); +out_css_put: + if (prev && prev != root) + css_put(&prev->css); + return memcg; } @@ -978,18 +1310,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) -{ - return (memcg == root_mem_cgroup); -} - -void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; - if (!mm) - return; - rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!memcg)) @@ -1008,12 +1332,12 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) out: rcu_read_unlock(); } -EXPORT_SYMBOL(mem_cgroup_count_vm_event); +EXPORT_SYMBOL(__mem_cgroup_count_vm_event); /** * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg * @zone: zone of the wanted lruvec - * @mem: memcg of the wanted lruvec + * @memcg: memcg of the wanted lruvec * * Returns the lru list vector holding pages for the given @zone and * @mem. 
This can be the global zone lruvec, if the memory controller @@ -1023,12 +1347,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, struct mem_cgroup *memcg) { struct mem_cgroup_per_zone *mz; + struct lruvec *lruvec; - if (mem_cgroup_disabled()) - return &zone->lruvec; + if (mem_cgroup_disabled()) { + lruvec = &zone->lruvec; + goto out; + } mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); - return &mz->lruvec; + lruvec = &mz->lruvec; +out: + /* + * Since a node can be onlined after the mem_cgroup was created, + * we have to be prepared to initialize lruvec->zone here; + * and if offlined then reonlined, we need to reinitialize it. + */ + if (unlikely(lruvec->zone != zone)) + lruvec->zone = zone; + return lruvec; } /* @@ -1046,32 +1382,27 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, */ /** - * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec - * @zone: zone of the page + * mem_cgroup_page_lruvec - return lruvec for adding an lru page * @page: the page - * @lru: current lru - * - * This function accounts for @page being added to @lru, and returns - * the lruvec for the given @zone and the memcg @page is charged to. - * - * The callsite is then responsible for physically linking the page to - * the returned lruvec->lists[@lru]. + * @zone: zone of the page */ -struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, - enum lru_list lru) +struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) { struct mem_cgroup_per_zone *mz; struct mem_cgroup *memcg; struct page_cgroup *pc; + struct lruvec *lruvec; - if (mem_cgroup_disabled()) - return &zone->lruvec; + if (mem_cgroup_disabled()) { + lruvec = &zone->lruvec; + goto out; + } pc = lookup_page_cgroup(page); memcg = pc->mem_cgroup; /* - * Surreptitiously switch any uncharged page to root: + * Surreptitiously switch any uncharged offlist page to root: * an uncharged page off lru does nothing to secure * its former mem_cgroup from sudden removal. * @@ -1079,96 +1410,81 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, * under page_cgroup lock: between them, they make all uses * of pc->mem_cgroup safe. */ - if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) + if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) pc->mem_cgroup = memcg = root_mem_cgroup; mz = page_cgroup_zoneinfo(memcg, page); - /* compound_order() is stabilized through lru_lock */ - mz->lru_size[lru] += 1 << compound_order(page); - return &mz->lruvec; + lruvec = &mz->lruvec; +out: + /* + * Since a node can be onlined after the mem_cgroup was created, + * we have to be prepared to initialize lruvec->zone here; + * and if offlined then reonlined, we need to reinitialize it. + */ + if (unlikely(lruvec->zone != zone)) + lruvec->zone = zone; + return lruvec; } /** - * mem_cgroup_lru_del_list - account for removing an lru page - * @page: the page - * @lru: target lru - * - * This function accounts for @page being removed from @lru. + * mem_cgroup_update_lru_size - account for adding or removing an lru page + * @lruvec: mem_cgroup per zone lru vector + * @lru: index of lru list the page is sitting on + * @nr_pages: positive when adding or negative when removing * - * The callsite is then responsible for physically unlinking - * @page->lru. + * This function must be called when a page is added to or removed from an + * lru list. 
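+ *
+ * For example (sketch), adding a transparent huge page to an lru is
+ * accounted with
+ *	mem_cgroup_update_lru_size(lruvec, lru, HPAGE_PMD_NR);
+ * and removing it with
+ *	mem_cgroup_update_lru_size(lruvec, lru, -HPAGE_PMD_NR);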
*/ -void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) +void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, + int nr_pages) { struct mem_cgroup_per_zone *mz; - struct mem_cgroup *memcg; - struct page_cgroup *pc; + unsigned long *lru_size; if (mem_cgroup_disabled()) return; - pc = lookup_page_cgroup(page); - memcg = pc->mem_cgroup; - VM_BUG_ON(!memcg); - mz = page_cgroup_zoneinfo(memcg, page); - /* huge page split is done under lru_lock. so, we have no races. */ - VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); - mz->lru_size[lru] -= 1 << compound_order(page); -} - -void mem_cgroup_lru_del(struct page *page) -{ - mem_cgroup_lru_del_list(page, page_lru(page)); -} - -/** - * mem_cgroup_lru_move_lists - account for moving a page between lrus - * @zone: zone of the page - * @page: the page - * @from: current lru - * @to: target lru - * - * This function accounts for @page being moved between the lrus @from - * and @to, and returns the lruvec for the given @zone and the memcg - * @page is charged to. - * - * The callsite is then responsible for physically relinking - * @page->lru to the returned lruvec->lists[@to]. - */ -struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, - struct page *page, - enum lru_list from, - enum lru_list to) -{ - /* XXX: Optimize this, especially for @from == @to */ - mem_cgroup_lru_del_list(page, from); - return mem_cgroup_lru_add_list(zone, page, to); + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + lru_size = mz->lru_size + lru; + *lru_size += nr_pages; + VM_BUG_ON((long)(*lru_size) < 0); } /* * Checks whether given mem is same or in the root_mem_cgroup's * hierarchy subtree */ +bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, + struct mem_cgroup *memcg) +{ + if (root_memcg == memcg) + return true; + if (!root_memcg->use_hierarchy || !memcg) + return false; + return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); +} + static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg) { - if (root_memcg != memcg) { - return (root_memcg->use_hierarchy && - css_is_ancestor(&memcg->css, &root_memcg->css)); - } + bool ret; - return true; + rcu_read_lock(); + ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); + rcu_read_unlock(); + return ret; } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) { - int ret; struct mem_cgroup *curr = NULL; struct task_struct *p; + bool ret; p = find_lock_task_mm(task); if (p) { - curr = try_get_mem_cgroup_from_mm(p->mm); + curr = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { /* @@ -1176,14 +1492,12 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) * killer still needs to detect if they have already been oom * killed to prevent needlessly killing additional tasks. */ - task_lock(task); + rcu_read_lock(); curr = mem_cgroup_from_task(task); if (curr) css_get(&curr->css); - task_unlock(task); + rcu_read_unlock(); } - if (!curr) - return 0; /* * We should check use_hierarchy of "memcg" not "curr". 
Because checking * use_hierarchy of "curr" here make this function true if hierarchy is @@ -1195,19 +1509,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) return ret; } -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) +int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { unsigned long inactive_ratio; - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); unsigned long inactive; unsigned long active; unsigned long gb; - inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, - BIT(LRU_INACTIVE_ANON)); - active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, - BIT(LRU_ACTIVE_ANON)); + inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); + active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -1218,55 +1528,12 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) return inactive * inactive_ratio < active; } -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) -{ - unsigned long active; - unsigned long inactive; - int zid = zone_idx(zone); - int nid = zone_to_nid(zone); - - inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, - BIT(LRU_INACTIVE_FILE)); - active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, - BIT(LRU_ACTIVE_FILE)); - - return (active > inactive); -} - -struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, - struct zone *zone) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - return &mz->reclaim_stat; -} - -struct zone_reclaim_stat * -mem_cgroup_get_reclaim_stat_from_page(struct page *page) -{ - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - - if (mem_cgroup_disabled()) - return NULL; - - pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) - return NULL; - /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ - smp_rmb(); - mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); - return &mz->reclaim_stat; -} - #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup - * @mem: the memory cgroup + * @memcg: the memory cgroup * * Returns the maximum amount of memory @mem can be charged with, in * pages. @@ -1283,10 +1550,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { - struct cgroup *cgrp = memcg->css.cgroup; - /* root ? */ - if (cgrp->parent == NULL) + if (!css_parent(&memcg->css)) return vm_swappiness; return memcg->swappiness; @@ -1403,8 +1668,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, spin_unlock_irqrestore(&memcg->move_lock, *flags); } +#define K(x) ((x) << (PAGE_SHIFT-10)) /** - * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. + * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. * @memcg: The memory cgroup that went over limit * @p: Task that is going to be killed * @@ -1413,60 +1679,57 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - struct cgroup *task_cgrp; - struct cgroup *mem_cgrp; - /* - * Need a buffer in BSS, can't rely on allocations. The code relies - * on the assumption that OOM is serialized for memory controller. 
- * If this assumption is broken, revisit this code. - */ - static char memcg_name[PATH_MAX]; - int ret; + /* oom_info_lock ensures that parallel ooms do not interleave */ + static DEFINE_MUTEX(oom_info_lock); + struct mem_cgroup *iter; + unsigned int i; - if (!memcg || !p) + if (!p) return; + mutex_lock(&oom_info_lock); rcu_read_lock(); - mem_cgrp = memcg->css.cgroup; - task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_info(" killed as a result of limit of "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_info("\n"); - ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - /* - * Unfortunately, we are unable to convert to a useful name - * But we'll still print out the usage information - */ - rcu_read_unlock(); - goto done; - } rcu_read_unlock(); - printk(KERN_INFO "Task in %s killed", memcg_name); - - rcu_read_lock(); - ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - rcu_read_unlock(); - goto done; - } - rcu_read_unlock(); - - /* - * Continues from above, so we don't need an KERN_ level - */ - printk(KERN_CONT " as a result of limit of %s\n", memcg_name); -done: - - printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", + pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->res, RES_FAILCNT)); - printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " - "failcnt %llu\n", + pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); + pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); + + for_each_mem_cgroup_tree(iter, memcg) { + pr_info("Memory cgroup stats for "); + pr_cont_cgroup_path(iter->css.cgroup); + pr_cont(":"); + + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + continue; + pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], + K(mem_cgroup_read_stat(iter, i))); + } + + for (i = 0; i < NR_LRU_LISTS; i++) + pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], + K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); + + pr_cont("\n"); + } + mutex_unlock(&oom_info_lock); } /* @@ -1486,20 +1749,100 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) /* * Return the memory (and swap, if configured) limit for a memcg. */ -u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) { u64 limit; - u64 memsw; limit = res_counter_read_u64(&memcg->res, RES_LIMIT); - limit += total_swap_pages << PAGE_SHIFT; - memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* - * If memsw is finite and limits the amount of swap space available - * to this memcg, return that limit. + * Do not consider swap space if we cannot swap due to swappiness */ - return min(limit, memsw); + if (mem_cgroup_swappiness(memcg)) { + u64 memsw; + + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + + /* + * If memsw is finite and limits the amount of swap space + * available to this memcg, return that limit. 
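+ *
+ * E.g. (sketch, with nonzero swappiness): limit = 1G, total swap =
+ * 512M and memsw limit = 1.25G gives min(1G + 512M, 1.25G) = 1.25G.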
+ */ + limit = min(limit, memsw); + } + + return limit; +} + +static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order) +{ + struct mem_cgroup *iter; + unsigned long chosen_points = 0; + unsigned long totalpages; + unsigned int points = 0; + struct task_struct *chosen = NULL; + + /* + * If current has a pending SIGKILL or is exiting, then automatically + * select it. The goal is to allow it to allocate so that it may + * quickly exit and free its memory. + */ + if (fatal_signal_pending(current) || current->flags & PF_EXITING) { + set_thread_flag(TIF_MEMDIE); + return; + } + + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); + totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; + for_each_mem_cgroup_tree(iter, memcg) { + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&iter->css, &it); + while ((task = css_task_iter_next(&it))) { + switch (oom_scan_process_thread(task, totalpages, NULL, + false)) { + case OOM_SCAN_SELECT: + if (chosen) + put_task_struct(chosen); + chosen = task; + chosen_points = ULONG_MAX; + get_task_struct(chosen); + /* fall through */ + case OOM_SCAN_CONTINUE: + continue; + case OOM_SCAN_ABORT: + css_task_iter_end(&it); + mem_cgroup_iter_break(memcg, iter); + if (chosen) + put_task_struct(chosen); + return; + case OOM_SCAN_OK: + break; + }; + points = oom_badness(task, memcg, NULL, totalpages); + if (!points || points < chosen_points) + continue; + /* Prefer thread group leaders for display purposes */ + if (points == chosen_points && + thread_group_leader(chosen)) + continue; + + if (chosen) + put_task_struct(chosen); + chosen = task; + chosen_points = points; + get_task_struct(chosen); + } + css_task_iter_end(&it); + } + + if (!chosen) + return; + points = chosen_points * 1000 / totalpages; + oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, + NULL, "Memory cgroup out of memory"); } static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, @@ -1540,7 +1883,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, /** * test_mem_cgroup_node_reclaimable - * @mem: the target memcg + * @memcg: the target memcg * @nid: the node ID to be checked. * @noswap : specify true here if the user wants flle only information. * @@ -1581,9 +1924,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) return; /* make a nodemask where this memcg uses memory from */ - memcg->scan_nodes = node_states[N_HIGH_MEMORY]; + memcg->scan_nodes = node_states[N_MEMORY]; - for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { + for_each_node_mask(nid, node_states[N_MEMORY]) { if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) node_clear(nid, memcg->scan_nodes); @@ -1634,7 +1977,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) * unused nodes. But scan_nodes is lazily updated and may not cotain * enough new information. We need to do double check. */ -bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) +static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) { int nid; @@ -1654,7 +1997,7 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) /* * Check rest of nodes. 
*/ - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { if (node_isset(nid, memcg->scan_nodes)) continue; if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) @@ -1669,7 +2012,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) return 0; } -bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) +static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) { return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); } @@ -1728,15 +2071,24 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, return total; } +#ifdef CONFIG_LOCKDEP +static struct lockdep_map memcg_oom_lock_dep_map = { + .name = "memcg_oom_lock", +}; +#endif + +static DEFINE_SPINLOCK(memcg_oom_lock); + /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. - * Has to be called with memcg_oom_lock */ -static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) +static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) { struct mem_cgroup *iter, *failed = NULL; + spin_lock(&memcg_oom_lock); + for_each_mem_cgroup_tree(iter, memcg) { if (iter->oom_lock) { /* @@ -1750,33 +2102,35 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) iter->oom_lock = true; } - if (!failed) - return true; - - /* - * OK, we failed to lock the whole subtree so we have to clean up - * what we set up to the failing subtree - */ - for_each_mem_cgroup_tree(iter, memcg) { - if (iter == failed) { - mem_cgroup_iter_break(memcg, iter); - break; + if (failed) { + /* + * OK, we failed to lock the whole subtree so we have + * to clean up what we set up to the failing subtree + */ + for_each_mem_cgroup_tree(iter, memcg) { + if (iter == failed) { + mem_cgroup_iter_break(memcg, iter); + break; + } + iter->oom_lock = false; } - iter->oom_lock = false; - } - return false; + } else + mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); + + spin_unlock(&memcg_oom_lock); + + return !failed; } -/* - * Has to be called with memcg_oom_lock - */ -static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) +static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) { struct mem_cgroup *iter; + spin_lock(&memcg_oom_lock); + mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); for_each_mem_cgroup_tree(iter, memcg) iter->oom_lock = false; - return 0; + spin_unlock(&memcg_oom_lock); } static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) @@ -1800,7 +2154,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) atomic_add_unless(&iter->under_oom, -1, 0); } -static DEFINE_SPINLOCK(memcg_oom_lock); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { @@ -1830,6 +2183,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait, static void memcg_wakeup_oom(struct mem_cgroup *memcg) { + atomic_inc(&memcg->oom_wakeups); /* for filtering, pass "memcg" as argument. */ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } @@ -1840,56 +2194,97 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) memcg_wakeup_oom(memcg); } -/* - * try to call OOM killer. returns false if we should exit memory-reclaim loop. +static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +{ + if (!current->memcg_oom.may_oom) + return; + /* + * We are in the middle of the charge context here, so we + * don't want to block when potentially sitting on a callstack + * that holds all kinds of filesystem and mm locks. 
+ * + * Also, the caller may handle a failed allocation gracefully + * (like optional page cache readahead) and so an OOM killer + * invocation might not even be necessary. + * + * That's why we don't do anything here except remember the + * OOM context and then deal with it at the end of the page + * fault when the stack is unwound, the locks are released, + * and when we know whether the fault was overall successful. + */ + css_get(&memcg->css); + current->memcg_oom.memcg = memcg; + current->memcg_oom.gfp_mask = mask; + current->memcg_oom.order = order; +} + +/** + * mem_cgroup_oom_synchronize - complete memcg OOM handling + * @handle: actually kill/wait or just clean up the OOM state + * + * This has to be called at the end of a page fault if the memcg OOM + * handler was enabled. + * + * Memcg supports userspace OOM handling where failed allocations must + * sleep on a waitqueue until the userspace task resolves the + * situation. Sleeping directly in the charge context with all kinds + * of locks held is not a good idea, instead we remember an OOM state + * in the task and mem_cgroup_oom_synchronize() has to be called at + * the end of the page fault to complete the OOM handling. + * + * Returns %true if an ongoing memcg OOM situation was detected and + * completed, %false otherwise. */ -bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) +bool mem_cgroup_oom_synchronize(bool handle) { + struct mem_cgroup *memcg = current->memcg_oom.memcg; struct oom_wait_info owait; - bool locked, need_to_kill; + bool locked; + + /* OOM is global, do not handle */ + if (!memcg) + return false; + + if (!handle) + goto cleanup; owait.memcg = memcg; owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; INIT_LIST_HEAD(&owait.wait.task_list); - need_to_kill = true; - mem_cgroup_mark_under_oom(memcg); - /* At first, try to OOM lock hierarchy under memcg.*/ - spin_lock(&memcg_oom_lock); - locked = mem_cgroup_oom_lock(memcg); - /* - * Even if signal_pending(), we can't quit charge() loop without - * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL - * under OOM is always welcomed, use TASK_KILLABLE here. - */ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - if (!locked || memcg->oom_kill_disable) - need_to_kill = false; + mem_cgroup_mark_under_oom(memcg); + + locked = mem_cgroup_oom_trylock(memcg); + if (locked) mem_cgroup_oom_notify(memcg); - spin_unlock(&memcg_oom_lock); - if (need_to_kill) { + if (locked && !memcg->oom_kill_disable) { + mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, mask, order); + mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, + current->memcg_oom.order); } else { schedule(); + mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); } - spin_lock(&memcg_oom_lock); - if (locked) - mem_cgroup_oom_unlock(memcg); - memcg_wakeup_oom(memcg); - spin_unlock(&memcg_oom_lock); - mem_cgroup_unmark_under_oom(memcg); - - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) - return false; - /* Give chance to dying process */ - schedule_timeout_uninterruptible(1); + if (locked) { + mem_cgroup_oom_unlock(memcg); + /* + * There is no guarantee that an OOM-lock contender + * sees the wakeups triggered by the OOM kill + * uncharges. Wake any sleepers explicitely. 
+ */ + memcg_oom_recover(memcg); + } +cleanup: + current->memcg_oom.memcg = NULL; + css_put(&memcg->css); return true; } @@ -1930,7 +2325,7 @@ again: return; /* * If this memory cgroup is not under account moving, we don't - * need to take move_lock_page_cgroup(). Because we already hold + * need to take move_lock_mem_cgroup(). Because we already hold * rcu_read_lock(), any calls to move_account will be delayed until * rcu_read_unlock() if mem_cgroup_stolen() == true. */ @@ -1952,13 +2347,13 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) /* * It's guaranteed that pc->mem_cgroup never changes while * lock is held because a routine modifies pc->mem_cgroup - * should take move_lock_page_cgroup(). + * should take move_lock_mem_cgroup(). */ move_unlock_mem_cgroup(pc->mem_cgroup, flags); } void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_page_stat_item idx, int val) + enum mem_cgroup_stat_index idx, int val) { struct mem_cgroup *memcg; struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1967,18 +2362,11 @@ void mem_cgroup_update_page_stat(struct page *page, if (mem_cgroup_disabled()) return; + VM_BUG_ON(!rcu_read_lock_held()); memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) return; - switch (idx) { - case MEMCG_NR_FILE_MAPPED: - idx = MEM_CGROUP_STAT_FILE_MAPPED; - break; - default: - BUG(); - } - this_cpu_add(memcg->stat->count[idx], val); } @@ -1992,25 +2380,33 @@ struct memcg_stock_pcp { unsigned int nr_pages; struct work_struct work; unsigned long flags; -#define FLUSHING_CACHED_CHARGE (0) +#define FLUSHING_CACHED_CHARGE 0 }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); -/* - * Try to consume stocked charge on this cpu. If success, one page is consumed - * from local stock and true is returned. If the stock is 0 or charges from a - * cgroup which is not current target, returns false. This stock will be - * refilled. +/** + * consume_stock: Try to consume stocked charge on this cpu. + * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. + * + * The charges will only happen if @memcg matches the current cpu's memcg + * stock, and at least @nr_pages are available in that stock. Failure to + * service an allocation will refill the stock. + * + * returns true if successful, false otherwise. */ -static bool consume_stock(struct mem_cgroup *memcg) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; bool ret = true; + if (nr_pages > CHARGE_BATCH) + return false; + stock = &get_cpu_var(memcg_stock); - if (memcg == stock->cached && stock->nr_pages) - stock->nr_pages--; + if (memcg == stock->cached && stock->nr_pages >= nr_pages) + stock->nr_pages -= nr_pages; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -2046,6 +2442,17 @@ static void drain_local_stock(struct work_struct *dummy) clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } +static void __init memcg_stock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct memcg_stock_pcp *stock = + &per_cpu(memcg_stock, cpu); + INIT_WORK(&stock->work, drain_local_stock); + } +} + /* * Cache charges(val) which is from res_counter, to local per_cpu area. * This will be consumed by consume_stock() function, later. 
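The stock above amortizes res_counter atomics: each CPU keeps up to CHARGE_BATCH pre-charged pages and serves small charges locally, touching the shared counter only to refill. A rough userspace analogue of the consume_stock()/refill_stock() idea (hypothetical names; a single C11 atomic counter and a thread-local stock stand in for the kernel's per-cpu data, and draining, work items and cpu hotplug are ignored):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define CHARGE_BATCH	32UL

    static atomic_ulong res_usage;			/* shared "res_counter" */
    static const unsigned long res_limit = 1024;	/* pages */
    static _Thread_local unsigned long stock;	/* pre-charged pages */

    /* optimistic charge against the shared counter; undo on overrun */
    static bool res_counter_charge(unsigned long nr)
    {
    	if (atomic_fetch_add(&res_usage, nr) + nr > res_limit) {
    		atomic_fetch_sub(&res_usage, nr);
    		return false;
    	}
    	return true;
    }

    /* consume_stock(): small charges are served from the local stock */
    static bool consume_stock(unsigned long nr)
    {
    	if (nr > CHARGE_BATCH || stock < nr)
    		return false;
    	stock -= nr;
    	return true;
    }

    static bool try_charge(unsigned long nr)
    {
    	unsigned long batch = nr > CHARGE_BATCH ? nr : CHARGE_BATCH;

    	if (consume_stock(nr))
    		return true;
    	if (!res_counter_charge(batch)) {
    		/* batching is optional: retry with the exact amount */
    		if (batch == nr || !res_counter_charge(nr))
    			return false;
    		batch = nr;
    	}
    	stock += batch - nr;	/* refill_stock() */
    	return true;
    }

    int main(void)
    {
    	for (int i = 0; i < 5; i++)
    		printf("charge 1 -> %d, stock now %lu\n",
    		       try_charge(1), stock);
    	return 0;
    }

Five one-page charges cost a single update of the shared counter; the other four are served from the stock, which is the effect the kernel code here is after.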
@@ -2101,7 +2508,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) flush_work(&stock->work); } out: - put_online_cpus(); + put_online_cpus(); } /* @@ -2139,7 +2546,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) int i; spin_lock(&memcg->pcp_counter_lock); - for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { long x = per_cpu(memcg->stat->count[i], cpu); per_cpu(memcg->stat->count[i], cpu) = 0; @@ -2154,7 +2561,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) spin_unlock(&memcg->pcp_counter_lock); } -static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, +static int memcg_cpu_hotplug_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { @@ -2177,17 +2584,17 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, } -/* See __mem_cgroup_try_charge() for details */ +/* See mem_cgroup_try_charge() for details */ enum { CHARGE_OK, /* success */ CHARGE_RETRY, /* need to retry but retry is not bad */ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ - CHARGE_OOM_DIE, /* the current is killed because of OOM */ }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, bool oom_check) + unsigned int nr_pages, unsigned int min_pages, + bool invoke_oom) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2210,18 +2617,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); /* - * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch - * of regular pages (CHARGE_BATCH), or a single regular page (1). - * * Never reclaim on behalf of optional batching, retry with a * single page instead. */ - if (nr_pages == CHARGE_BATCH) + if (nr_pages > min_pages) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; + if (gfp_mask & __GFP_NORETRY) + return CHARGE_NOMEM; + ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; @@ -2234,7 +2641,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages == 1 && ret) + if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) return CHARGE_RETRY; /* @@ -2244,171 +2651,116 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (mem_cgroup_wait_acct_move(mem_over_limit)) return CHARGE_RETRY; - /* If we don't need to call oom-killer at el, return immediately */ - if (!oom_check) - return CHARGE_NOMEM; - /* check OOM */ - if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) - return CHARGE_OOM_DIE; + if (invoke_oom) + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - return CHARGE_RETRY; + return CHARGE_NOMEM; } -/* - * __mem_cgroup_try_charge() does - * 1. detect memcg to be charged against from passed *mm and *ptr, - * 2. update res_counter - * 3. call memory reclaim if necessary. - * - * In some special case, if the task is fatal, fatal_signal_pending() or - * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup - * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon - * as possible without any hazards. 
2: all pages should have a valid - * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg - * pointer, that is treated as a charge to root_mem_cgroup. - * - * So __mem_cgroup_try_charge() will return - * 0 ... on success, filling *ptr with a valid memcg pointer. - * -ENOMEM ... charge failure because of resource limits. - * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. +/** + * mem_cgroup_try_charge - try charging a memcg + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails * - * Unlike the exported interface, an "oom" parameter is added. if oom==true, - * the oom-killer can be invoked. + * Returns 0 if @memcg was charged successfully, -EINTR if the charge + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. */ -static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - struct mem_cgroup **ptr, - bool oom) +static int mem_cgroup_try_charge(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned int nr_pages, + bool oom) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct mem_cgroup *memcg = NULL; int ret; + if (mem_cgroup_is_root(memcg)) + goto done; /* - * Unlike gloval-vm's OOM-kill, we're not in memory shortage - * in system level. So, allow to go ahead dying process in addition to - * MEMDIE process. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (unlikely(test_thread_flag(TIF_MEMDIE) - || fatal_signal_pending(current))) + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current))) goto bypass; - /* - * We always charge the cgroup the mm_struct belongs to. - * The mm_struct's mem_cgroup changes on task migration if the - * thread group leader migrates. It's possible that mm is not - * set, if so charge the init_mm (happens for pagecache usage). - */ - if (!*ptr && !mm) - *ptr = root_mem_cgroup; -again: - if (*ptr) { /* css should be a valid one */ - memcg = *ptr; - VM_BUG_ON(css_is_removed(&memcg->css)); - if (mem_cgroup_is_root(memcg)) - goto done; - if (nr_pages == 1 && consume_stock(memcg)) - goto done; - css_get(&memcg->css); - } else { - struct task_struct *p; + if (unlikely(task_in_memcg_oom(current))) + goto nomem; - rcu_read_lock(); - p = rcu_dereference(mm->owner); - /* - * Because we don't have task_lock(), "p" can exit. - * In that case, "memcg" can point to root or p can be NULL with - * race with swapoff. Then, we have small risk of mis-accouning. - * But such kind of mis-account by race always happens because - * we don't have cgroup_mutex(). It's overkill and we allo that - * small race, here. - * (*) swapoff at el will charge against mm-struct not against - * task-struct. So, mm->owner can be NULL. - */ - memcg = mem_cgroup_from_task(p); - if (!memcg) - memcg = root_mem_cgroup; - if (mem_cgroup_is_root(memcg)) { - rcu_read_unlock(); - goto done; - } - if (nr_pages == 1 && consume_stock(memcg)) { - /* - * It seems dagerous to access memcg without css_get(). - * But considering how consume_stok works, it's not - * necessary. If consume_stock success, some charges - * from this memcg are cached on this cpu. So, we - * don't need to call css_get()/css_tryget() before - * calling consume_stock(). - */ - rcu_read_unlock(); - goto done; - } - /* after here, we may be blocked. 
we need to get refcnt */ - if (!css_tryget(&memcg->css)) { - rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); - } + if (gfp_mask & __GFP_NOFAIL) + oom = false; +again: + if (consume_stock(memcg, nr_pages)) + goto done; do { - bool oom_check; + bool invoke_oom = oom && !nr_oom_retries; /* If killed, bypass charge */ - if (fatal_signal_pending(current)) { - css_put(&memcg->css); + if (fatal_signal_pending(current)) goto bypass; - } - oom_check = false; - if (oom && !nr_oom_retries) { - oom_check = true; - nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - } - - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, + nr_pages, invoke_oom); switch (ret) { case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ batch = nr_pages; - css_put(&memcg->css); - memcg = NULL; goto again; case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - css_put(&memcg->css); goto nomem; case CHARGE_NOMEM: /* OOM routine works */ - if (!oom) { - css_put(&memcg->css); + if (!oom || invoke_oom) goto nomem; - } - /* If oom, we never return -ENOMEM */ nr_oom_retries--; break; - case CHARGE_OOM_DIE: /* Killed by OOM Killer */ - css_put(&memcg->css); - goto bypass; } } while (ret != CHARGE_OK); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - css_put(&memcg->css); done: - *ptr = memcg; return 0; nomem: - *ptr = NULL; - return -ENOMEM; + if (!(gfp_mask & __GFP_NOFAIL)) + return -ENOMEM; bypass: - *ptr = root_mem_cgroup; return -EINTR; } +/** + * mem_cgroup_try_charge_mm - try charging a mm + * @mm: mm_struct to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails + * + * Returns the charged mem_cgroup associated with the given mm_struct or + * NULL if the charge failed. + */ +static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, + gfp_t gfp_mask, + unsigned int nr_pages, + bool oom) + +{ + struct mem_cgroup *memcg; + int ret; + + memcg = get_mem_cgroup_from_mm(mm); + ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); + css_put(&memcg->css); + if (ret == -EINTR) + memcg = root_mem_cgroup; + else if (ret) + memcg = NULL; + + return memcg; +} + /* * Sometimes we have to undo a charge we got by try_charge(). * This function is for that and does uncharge, put css's refcnt. @@ -2427,22 +2779,35 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, } /* + * Cancel charges in this cgroup; doesn't propagate to parent cgroup. + * This is useful when moving usage to parent cgroup. + */ +static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, + unsigned int nr_pages) +{ + unsigned long bytes = nr_pages * PAGE_SIZE; + + if (mem_cgroup_is_root(memcg)) + return; + + res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); + if (do_swap_account) + res_counter_uncharge_until(&memcg->memsw, + memcg->memsw.parent, bytes); +} + +/* * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller must check css_is_removed() or some if - * it's concern. (dropping refcnt from swap can be called against removed - * memcg.) + * rcu_read_lock(). The caller is responsible for calling css_tryget if + * the mem_cgroup is used for charging. (dropping refcnt from swap can be + * called against removed memcg.)
*/ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { - struct cgroup_subsys_state *css; - /* ID 0 is unused ID */ if (!id) return NULL; - css = css_lookup(&mem_cgroup_subsys, id); - if (!css) - return NULL; - return container_of(css, struct mem_cgroup, css); + return mem_cgroup_from_id(id); } struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) @@ -2452,7 +2817,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) unsigned short id; swp_entry_t ent; - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); pc = lookup_page_cgroup(page); lock_page_cgroup(pc); @@ -2481,15 +2846,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, { struct page_cgroup *pc = lookup_page_cgroup(page); struct zone *uninitialized_var(zone); + struct lruvec *lruvec; bool was_on_lru = false; bool anon; lock_page_cgroup(pc); - if (unlikely(PageCgroupUsed(pc))) { - unlock_page_cgroup(pc); - __mem_cgroup_cancel_charge(memcg, nr_pages); - return; - } + VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); /* * we don't need page_cgroup_lock about tail pages, becase they are not * accessed by any other context at this point. @@ -2503,8 +2865,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, zone = page_zone(page); spin_lock_irq(&zone->lru_lock); if (PageLRU(page)) { + lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); ClearPageLRU(page); - del_page_from_lru_list(zone, page, page_lru(page)); + del_page_from_lru_list(page, lruvec, page_lru(page)); was_on_lru = true; } } @@ -2516,25 +2879,26 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, * is accessed after testing USED bit. To make pc->mem_cgroup visible * before USED bit, we need memory barrier here. * See mem_cgroup_add_lru_list(), etc. - */ + */ smp_wmb(); SetPageCgroupUsed(pc); if (lrucare) { if (was_on_lru) { - VM_BUG_ON(PageLRU(page)); + lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); + VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); - add_page_to_lru_list(zone, page, page_lru(page)); + add_page_to_lru_list(page, lruvec, page_lru(page)); } spin_unlock_irq(&zone->lru_lock); } - if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) + if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) anon = true; else anon = false; - mem_cgroup_charge_statistics(memcg, anon, nr_pages); + mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); unlock_page_cgroup(pc); /* @@ -2545,9 +2909,736 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, memcg_check_events(memcg, page); } +static DEFINE_MUTEX(set_limit_mutex); + +#ifdef CONFIG_MEMCG_KMEM +static DEFINE_MUTEX(activate_kmem_mutex); + +static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) +{ + return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && + memcg_kmem_is_active(memcg); +} + +/* + * This is a bit cumbersome, but it is rarely used and avoids a backpointer + * in the memcg_cache_params struct. 
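+ * (The lookup below is just the root cache's array indexed by the
+ * owner's kmem id: cache_from_memcg_idx(p->root_cache,
+ * memcg_cache_id(p->memcg)).)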
+ */ +static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) +{ + struct kmem_cache *cachep; + + VM_BUG_ON(p->is_root_cache); + cachep = p->root_cache; + return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); +} + +#ifdef CONFIG_SLABINFO +static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct memcg_cache_params *params; + + if (!memcg_can_account_kmem(memcg)) + return -EIO; + + print_slabinfo_header(m); + + mutex_lock(&memcg->slab_caches_mutex); + list_for_each_entry(params, &memcg->memcg_slab_caches, list) + cache_show(memcg_params_to_cache(params), m); + mutex_unlock(&memcg->slab_caches_mutex); + + return 0; +} +#endif + +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +{ + struct res_counter *fail_res; + int ret = 0; + + ret = res_counter_charge(&memcg->kmem, size, &fail_res); + if (ret) + return ret; + + ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, + oom_gfp_allowed(gfp)); + if (ret == -EINTR) { + /* + * mem_cgroup_try_charge() chose to bypass to root due to + * OOM kill or fatal signal. Since our only options are to + * either fail the allocation or charge it to this cgroup, do + * it as a temporary condition. But we can't fail. From a + * kmem/slab perspective, the cache has already been selected + * by mem_cgroup_kmem_get_cache(), so it is too late to change + * our minds. + * + * This condition will only trigger if the task entered + * memcg_charge_kmem in a sane state, but was OOM-killed during + * mem_cgroup_try_charge() above. Tasks that were already + * dying when the allocation triggers should have been already + * directed to the root cgroup in memcontrol.h + */ + res_counter_charge_nofail(&memcg->res, size, &fail_res); + if (do_swap_account) + res_counter_charge_nofail(&memcg->memsw, size, + &fail_res); + ret = 0; + } else if (ret) + res_counter_uncharge(&memcg->kmem, size); + + return ret; +} + +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +{ + res_counter_uncharge(&memcg->res, size); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, size); + + /* Not down to 0 */ + if (res_counter_uncharge(&memcg->kmem, size)) + return; + + /* + * Releases a reference taken in kmem_cgroup_css_offline in case + * this last uncharge is racing with the offlining code or it is + * outliving the memcg existence. + * + * The memory barrier imposed by test&clear is paired with the + * explicit one in memcg_kmem_mark_dead(). + */ + if (memcg_kmem_test_and_clear_dead(memcg)) + css_put(&memcg->css); +} + +/* + * helper for accessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * will return -1 when this is not a kmem-limited memcg. + */ +int memcg_cache_id(struct mem_cgroup *memcg) +{ + return memcg ? memcg->kmemcg_id : -1; +} + +static size_t memcg_caches_array_size(int num_groups) +{ + ssize_t size; + if (num_groups <= 0) + return 0; + + size = 2 * num_groups; + if (size < MEMCG_CACHES_MIN_SIZE) + size = MEMCG_CACHES_MIN_SIZE; + else if (size > MEMCG_CACHES_MAX_SIZE) + size = MEMCG_CACHES_MAX_SIZE; + + return size; +} + +/* + * We should update the current array size iff all cache updates succeed. This + * can only be done from the slab side. The slab mutex needs to be held when + * calling this.
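+ *
+ * For example (sketch): with 3 kmem-active groups,
+ * memcg_caches_array_size(3) yields 2 * 3 = 6 slots, clamped between
+ * MEMCG_CACHES_MIN_SIZE (4) and MEMCG_CACHES_MAX_SIZE.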
+ */ +void memcg_update_array_size(int num) +{ + if (num > memcg_limited_groups_array_size) + memcg_limited_groups_array_size = memcg_caches_array_size(num); +} + +static void kmem_cache_destroy_work_func(struct work_struct *w); + +int memcg_update_cache_size(struct kmem_cache *s, int num_groups) +{ + struct memcg_cache_params *cur_params = s->memcg_params; + + VM_BUG_ON(!is_root_cache(s)); + + if (num_groups > memcg_limited_groups_array_size) { + int i; + struct memcg_cache_params *new_params; + ssize_t size = memcg_caches_array_size(num_groups); + + size *= sizeof(void *); + size += offsetof(struct memcg_cache_params, memcg_caches); + + new_params = kzalloc(size, GFP_KERNEL); + if (!new_params) + return -ENOMEM; + + new_params->is_root_cache = true; + + /* + * There is the chance it will be bigger than + * memcg_limited_groups_array_size, if we failed an allocation + * in a cache, in which case all caches updated before it, will + * have a bigger array. + * + * But if that is the case, the data after + * memcg_limited_groups_array_size is certainly unused + */ + for (i = 0; i < memcg_limited_groups_array_size; i++) { + if (!cur_params->memcg_caches[i]) + continue; + new_params->memcg_caches[i] = + cur_params->memcg_caches[i]; + } + + /* + * Ideally, we would wait until all caches succeed, and only + * then free the old one. But this is not worth the extra + * pointer per-cache we'd have to have for this. + * + * It is not a big deal if some caches are left with a size + * bigger than the others. And all updates will reset this + * anyway. + */ + rcu_assign_pointer(s->memcg_params, new_params); + if (cur_params) + kfree_rcu(cur_params, rcu_head); + } + return 0; +} + +char *memcg_create_cache_name(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) +{ + static char *buf = NULL; + + /* + * We need a mutex here to protect the shared buffer. Since this is + * expected to be called only on cache creation, we can employ the + * slab_mutex for that purpose. + */ + lockdep_assert_held(&slab_mutex); + + if (!buf) { + buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!buf) + return NULL; + } + + cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); + return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, + memcg_cache_id(memcg), buf); +} + +int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, + struct kmem_cache *root_cache) +{ + size_t size; + + if (!memcg_kmem_enabled()) + return 0; + + if (!memcg) { + size = offsetof(struct memcg_cache_params, memcg_caches); + size += memcg_limited_groups_array_size * sizeof(void *); + } else + size = sizeof(struct memcg_cache_params); + + s->memcg_params = kzalloc(size, GFP_KERNEL); + if (!s->memcg_params) + return -ENOMEM; + + if (memcg) { + s->memcg_params->memcg = memcg; + s->memcg_params->root_cache = root_cache; + INIT_WORK(&s->memcg_params->destroy, + kmem_cache_destroy_work_func); + css_get(&memcg->css); + } else + s->memcg_params->is_root_cache = true; + + return 0; +} + +void memcg_free_cache_params(struct kmem_cache *s) +{ + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + css_put(&s->memcg_params->memcg->css); + kfree(s->memcg_params); +} + +void memcg_register_cache(struct kmem_cache *s) +{ + struct kmem_cache *root; + struct mem_cgroup *memcg; + int id; + + if (is_root_cache(s)) + return; + + /* + * Holding the slab_mutex assures nobody will touch the memcg_caches + * array while we are modifying it. 
+ */ + lockdep_assert_held(&slab_mutex); + + root = s->memcg_params->root_cache; + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); + + /* + * Since readers won't lock (see cache_from_memcg_idx()), we need a + * barrier here to ensure nobody will see the kmem_cache partially + * initialized. + */ + smp_wmb(); + + /* + * Initialize the pointer to this cache in its parent's memcg_params + * before adding it to the memcg_slab_caches list, otherwise we can + * fail to convert memcg_params_to_cache() while traversing the list. + */ + VM_BUG_ON(root->memcg_params->memcg_caches[id]); + root->memcg_params->memcg_caches[id] = s; + + mutex_lock(&memcg->slab_caches_mutex); + list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); + mutex_unlock(&memcg->slab_caches_mutex); +} + +void memcg_unregister_cache(struct kmem_cache *s) +{ + struct kmem_cache *root; + struct mem_cgroup *memcg; + int id; + + if (is_root_cache(s)) + return; + + /* + * Holding the slab_mutex assures nobody will touch the memcg_caches + * array while we are modifying it. + */ + lockdep_assert_held(&slab_mutex); + + root = s->memcg_params->root_cache; + memcg = s->memcg_params->memcg; + id = memcg_cache_id(memcg); + + mutex_lock(&memcg->slab_caches_mutex); + list_del(&s->memcg_params->list); + mutex_unlock(&memcg->slab_caches_mutex); + + /* + * Clear the pointer to this cache in its parent's memcg_params only + * after removing it from the memcg_slab_caches list, otherwise we can + * fail to convert memcg_params_to_cache() while traversing the list. + */ + VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); + root->memcg_params->memcg_caches[id] = NULL; +} + +/* + * During the creation a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueing new caches to be created. + * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. + * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. + */ +static inline void memcg_stop_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account--; +} + +static void kmem_cache_destroy_work_func(struct work_struct *w) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *p; + + p = container_of(w, struct memcg_cache_params, destroy); + + cachep = memcg_params_to_cache(p); + + /* + * If we get down to 0 after shrink, we could delete right away. + * However, memcg_release_pages() already puts us back in the workqueue + * in that case. If we proceed deleting, we'll get a dangling + * reference, and removing the object from the workqueue in that case + * is unnecessary complication. We are not a fast path. 
+ * + * Note that this case is fundamentally different from racing with + * shrink_slab(): if mem_cgroup_destroy_cache() is called in + * kmem_cache_shrink, not only would we be reinserting a dead cache + * into the queue, but doing so from inside the worker racing to + * destroy it. + * + * So if we aren't down to zero, we'll just schedule a worker and try + * again. + */ + if (atomic_read(&cachep->memcg_params->nr_pages) != 0) + kmem_cache_shrink(cachep); + else + kmem_cache_destroy(cachep); +} + +void mem_cgroup_destroy_cache(struct kmem_cache *cachep) +{ + if (!cachep->memcg_params->dead) + return; + + /* + * There are many ways in which we can get here. + * + * We can get to a memory-pressure situation while the delayed work is + * still pending to run. The vmscan shrinkers can then release all + * cache memory and get us to destruction. If this is the case, we'll + * be executed twice, which is a bug (the second time will execute over + * bogus data). In this case, cancelling the work should be fine. + * + * But we can also get here from the worker itself, if + * kmem_cache_shrink is enough to shake all the remaining objects and + * get the page count to 0. In this case, we'll deadlock if we try to + * cancel the work (the worker runs with an internal lock held, which + * is the same lock we would hold for cancel_work_sync().) + * + * Since we can't possibly know who got us here, just refrain from + * running if there is already work pending. + */ + if (work_pending(&cachep->memcg_params->destroy)) + return; + /* + * We have to defer the actual destroying to a workqueue, because + * we might currently be in a context that cannot sleep. + */ + schedule_work(&cachep->memcg_params->destroy); +} + +int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + struct kmem_cache *c; + int i, failed = 0; + + /* + * If the cache is being destroyed, we trust that there is no one else + * requesting objects from it. Even if there are, the sanity checks in + * kmem_cache_destroy should catch this ill case. + * + * Still, we don't want anyone else freeing memcg_caches under our + * noses, which can happen if a new memcg comes to life. As usual, + * we'll take the activate_kmem_mutex to protect ourselves against + * this. + */ + mutex_lock(&activate_kmem_mutex); + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); + if (!c) + continue; + + /* + * We will now manually delete the caches, so to avoid races + * we need to cancel all pending destruction workers and + * proceed with destruction ourselves. + * + * kmem_cache_destroy() will call kmem_cache_shrink internally, + * and that could spawn the workers again: it is likely that + * the cache still has active pages until this very moment. + * This would lead us back to mem_cgroup_destroy_cache. + * + * But that will not execute at all if the "dead" flag is not + * set, so flip it down to guarantee we are in control.
+		 */
+		c->memcg_params->dead = false;
+		cancel_work_sync(&c->memcg_params->destroy);
+		kmem_cache_destroy(c);
+
+		if (cache_from_memcg_idx(s, i))
+			failed++;
+	}
+	mutex_unlock(&activate_kmem_mutex);
+	return failed;
+}
+
+static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+	struct kmem_cache *cachep;
+	struct memcg_cache_params *params;
+
+	if (!memcg_kmem_is_active(memcg))
+		return;
+
+	mutex_lock(&memcg->slab_caches_mutex);
+	list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+		cachep = memcg_params_to_cache(params);
+		cachep->memcg_params->dead = true;
+		schedule_work(&cachep->memcg_params->destroy);
+	}
+	mutex_unlock(&memcg->slab_caches_mutex);
+}
+
+struct create_work {
+	struct mem_cgroup *memcg;
+	struct kmem_cache *cachep;
+	struct work_struct work;
+};
+
+static void memcg_create_cache_work_func(struct work_struct *w)
+{
+	struct create_work *cw = container_of(w, struct create_work, work);
+	struct mem_cgroup *memcg = cw->memcg;
+	struct kmem_cache *cachep = cw->cachep;
+
+	kmem_cache_create_memcg(memcg, cachep);
+	css_put(&memcg->css);
+	kfree(cw);
+}
+
+/*
+ * Enqueue the creation of a per-memcg kmem_cache.
+ */
+static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+					 struct kmem_cache *cachep)
+{
+	struct create_work *cw;
+
+	cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
+	if (cw == NULL) {
+		css_put(&memcg->css);
+		return;
+	}
+
+	cw->memcg = memcg;
+	cw->cachep = cachep;
+
+	INIT_WORK(&cw->work, memcg_create_cache_work_func);
+	schedule_work(&cw->work);
+}
+
+static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+				       struct kmem_cache *cachep)
+{
+	/*
+	 * We need to stop accounting when we kmalloc, because if the
+	 * corresponding kmalloc cache is not yet created, the first allocation
+	 * in __memcg_create_cache_enqueue will recurse.
+	 *
+	 * However, it is better to enclose the whole function. Depending on
+	 * the debugging options enabled, INIT_WORK(), for instance, can
+	 * trigger an allocation. This, too, will make us recurse. Because at
+	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
+	 * the safest choice is to do it like this, wrapping the whole function.
+	 */
+	memcg_stop_kmem_account();
+	__memcg_create_cache_enqueue(memcg, cachep);
+	memcg_resume_kmem_account();
+}
+
+/*
+ * Return the kmem_cache we're supposed to use for a slab allocation.
+ * We try to use the current memcg's version of the cache.
+ *
+ * If the cache does not exist yet, and we are the first user of it, we
+ * either create it immediately, if possible, or create it asynchronously
+ * in a workqueue.
+ * In the latter case, we will let the current allocation go through with
+ * the original cache.
+ *
+ * Can't be called in interrupt context or from kernel threads.
+ * This function needs to be called with rcu_read_lock() held.
+ */
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
+					  gfp_t gfp)
+{
+	struct mem_cgroup *memcg;
+	struct kmem_cache *memcg_cachep;
+
+	VM_BUG_ON(!cachep->memcg_params);
+	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+
+	if (!current->mm || current->memcg_kmem_skip_account)
+		return cachep;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
+
+	if (!memcg_can_account_kmem(memcg))
+		goto out;
+
+	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+	if (likely(memcg_cachep)) {
+		cachep = memcg_cachep;
+		goto out;
+	}
+
+	/* The corresponding put will be done in the workqueue.
*/
+	if (!css_tryget(&memcg->css))
+		goto out;
+	rcu_read_unlock();
+
+	/*
+	 * If we are in a safe context (can wait, and not in interrupt
+	 * context), we could be predictable and return right away.
+	 * This would guarantee that the allocation being performed
+	 * already belongs in the new cache.
+	 *
+	 * However, there are some clashes that can arise from locking.
+	 * For instance, because we acquire the slab_mutex while doing
+	 * kmem_cache_dup, this means no further allocation could happen
+	 * with the slab_mutex held.
+	 *
+	 * Also, because cache creation issues get_online_cpus(), this
+	 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
+	 * that ends up reversed during cpu hotplug. (cpuset allocates
+	 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
+	 * it is better to defer everything.
+	 */
+	memcg_create_cache_enqueue(memcg, cachep);
+	return cachep;
+out:
+	rcu_read_unlock();
+	return cachep;
+}
+EXPORT_SYMBOL(__memcg_kmem_get_cache);
+
+/*
+ * We need to verify if the allocation against current->mm->owner's memcg is
+ * possible for the given order. But the page is not allocated yet, so we'll
+ * need a further commit step to do the final arrangements.
+ *
+ * It is possible for the task to switch cgroups in the meantime, so at
+ * commit time, we can't rely on task conversion any longer. We'll then use
+ * the handle argument to return to the caller which cgroup we should commit
+ * against. We could also return the memcg directly and avoid the pointer
+ * passing, but a boolean return value gives better semantics considering
+ * the compiled-out case as well.
+ *
+ * Returning true means the allocation is possible.
+ */
+bool
+__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+{
+	struct mem_cgroup *memcg;
+	int ret;
+
+	*_memcg = NULL;
+
+	/*
+	 * Disabling accounting is only relevant for some specific memcg
+	 * internal allocations. Therefore we would initially not have such a
+	 * check here, since direct calls to the page allocator that are marked
+	 * with GFP_KMEMCG only happen outside memcg core. We are mostly
+	 * concerned with cache allocations, and by having this test at
+	 * memcg_kmem_get_cache, we are already able to relay the allocation to
+	 * the root cache and bypass the memcg cache altogether.
+	 *
+	 * There is one exception, though: the SLUB allocator does not create
+	 * large order caches, but rather services large kmallocs directly from
+	 * the page allocator. Therefore, the following sequence when backed by
+	 * the SLUB allocator:
+	 *
+	 * memcg_stop_kmem_account();
+	 * kmalloc(<large_number>)
+	 * memcg_resume_kmem_account();
+	 *
+	 * would effectively ignore the fact that we should skip accounting,
+	 * since it will drive us directly to this function without passing
+	 * through the cache selector memcg_kmem_get_cache. Such large
+	 * allocations are extremely rare but can happen, for instance, for the
+	 * cache arrays. We bring this test here.
+	 */
+	if (!current->mm || current->memcg_kmem_skip_account)
+		return true;
+
+	memcg = get_mem_cgroup_from_mm(current->mm);
+
+	if (!memcg_can_account_kmem(memcg)) {
+		css_put(&memcg->css);
+		return true;
+	}
+
+	ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
+	if (!ret)
+		*_memcg = memcg;
+
+	css_put(&memcg->css);
+	return (ret == 0);
+}
+
+void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
+				int order)
+{
+	struct page_cgroup *pc;
+
+	VM_BUG_ON(mem_cgroup_is_root(memcg));
+
+	/* The page allocation failed.
Revert */
+	if (!page) {
+		memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+		return;
+	}
+
+	pc = lookup_page_cgroup(page);
+	lock_page_cgroup(pc);
+	pc->mem_cgroup = memcg;
+	SetPageCgroupUsed(pc);
+	unlock_page_cgroup(pc);
+}
+
+void __memcg_kmem_uncharge_pages(struct page *page, int order)
+{
+	struct mem_cgroup *memcg = NULL;
+	struct page_cgroup *pc;
+
+	pc = lookup_page_cgroup(page);
+	/*
+	 * Fast unlocked return. Theoretically might have changed, have to
+	 * check again after locking.
+	 */
+	if (!PageCgroupUsed(pc))
+		return;
+
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc)) {
+		memcg = pc->mem_cgroup;
+		ClearPageCgroupUsed(pc);
+	}
+	unlock_page_cgroup(pc);
+
+	/*
+	 * We trust that a memcg is associated with the page only if it
+	 * came from a valid, accounted allocation.
+	 */
+	if (!memcg)
+		return;
+
+	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
+	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+}
+#else
+static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
+#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
 /*
  * Because tail pages are not marked as "used", set it. We're under
  * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2558,16 +3649,21 @@ void mem_cgroup_split_huge_fixup(struct page *head)
 {
 	struct page_cgroup *head_pc = lookup_page_cgroup(head);
 	struct page_cgroup *pc;
+	struct mem_cgroup *memcg;
 	int i;
 
 	if (mem_cgroup_disabled())
 		return;
+
+	memcg = head_pc->mem_cgroup;
 	for (i = 1; i < HPAGE_PMD_NR; i++) {
 		pc = head_pc + i;
-		pc->mem_cgroup = head_pc->mem_cgroup;
+		pc->mem_cgroup = memcg;
 		smp_wmb();/* see __commit_charge() */
 		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
 	}
+	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+		       HPAGE_PMD_NR);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -2578,30 +3674,26 @@ void mem_cgroup_split_huge_fixup(struct page *head)
  * @pc:	page_cgroup of the page.
  * @from: mem_cgroup which the page is moved from.
  * @to:	mem_cgroup which the page is moved to. @from != @to.
- * @uncharge: whether we should call uncharge and css_put against @from.
 *
 * The caller must confirm following.
 * - page is not on LRU (isolate_page() is useful.)
 * - compound_lock is held when nr_pages > 1
 *
- * This function doesn't do "charge" nor css_get to new cgroup. It should be
- * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
- * true, this function does "uncharge" from old cgroup, but it doesn't if
- * @uncharge is false, so a caller should do "uncharge".
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
 */
 static int mem_cgroup_move_account(struct page *page,
				   unsigned int nr_pages,
				   struct page_cgroup *pc,
				   struct mem_cgroup *from,
-				   struct mem_cgroup *to,
-				   bool uncharge)
+				   struct mem_cgroup *to)
 {
	unsigned long flags;
	int ret;
	bool anon = PageAnon(page);

	VM_BUG_ON(from == to);
-	VM_BUG_ON(PageLRU(page));
+	VM_BUG_ON_PAGE(PageLRU(page), page);
	/*
	 * The page is isolated from LRU. So, collapse function
	 * will not handle this page. But page splitting can happen.
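
[Editor's aside] The kmem charge path above is a two-phase protocol: __memcg_kmem_newpage_charge() reserves against the res_counter before the page exists, and __memcg_kmem_commit_charge() either binds the memcg to the new page or, when the allocation failed, reverts the reservation; __memcg_kmem_uncharge_pages() later uses that binding to know whom to uncharge. A minimal user-space sketch of the same reserve/commit/cancel shape in C11; every name here (res_counter, charge_try, charge_cancel, alloc_charged, obj) is an illustrative stand-in, not a kernel API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct res_counter {
	_Atomic long usage;
	long limit;
};

struct obj {
	struct res_counter *owner;	/* set at commit time */
	void *mem;
};

/* Phase 1: reserve nr units against the limit; fails with no side effects. */
static bool charge_try(struct res_counter *rc, long nr)
{
	long old = atomic_load(&rc->usage);

	do {
		if (old + nr > rc->limit)
			return false;	/* over limit, nothing to undo */
	} while (!atomic_compare_exchange_weak(&rc->usage, &old, old + nr));
	return true;
}

/* Revert a reservation when the allocation itself failed. */
static void charge_cancel(struct res_counter *rc, long nr)
{
	atomic_fetch_sub(&rc->usage, nr);
}

/* Reserve, allocate, then commit the owner -- or unwind. */
static bool alloc_charged(struct res_counter *rc, struct obj *o, size_t sz)
{
	if (!charge_try(rc, 1))
		return false;
	o->mem = malloc(sz);
	if (!o->mem) {
		charge_cancel(rc, 1);	/* mirrors the !page case above */
		return false;
	}
	o->owner = rc;		/* commit: a later uncharge uses this binding */
	return true;
}

The property worth noticing, mirrored from the kernel code, is that neither a failed reservation nor a failed allocation leaves partial accounting state behind.
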
@@ -2621,27 +3713,24 @@ static int mem_cgroup_move_account(struct page *page,
	move_lock_mem_cgroup(from, &flags);

	if (!anon && page_mapped(page)) {
-		/* Update mapped_file data for mem_cgroup */
-		preempt_disable();
-		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		preempt_enable();
+		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+			       nr_pages);
+		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+			       nr_pages);
+	}
+
+	if (PageWriteback(page)) {
+		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+			       nr_pages);
+		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+			       nr_pages);
	}
-	mem_cgroup_charge_statistics(from, anon, -nr_pages);
-	if (uncharge)
-		/* This is not "cancel", but cancel_charge does all we need. */
-		__mem_cgroup_cancel_charge(from, nr_pages);
+
+	mem_cgroup_charge_statistics(from, page, anon, -nr_pages);

	/* caller should have done css_get */
	pc->mem_cgroup = to;
-	mem_cgroup_charge_statistics(to, anon, nr_pages);
-	/*
-	 * We charges against "to" which may not have any tasks. Then, "to"
-	 * can be under rmdir(). But in current implementation, caller of
-	 * this function is just force_empty() and move charge, so it's
-	 * guaranteed that "to" is never removed. So, we don't check rmdir
-	 * status here.
-	 */
+	mem_cgroup_charge_statistics(to, page, anon, nr_pages);
	move_unlock_mem_cgroup(from, &flags);
	ret = 0;
 unlock:
@@ -2655,25 +3744,37 @@ out:
	return ret;
 }

-/*
- * move charges to its parent.
+/**
+ * mem_cgroup_move_parent - moves page to the parent group
+ * @page: the page to move
+ * @pc: page_cgroup of the page
+ * @child: page's cgroup
+ *
+ * Move charges to its parent, or to the root cgroup if the group has no
+ * parent (aka use_hierarchy==0).
+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
+ * mem_cgroup_move_account fails), the failure is always temporary and
+ * it signals a race with a page removal/uncharge or migration. In the
+ * first case the page is on the way out and it will vanish from the LRU
+ * on the next attempt and the call should be retried later.
+ * Isolation from the LRU fails only if the page has been isolated from
+ * the LRU since we looked at it, and that usually means either global
+ * reclaim or migration going on. The page will either get back to the
+ * LRU or vanish.
+ * Finally, mem_cgroup_move_account fails only if the page got uncharged
+ * (!PageCgroupUsed) or moved to a different group. The page will
+ * disappear in the next attempt.
 */
-
 static int mem_cgroup_move_parent(struct page *page,
				  struct page_cgroup *pc,
-				  struct mem_cgroup *child,
-				  gfp_t gfp_mask)
+				  struct mem_cgroup *child)
 {
-	struct cgroup *cg = child->css.cgroup;
-	struct cgroup *pcg = cg->parent;
	struct mem_cgroup *parent;
	unsigned int nr_pages;
	unsigned long uninitialized_var(flags);
	int ret;

-	/* Is ROOT ? */
-	if (!pcg)
-		return -EINVAL;
+	VM_BUG_ON(mem_cgroup_is_root(child));

	ret = -EBUSY;
	if (!get_page_unless_zero(page))
@@ -2683,21 +3784,25 @@ static int mem_cgroup_move_parent(struct page *page,

	nr_pages = hpage_nr_pages(page);

-	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
-	if (ret)
-		goto put_back;
+	parent = parent_mem_cgroup(child);
+	/*
+	 * If no parent, move charges to root cgroup.
+ */ + if (!parent) + parent = root_mem_cgroup; - if (nr_pages > 1) + if (nr_pages > 1) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); flags = compound_lock_irqsave(page); + } - ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); - if (ret) - __mem_cgroup_cancel_charge(parent, nr_pages); + ret = mem_cgroup_move_account(page, nr_pages, + pc, child, parent); + if (!ret) + __mem_cgroup_cancel_local_charge(child, nr_pages); if (nr_pages > 1) compound_unlock_irqrestore(page, flags); -put_back: putback_lru_page(page); put: put_page(page); @@ -2705,23 +3810,23 @@ out: return ret; } -/* - * Charge the memory controller for page usage. - * Return - * 0 if the charge was successful - * < 0 if the cgroup is over its limit - */ -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) +int mem_cgroup_charge_anon(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; + struct mem_cgroup *memcg; bool oom = true; - int ret; + + if (mem_cgroup_disabled()) + return 0; + + VM_BUG_ON_PAGE(page_mapped(page), page); + VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); + VM_BUG_ON(!mm); if (PageTransHuge(page)) { nr_pages <<= compound_order(page); - VM_BUG_ON(!PageTransHuge(page)); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* * Never OOM-kill a process for a huge page. The * fault handler will fall back to regular pages. @@ -2729,100 +3834,86 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, oom = false; } - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); - if (ret == -ENOMEM) - return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); + if (!memcg) + return -ENOMEM; + __mem_cgroup_commit_charge(memcg, page, nr_pages, + MEM_CGROUP_CHARGE_TYPE_ANON, false); return 0; } -int mem_cgroup_newpage_charge(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - if (mem_cgroup_disabled()) - return 0; - VM_BUG_ON(page_mapped(page)); - VM_BUG_ON(page->mapping && !PageAnon(page)); - VM_BUG_ON(!mm); - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); -} - -static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, - enum charge_type ctype); - -int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) -{ - struct mem_cgroup *memcg = NULL; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - int ret; - - if (mem_cgroup_disabled()) - return 0; - if (PageCompound(page)) - return 0; - - if (unlikely(!mm)) - mm = &init_mm; - if (!page_is_file_cache(page)) - type = MEM_CGROUP_CHARGE_TYPE_SHMEM; - - if (!PageSwapCache(page)) - ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); - else { /* page is swapcache/shmem */ - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); - if (!ret) - __mem_cgroup_commit_charge_swapin(page, memcg, type); - } - return ret; -} - /* * While swap-in, try_charge -> commit or cancel, the page is locked. * And when try_charge() successfully returns, one refcnt to memcg without * struct page_cgroup is acquired. 
This refcnt will be consumed by
 * "commit()" or removed by "cancel()"
 */
-int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-				 struct page *page,
-				 gfp_t mask, struct mem_cgroup **memcgp)
+static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+					  struct page *page,
+					  gfp_t mask,
+					  struct mem_cgroup **memcgp)
 {
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg = NULL;
+	struct page_cgroup *pc;
	int ret;

-	*memcgp = NULL;
-
-	if (mem_cgroup_disabled())
-		return 0;
-
-	if (!do_swap_account)
-		goto charge_cur_mm;
+	pc = lookup_page_cgroup(page);
	/*
-	 * A racing thread's fault, or swapoff, may have already updated
-	 * the pte, and even removed page from swap cache: in those cases
-	 * do_swap_page()'s pte_same() test will fail; but there's also a
-	 * KSM case which does need to charge the page.
+	 * Every swap fault against a single page tries to charge the
+	 * page; bail as early as possible.  shmem_unuse() encounters
+	 * already charged pages, too.  The USED bit is protected by
+	 * the page lock, which serializes swap cache removal, which
+	 * in turn serializes uncharging.
	 */
-	if (!PageSwapCache(page))
-		goto charge_cur_mm;
-	memcg = try_get_mem_cgroup_from_page(page);
+	if (PageCgroupUsed(pc))
+		goto out;
+	if (do_swap_account)
+		memcg = try_get_mem_cgroup_from_page(page);
	if (!memcg)
-		goto charge_cur_mm;
-	*memcgp = memcg;
-	ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
+		memcg = get_mem_cgroup_from_mm(mm);
+	ret = mem_cgroup_try_charge(memcg, mask, 1, true);
	css_put(&memcg->css);
	if (ret == -EINTR)
-		ret = 0;
-	return ret;
-charge_cur_mm:
-	if (unlikely(!mm))
-		mm = &init_mm;
-	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
-	if (ret == -EINTR)
-		ret = 0;
-	return ret;
+		memcg = root_mem_cgroup;
+	else if (ret)
+		return ret;
+out:
+	*memcgp = memcg;
+	return 0;
+}
+
+int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
+				 gfp_t gfp_mask, struct mem_cgroup **memcgp)
+{
+	if (mem_cgroup_disabled()) {
+		*memcgp = NULL;
+		return 0;
+	}
+	/*
+	 * A racing thread's fault, or swapoff, may have already
+	 * updated the pte, and even removed page from swap cache: in
+	 * those cases unuse_pte()'s pte_same() test will fail; but
+	 * there's also a KSM case which does need to charge the page.
+	 */
+	if (!PageSwapCache(page)) {
+		struct mem_cgroup *memcg;
+
+		memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+		if (!memcg)
+			return -ENOMEM;
+		*memcgp = memcg;
+		return 0;
+	}
+	return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
+}
+
+void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled())
+		return;
+	if (!memcg)
+		return;
+	__mem_cgroup_cancel_charge(memcg, 1);
 }

 static void
@@ -2833,7 +3924,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
		return;
	if (!memcg)
		return;
-	cgroup_exclude_rmdir(&memcg->css);
	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);

	/*
@@ -2845,47 +3935,43 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
	 */
	if (do_swap_account && PageSwapCache(page)) {
		swp_entry_t ent = {.val = page_private(page)};
-		struct mem_cgroup *swap_memcg;
-		unsigned short id;
-
-		id = swap_cgroup_record(ent, 0);
-		rcu_read_lock();
-		swap_memcg = mem_cgroup_lookup(id);
-		if (swap_memcg) {
-			/*
-			 * This recorded memcg can be obsolete one.
So, avoid - * calling css_tryget - */ - if (!mem_cgroup_is_root(swap_memcg)) - res_counter_uncharge(&swap_memcg->memsw, - PAGE_SIZE); - mem_cgroup_swap_statistics(swap_memcg, false); - mem_cgroup_put(swap_memcg); - } - rcu_read_unlock(); + mem_cgroup_uncharge_swap(ent); } - /* - * At swapin, we may charge account against cgroup which has no tasks. - * So, rmdir()->pre_destroy() can be called while we do this charge. - * In that case, we need to call pre_destroy() again. check it here. - */ - cgroup_release_and_wakeup_rmdir(&memcg->css); } void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg) { __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_ANON); } -void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) +int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) { + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + struct mem_cgroup *memcg; + int ret; + if (mem_cgroup_disabled()) - return; + return 0; + if (PageCompound(page)) + return 0; + + if (PageSwapCache(page)) { /* shmem */ + ret = __mem_cgroup_try_charge_swapin(mm, page, + gfp_mask, &memcg); + if (ret) + return ret; + __mem_cgroup_commit_charge_swapin(page, memcg, type); + return 0; + } + + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); if (!memcg) - return; - __mem_cgroup_cancel_charge(memcg, 1); + return -ENOMEM; + __mem_cgroup_commit_charge(memcg, page, 1, type, false); + return 0; } static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, @@ -2945,7 +4031,8 @@ direct_uncharge: * uncharge if !page_mapped(page) */ static struct mem_cgroup * -__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) +__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, + bool end_migration) { struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; @@ -2955,12 +4042,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (mem_cgroup_disabled()) return NULL; - if (PageSwapCache(page)) - return NULL; - if (PageTransHuge(page)) { nr_pages <<= compound_order(page); - VM_BUG_ON(!PageTransHuge(page)); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); } /* * Check if our page_cgroup is valid @@ -2979,7 +4063,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) anon = PageAnon(page); switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_MAPPED: + case MEM_CGROUP_CHARGE_TYPE_ANON: /* * Generally PageAnon tells if it's the anon statistics to be * updated; but sometimes e.g. mem_cgroup_uncharge_page() is @@ -2989,7 +4073,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) /* fallthrough */ case MEM_CGROUP_CHARGE_TYPE_DROP: /* See mem_cgroup_prepare_migration() */ - if (page_mapped(page) || PageCgroupMigration(pc)) + if (page_mapped(page)) + goto unlock_out; + /* + * Pages under migration may not be uncharged. But + * end_migration() /must/ be the one uncharging the + * unused post-migration page and so it has to call + * here with the migration bit still set. See the + * res_counter handling below. 
+ */ + if (!end_migration && PageCgroupMigration(pc)) goto unlock_out; break; case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: @@ -3003,7 +4096,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(memcg, anon, -nr_pages); + mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); ClearPageCgroupUsed(pc); /* @@ -3016,14 +4109,19 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) unlock_page_cgroup(pc); /* * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed. + * will never be freed, so it's safe to call css_get(). */ memcg_check_events(memcg, page); if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { mem_cgroup_swap_statistics(memcg, true); - mem_cgroup_get(memcg); + css_get(&memcg->css); } - if (!mem_cgroup_is_root(memcg)) + /* + * Migration does not charge the res_counter for the + * replacement page, so leave it alone when phasing out the + * page that is unused after the migration. + */ + if (!end_migration && !mem_cgroup_is_root(memcg)) mem_cgroup_do_uncharge(memcg, nr_pages, ctype); return memcg; @@ -3038,15 +4136,29 @@ void mem_cgroup_uncharge_page(struct page *page) /* early check. */ if (page_mapped(page)) return; - VM_BUG_ON(page->mapping && !PageAnon(page)); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); + VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); + /* + * If the page is in swap cache, uncharge should be deferred + * to the swap path, which also properly accounts swap usage + * and handles memcg lifetime. + * + * Note that this check is not stable and reclaim may add the + * page to swap cache at any time after this. However, if the + * page is not in swap cache by the time page->mapcount hits + * 0, there won't be any page table references to the swap + * slot, and reclaim will free it and not actually write the + * page to disk. + */ + if (PageSwapCache(page)) + return; + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); } void mem_cgroup_uncharge_cache_page(struct page *page) { - VM_BUG_ON(page_mapped(page)); - VM_BUG_ON(page->mapping); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); + VM_BUG_ON_PAGE(page_mapped(page), page); + VM_BUG_ON_PAGE(page->mapping, page); + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); } /* @@ -3110,18 +4222,18 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) if (!swapout) /* this was a swap cache but the swap is unused ! */ ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - memcg = __mem_cgroup_uncharge_common(page, ctype); + memcg = __mem_cgroup_uncharge_common(page, ctype, false); /* * record memcg information, if swapout && memcg != NULL, - * mem_cgroup_get() was called in uncharge(). + * css_get() was called in uncharge(). */ if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, css_id(&memcg->css)); + swap_cgroup_record(ent, mem_cgroup_id(memcg)); } #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP /* * called from swap_entry_free(). remove record in swap_cgroup and * uncharge "memsw" account. 
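
[Editor's aside] swap_cgroup_record() above stores a 16-bit memcg id per swap entry, and mem_cgroup_move_swap_account() in the next hunk hands an entry over with swap_cgroup_cmpxchg(), succeeding only if the expected owner still holds it. A rough user-space C11 sketch of that ownership-handoff pattern follows; the flat table and the names (swap_owner, owner_record, owner_cmpxchg) are assumptions for illustration, not the kernel's implementation, which packs the ids into page-sized maps and serializes with a lock rather than atomics:

#include <stdatomic.h>
#include <stdint.h>

#define NR_SWAP_ENTRIES 4096

/* One 16-bit owner id per swap entry; 0 means "not owned by any group". */
static _Atomic uint16_t swap_owner[NR_SWAP_ENTRIES];

/* Unconditionally record a new owner and return the previous one,
 * in the spirit of swap_cgroup_record() above. */
static uint16_t owner_record(unsigned long ent, uint16_t id)
{
	return atomic_exchange(&swap_owner[ent], id);
}

/* Move ownership from old to new only if old still owns the entry,
 * in the spirit of swap_cgroup_cmpxchg() in the move-account path. */
static uint16_t owner_cmpxchg(unsigned long ent, uint16_t old, uint16_t new)
{
	uint16_t expected = old;

	if (atomic_compare_exchange_strong(&swap_owner[ent], &expected, new))
		return old;		/* success: caller moves the statistics */
	return expected;		/* lost the race: caller gives up */
}

The conditional exchange is what makes the handoff race-free: a concurrent swap-in or swapoff that rewrites the owner id first simply causes the move to report failure, and the caller leaves the statistics alone.
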
@@ -3145,7 +4257,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) if (!mem_cgroup_is_root(memcg)) res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + css_put(&memcg->css); } rcu_read_unlock(); } @@ -3155,7 +4267,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * @entry: swap entry to be moved * @from: mem_cgroup which the entry is moved from * @to: mem_cgroup which the entry is moved to - * @need_fixup: whether we should fixup res_counters and refcounts. * * It succeeds only when the swap_cgroup's record for this entry is the same * as the mem_cgroup's id of @from. @@ -3166,12 +4277,12 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * both res and memsw, and called css_get(). */ static int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) + struct mem_cgroup *from, struct mem_cgroup *to) { unsigned short old_id, new_id; - old_id = css_id(&from->css); - new_id = css_id(&to->css); + old_id = mem_cgroup_id(from); + new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { mem_cgroup_swap_statistics(from, false); @@ -3180,29 +4291,21 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, * This function is only called from task migration context now. * It postpones res_counter and refcount handling till the end * of task migration(mem_cgroup_clear_mc()) for performance - * improvement. But we cannot postpone mem_cgroup_get(to) - * because if the process that has been moved to @to does - * swap-in, the refcount of @to might be decreased to 0. + * improvement. But we cannot postpone css_get(to) because if + * the process that has been moved to @to does swap-in, the + * refcount of @to might be decreased to 0. + * + * We are in attach() phase, so the cgroup is guaranteed to be + * alive, so we can just call css_get(). */ - mem_cgroup_get(to); - if (need_fixup) { - if (!mem_cgroup_is_root(from)) - res_counter_uncharge(&from->memsw, PAGE_SIZE); - mem_cgroup_put(from); - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - if (!mem_cgroup_is_root(to)) - res_counter_uncharge(&to->res, PAGE_SIZE); - } + css_get(&to->css); return 0; } return -EINVAL; } #else static inline int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) + struct mem_cgroup *from, struct mem_cgroup *to) { return -EINVAL; } @@ -3212,19 +4315,21 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * Before starting migration, account PAGE_SIZE to mem_cgroup that the old * page belongs to. */ -int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) +void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, + struct mem_cgroup **memcgp) { struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; struct page_cgroup *pc; enum charge_type ctype; - int ret = 0; *memcgp = NULL; - VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) - return 0; + return; + + if (PageTransHuge(page)) + nr_pages <<= compound_order(page); pc = lookup_page_cgroup(page); lock_page_cgroup(pc); @@ -3269,24 +4374,9 @@ int mem_cgroup_prepare_migration(struct page *page, * we return here. 
*/
	if (!memcg)
-		return 0;
+		return;

	*memcgp = memcg;
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
-	css_put(&memcg->css);/* drop extra refcnt */
-	if (ret) {
-		if (PageAnon(page)) {
-			lock_page_cgroup(pc);
-			ClearPageCgroupMigration(pc);
-			unlock_page_cgroup(pc);
-			/*
-			 * The old page may be fully unmapped while we kept it.
-			 */
-			mem_cgroup_uncharge_page(page);
-		}
-		/* we'll need to revisit this error code (we have -EINTR) */
-		return -ENOMEM;
-	}
	/*
	 * We charge new page before it's used/mapped. So, even if unlock_page()
	 * is called before end_migration, we can catch all events on this new
@@ -3294,13 +4384,15 @@ int mem_cgroup_prepare_migration(struct page *page,
	 * mapcount will be finally 0 and we call uncharge in end_migration().
	 */
	if (PageAnon(page))
-		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
-	else if (page_is_file_cache(page))
-		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+		ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
	else
-		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-	__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
-	return ret;
+		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+	/*
+	 * The page is committed to the memcg, but it's not actually
+	 * charged to the res_counter since we plan on replacing the
+	 * old one and only one page is going to be left afterwards.
+	 */
+	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
 }

 /* remove redundant charge if migration failed */
@@ -3313,8 +4405,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
	if (!memcg)
		return;

-	/* blocks rmdir() */
-	cgroup_exclude_rmdir(&memcg->css);
+
	if (!migration_ok) {
		used = oldpage;
		unused = newpage;
@@ -3322,6 +4413,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
		used = newpage;
		unused = oldpage;
	}
+	anon = PageAnon(used);
+	__mem_cgroup_uncharge_common(unused,
+				     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
+					  : MEM_CGROUP_CHARGE_TYPE_CACHE,
+				     true);
+	css_put(&memcg->css);
	/*
	 * We disallowed uncharge of pages under migration because mapcount
	 * of the page goes down to zero, temporarily.
@@ -3331,10 +4428,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
	lock_page_cgroup(pc);
	ClearPageCgroupMigration(pc);
	unlock_page_cgroup(pc);
-	anon = PageAnon(used);
-	__mem_cgroup_uncharge_common(unused,
-				anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
-				     : MEM_CGROUP_CHARGE_TYPE_CACHE);

	/*
	 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3346,13 +4439,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
	 */
	if (anon)
		mem_cgroup_uncharge_page(used);
-	/*
-	 * At migration, we may charge account against cgroup which has no
-	 * tasks.
-	 * So, rmdir()->pre_destroy() can be called while we do this charge.
-	 * In that case, we need to call pre_destroy() again. check it here.
- */ - cgroup_release_and_wakeup_rmdir(&memcg->css); } /* @@ -3363,7 +4449,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; @@ -3373,14 +4459,19 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, pc = lookup_page_cgroup(oldpage); /* fix accounting on old pages */ lock_page_cgroup(pc); - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, false, -1); - ClearPageCgroupUsed(pc); + if (PageCgroupUsed(pc)) { + memcg = pc->mem_cgroup; + mem_cgroup_charge_statistics(memcg, oldpage, false, -1); + ClearPageCgroupUsed(pc); + } unlock_page_cgroup(pc); - if (PageSwapBacked(oldpage)) - type = MEM_CGROUP_CHARGE_TYPE_SHMEM; - + /* + * When called from shmem_replace_page(), in some cases the + * oldpage has already been charged, and in some cases not. + */ + if (!memcg) + return; /* * Even if newpage->mapping was NULL before starting replacement, * the newpage may be on LRU(or pagevec for LRU) already. We lock @@ -3419,14 +4510,12 @@ void mem_cgroup_print_bad_page(struct page *page) pc = lookup_page_cgroup_used(page); if (pc) { - printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", - pc, pc->flags, pc->mem_cgroup); + pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", + pc, pc->flags, pc->mem_cgroup); } } #endif -static DEFINE_MUTEX(set_limit_mutex); - static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { @@ -3455,7 +4544,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, /* * Rather than hide all in some function, I do this in * open coded manner. You see what this really does. - * We have to guarantee memcg->res.limit < memcg->memsw.limit. + * We have to guarantee memcg->res.limit <= memcg->memsw.limit. */ mutex_lock(&set_limit_mutex); memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); @@ -3485,7 +4574,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ - if (curusage >= oldusage) + if (curusage >= oldusage) retry_count--; else oldusage = curusage; @@ -3506,7 +4595,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, int enlarge = 0; /* see mem_cgroup_resize_res_limit */ - retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; + retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); while (retry_count) { if (signal_pending(current)) { @@ -3516,7 +4605,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, /* * Rather than hide all in some function, I do this in * open coded manner. You see what this really does. - * We have to guarantee memcg->res.limit < memcg->memsw.limit. + * We have to guarantee memcg->res.limit <= memcg->memsw.limit. */ mutex_lock(&set_limit_mutex); memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -3647,33 +4736,35 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return nr_reclaimed; } -/* - * This routine traverse page_cgroup in given list and drop them all. - * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 
+/**
+ * mem_cgroup_force_empty_list - clears LRU of a group
+ * @memcg: group to clear
+ * @node: NUMA node
+ * @zid: zone id
+ * @lru: lru to clear
+ *
+ * Traverse a specified page_cgroup list and try to drop them all. This doesn't
+ * reclaim the pages themselves - pages are moved to the parent (or root)
+ * group.
 */
-static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
+static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
				int node, int zid, enum lru_list lru)
 {
-	struct mem_cgroup_per_zone *mz;
-	unsigned long flags, loop;
+	struct lruvec *lruvec;
+	unsigned long flags;
	struct list_head *list;
	struct page *busy;
	struct zone *zone;
-	int ret = 0;

	zone = &NODE_DATA(node)->node_zones[zid];
-	mz = mem_cgroup_zoneinfo(memcg, node, zid);
-	list = &mz->lruvec.lists[lru];
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	list = &lruvec->lists[lru];

-	loop = mz->lru_size[lru];
-	/* give some margin against EBUSY etc...*/
-	loop += 256;
	busy = NULL;
-	while (loop--) {
+	do {
		struct page_cgroup *pc;
		struct page *page;

-		ret = 0;
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (list_empty(list)) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -3690,95 +4781,100 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,

		pc = lookup_page_cgroup(page);

-		ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
-		if (ret == -ENOMEM || ret == -EINTR)
-			break;
-
-		if (ret == -EBUSY || ret == -EINVAL) {
+		if (mem_cgroup_move_parent(page, pc, memcg)) {
			/* found lock contention or "pc" is obsolete. */
			busy = page;
			cond_resched();
		} else
			busy = NULL;
-	}
-
-	if (!ret && !list_empty(list))
-		return -EBUSY;
-	return ret;
+	} while (!list_empty(list));
 }

 /*
- * make mem_cgroup's charge to be 0 if there is no task.
+ * make mem_cgroup's charge 0 if there is no task, by moving
+ * all the charges and pages to the parent.
  * This enables deleting this mem_cgroup.
+ *
+ * Caller is responsible for holding css reference on the memcg.
  */
-static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
 {
-	int ret;
-	int node, zid, shrink;
-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct cgroup *cgrp = memcg->css.cgroup;
-
-	css_get(&memcg->css);
+	int node, zid;
+	u64 usage;

-	shrink = 0;
-	/* should free all ? */
-	if (free_all)
-		goto try_to_free;
-move_account:
	do {
-		ret = -EBUSY;
-		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
-			goto out;
-		ret = -EINTR;
-		if (signal_pending(current))
-			goto out;
		/* This is for making all *used* pages to be on LRU. */
		lru_add_drain_all();
		drain_all_stock_sync(memcg);
-		ret = 0;
		mem_cgroup_start_move(memcg);
-		for_each_node_state(node, N_HIGH_MEMORY) {
-			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
+		for_each_node_state(node, N_MEMORY) {
+			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				enum lru_list lru;
				for_each_lru(lru) {
-					ret = mem_cgroup_force_empty_list(memcg,
+					mem_cgroup_force_empty_list(memcg,
							node, zid, lru);
-					if (ret)
-						break;
				}
			}
-			if (ret)
-				break;
		}
		mem_cgroup_end_move(memcg);
		memcg_oom_recover(memcg);
-		/* it seems parent cgroup doesn't have enough mem */
-		if (ret == -ENOMEM)
-			goto try_to_free;
		cond_resched();
-		/* "ret" should also be checked to ensure all lists are empty. */
-	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
-out:
-	css_put(&memcg->css);
-	return ret;
-try_to_free:
+		/*
+		 * Kernel memory may not necessarily be trackable to a specific
+		 * process.
So such pages are not migrated, and therefore we can't
+		 * expect their charge to drop to 0 here.
+		 * Having res filled up with kmem only is enough.
+		 *
+		 * This is a safety check because mem_cgroup_force_empty_list
+		 * could have raced with mem_cgroup_replace_page_cache callers,
+		 * so the lru seemed empty but the page could have been added
+		 * right after the check. RES_USAGE should be safe as we always
+		 * charge before adding to the LRU.
+		 */
+		usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
+			res_counter_read_u64(&memcg->kmem, RES_USAGE);
+	} while (usage > 0);
+}
+
+static inline bool memcg_has_children(struct mem_cgroup *memcg)
+{
+	lockdep_assert_held(&memcg_create_mutex);
+	/*
+	 * The lock does not prevent addition or deletion to the list
+	 * of children, but it prevents a new child from being
+	 * initialized based on this parent in css_online(), so it's
+	 * enough to decide whether hierarchically inherited
+	 * attributes can still be changed or not.
+	 */
+	return memcg->use_hierarchy &&
+		!list_empty(&memcg->css.cgroup->children);
+}
+
+/*
+ * Reclaims as many pages from the given memcg as possible and moves
+ * the rest to the parent.
+ *
+ * Caller is responsible for holding css reference for memcg.
+ */
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
+{
+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	struct cgroup *cgrp = memcg->css.cgroup;
+
	/* returns EBUSY if there is a task or if we come here twice. */
-	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
-		ret = -EBUSY;
-		goto out;
-	}
+	if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children))
+		return -EBUSY;
+
	/* we call try-to-free pages to make this cgroup empty */
	lru_add_drain_all();
	/* try to free all pages in this cgroup */
-	shrink = 1;
	while (nr_retries &&
	       res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
		int progress;

-		if (signal_pending(current)) {
-			ret = -EINTR;
-			goto out;
-		}
+		if (signal_pending(current))
+			return -EINTR;
+
		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
						false);
		if (!progress) {
*/ - goto move_account; + mem_cgroup_reparent_charges(memcg); + + return 0; } -int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) +static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, + unsigned int event) { - return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); -} + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + if (mem_cgroup_is_root(memcg)) + return -EINVAL; + return mem_cgroup_force_empty(memcg); +} -static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) +static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return mem_cgroup_from_cont(cont)->use_hierarchy; + return mem_cgroup_from_css(css)->use_hierarchy; } -static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, - u64 val) +static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { int retval = 0; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - struct cgroup *parent = cont->parent; - struct mem_cgroup *parent_memcg = NULL; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + + mutex_lock(&memcg_create_mutex); - if (parent) - parent_memcg = mem_cgroup_from_cont(parent); + if (memcg->use_hierarchy == val) + goto out; - cgroup_lock(); /* * If parent's use_hierarchy is set, we can't make any modifications * in the child subtrees. If it is unset, then the change can @@ -3826,13 +4928,15 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, */ if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { - if (list_empty(&cont->children)) + if (list_empty(&memcg->css.cgroup->children)) memcg->use_hierarchy = val; else retval = -EBUSY; } else retval = -EINVAL; - cgroup_unlock(); + +out: + mutex_unlock(&memcg_create_mutex); return retval; } @@ -3864,23 +4968,30 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return res_counter_read_u64(&memcg->memsw, RES_USAGE); } + /* + * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS + * as well as in MEM_CGROUP_STAT_RSS_HUGE. 
+ */ val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); + val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); return val << PAGE_SHIFT; } -static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) +static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); u64 val; - int type, name; + int name; + enum res_type type; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); + switch (type) { case _MEM: if (name == RES_USAGE) @@ -3894,25 +5005,159 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) else val = res_counter_read_u64(&memcg->memsw, name); break; + case _KMEM: + val = res_counter_read_u64(&memcg->kmem, name); + break; default: BUG(); } + return val; } + +#ifdef CONFIG_MEMCG_KMEM +/* should be called with activate_kmem_mutex held */ +static int __memcg_activate_kmem(struct mem_cgroup *memcg, + unsigned long long limit) +{ + int err = 0; + int memcg_id; + + if (memcg_kmem_is_active(memcg)) + return 0; + + /* + * We are going to allocate memory for data shared by all memory + * cgroups so let's stop accounting here. + */ + memcg_stop_kmem_account(); + + /* + * For simplicity, we won't allow this to be disabled. It also can't + * be changed if the cgroup has children already, or if tasks had + * already joined. + * + * If tasks join before we set the limit, a person looking at + * kmem.usage_in_bytes will have no way to determine when it took + * place, which makes the value quite meaningless. + * + * After it first became limited, changes in the value of the limit are + * of course permitted. + */ + mutex_lock(&memcg_create_mutex); + if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg)) + err = -EBUSY; + mutex_unlock(&memcg_create_mutex); + if (err) + goto out; + + memcg_id = ida_simple_get(&kmem_limited_groups, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (memcg_id < 0) { + err = memcg_id; + goto out; + } + + /* + * Make sure we have enough space for this cgroup in each root cache's + * memcg_params. + */ + err = memcg_update_all_caches(memcg_id + 1); + if (err) + goto out_rmid; + + memcg->kmemcg_id = memcg_id; + INIT_LIST_HEAD(&memcg->memcg_slab_caches); + mutex_init(&memcg->slab_caches_mutex); + + /* + * We couldn't have accounted to this cgroup, because it hasn't got the + * active bit set yet, so this should succeed. + */ + err = res_counter_set_limit(&memcg->kmem, limit); + VM_BUG_ON(err); + + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * Setting the active bit after enabling static branching will + * guarantee no one starts accounting before all call sites are + * patched. 
+ */ + memcg_kmem_set_active(memcg); +out: + memcg_resume_kmem_account(); + return err; + +out_rmid: + ida_simple_remove(&kmem_limited_groups, memcg_id); + goto out; +} + +static int memcg_activate_kmem(struct mem_cgroup *memcg, + unsigned long long limit) +{ + int ret; + + mutex_lock(&activate_kmem_mutex); + ret = __memcg_activate_kmem(memcg, limit); + mutex_unlock(&activate_kmem_mutex); + return ret; +} + +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, + unsigned long long val) +{ + int ret; + + if (!memcg_kmem_is_active(memcg)) + ret = memcg_activate_kmem(memcg, val); + else + ret = res_counter_set_limit(&memcg->kmem, val); + return ret; +} + +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + int ret = 0; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + + if (!parent) + return 0; + + mutex_lock(&activate_kmem_mutex); + /* + * If the parent cgroup is not kmem-active now, it cannot be activated + * after this point, because it has at least one child already. + */ + if (memcg_kmem_is_active(parent)) + ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); + mutex_unlock(&activate_kmem_mutex); + return ret; +} +#else +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, + unsigned long long val) +{ + return -EINVAL; +} +#endif /* CONFIG_MEMCG_KMEM */ + /* * The user of this function is... * RES_LIMIT. */ -static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, - const char *buffer) +static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, + char *buffer) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - int type, name; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + enum res_type type; + int name; unsigned long long val; int ret; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); + switch (name) { case RES_LIMIT: if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ @@ -3925,8 +5170,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, break; if (type == _MEM) ret = mem_cgroup_resize_limit(memcg, val); - else + else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); + else if (type == _KMEM) + ret = memcg_update_kmem_limit(memcg, val); + else + return -EINVAL; break; case RES_SOFT_LIMIT: ret = res_counter_memparse_write_strategy(buffer, &val); @@ -3952,18 +5201,15 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, unsigned long long *mem_limit, unsigned long long *memsw_limit) { - struct cgroup *cgroup; unsigned long long min_limit, min_memsw_limit, tmp; min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - cgroup = memcg->css.cgroup; if (!memcg->use_hierarchy) goto out; - while (cgroup->parent) { - cgroup = cgroup->parent; - memcg = mem_cgroup_from_cont(cgroup); + while (css_parent(&memcg->css)) { + memcg = mem_cgroup_from_css(css_parent(&memcg->css)); if (!memcg->use_hierarchy) break; tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -3976,299 +5222,243 @@ out: *memsw_limit = min_memsw_limit; } -static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) +static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) { - struct mem_cgroup *memcg; - int type, name; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + int name; + enum res_type type; - memcg = mem_cgroup_from_cont(cont); type = MEMFILE_TYPE(event); name = 
MEMFILE_ATTR(event); + switch (name) { case RES_MAX_USAGE: if (type == _MEM) res_counter_reset_max(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_max(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_max(&memcg->kmem); + else + return -EINVAL; break; case RES_FAILCNT: if (type == _MEM) res_counter_reset_failcnt(&memcg->res); - else + else if (type == _MEMSWAP) res_counter_reset_failcnt(&memcg->memsw); + else if (type == _KMEM) + res_counter_reset_failcnt(&memcg->kmem); + else + return -EINVAL; break; } return 0; } -static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, +static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; + return mem_cgroup_from_css(css)->move_charge_at_immigrate; } #ifdef CONFIG_MMU -static int mem_cgroup_move_charge_write(struct cgroup *cgrp, +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); if (val >= (1 << NR_MOVE_TYPE)) return -EINVAL; + /* - * We check this value several times in both in can_attach() and - * attach(), so we need cgroup lock to prevent this value from being - * inconsistent. + * No kind of locking is needed in here, because ->can_attach() will + * check this value once in the beginning of the process, and then carry + * on with stale data. This means that changes to this value will only + * affect task migrations starting after the change. */ - cgroup_lock(); memcg->move_charge_at_immigrate = val; - cgroup_unlock(); - return 0; } #else -static int mem_cgroup_move_charge_write(struct cgroup *cgrp, +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { return -ENOSYS; } #endif - -/* For read statistics */ -enum { - MCS_CACHE, - MCS_RSS, - MCS_FILE_MAPPED, - MCS_PGPGIN, - MCS_PGPGOUT, - MCS_SWAP, - MCS_PGFAULT, - MCS_PGMAJFAULT, - MCS_INACTIVE_ANON, - MCS_ACTIVE_ANON, - MCS_INACTIVE_FILE, - MCS_ACTIVE_FILE, - MCS_UNEVICTABLE, - NR_MCS_STAT, -}; - -struct mcs_total_stat { - s64 stat[NR_MCS_STAT]; -}; - -struct { - char *local_name; - char *total_name; -} memcg_stat_strings[NR_MCS_STAT] = { - {"cache", "total_cache"}, - {"rss", "total_rss"}, - {"mapped_file", "total_mapped_file"}, - {"pgpgin", "total_pgpgin"}, - {"pgpgout", "total_pgpgout"}, - {"swap", "total_swap"}, - {"pgfault", "total_pgfault"}, - {"pgmajfault", "total_pgmajfault"}, - {"inactive_anon", "total_inactive_anon"}, - {"active_anon", "total_active_anon"}, - {"inactive_file", "total_inactive_file"}, - {"active_file", "total_active_file"}, - {"unevictable", "total_unevictable"} -}; - - -static void -mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) -{ - s64 val; - - /* per cpu stat */ - val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); - s->stat[MCS_CACHE] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS); - s->stat[MCS_RSS] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); - s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; - val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); - s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); - s->stat[MCS_PGPGOUT] += val; - if (do_swap_account) { - val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); - s->stat[MCS_SWAP] += val * PAGE_SIZE; - } - val = 
mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); - s->stat[MCS_PGFAULT] += val; - val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); - s->stat[MCS_PGMAJFAULT] += val; - - /* per zone stat */ - val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); - s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); - s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); - s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); - s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); - s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; -} - -static void -mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) -{ - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) - mem_cgroup_get_local_stat(iter, s); -} - #ifdef CONFIG_NUMA -static int mem_control_numa_stat_show(struct seq_file *m, void *arg) +static int memcg_numa_stat_show(struct seq_file *m, void *v) { + struct numa_stat { + const char *name; + unsigned int lru_mask; + }; + + static const struct numa_stat stats[] = { + { "total", LRU_ALL }, + { "file", LRU_ALL_FILE }, + { "anon", LRU_ALL_ANON }, + { "unevictable", BIT(LRU_UNEVICTABLE) }, + }; + const struct numa_stat *stat; int nid; - unsigned long total_nr, file_nr, anon_nr, unevictable_nr; - unsigned long node_nr; - struct cgroup *cont = m->private; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - - total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); - seq_printf(m, "total=%lu", total_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); - - file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); - seq_printf(m, "file=%lu", file_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - LRU_ALL_FILE); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); - - anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); - seq_printf(m, "anon=%lu", anon_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - LRU_ALL_ANON); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); - - unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); - seq_printf(m, "unevictable=%lu", unevictable_nr); - for_each_node_state(nid, N_HIGH_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - BIT(LRU_UNEVICTABLE)); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); + unsigned long nr; + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); + seq_printf(m, "%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); + } + + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + struct mem_cgroup *iter; + + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); + seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_node_nr_lru_pages( + 
iter, nid, stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); + } + return 0; } #endif /* CONFIG_NUMA */ -static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, - struct cgroup_map_cb *cb) +static inline void mem_cgroup_lru_names_not_uptodate(void) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - struct mcs_total_stat mystat; - int i; - - memset(&mystat, 0, sizeof(mystat)); - mem_cgroup_get_local_stat(memcg, &mystat); + BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); +} +static int memcg_stat_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup *mi; + unsigned int i; - for (i = 0; i < NR_MCS_STAT; i++) { - if (i == MCS_SWAP && !do_swap_account) + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; - cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); + seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], + mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); } + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) + seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], + mem_cgroup_read_events(memcg, i)); + + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], + mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); + /* Hierarchical information */ { unsigned long long limit, memsw_limit; memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); - cb->fill(cb, "hierarchical_memory_limit", limit); + seq_printf(m, "hierarchical_memory_limit %llu\n", limit); if (do_swap_account) - cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); + seq_printf(m, "hierarchical_memsw_limit %llu\n", + memsw_limit); } - memset(&mystat, 0, sizeof(mystat)); - mem_cgroup_get_total_stat(memcg, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) { - if (i == MCS_SWAP && !do_swap_account) + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + long long val = 0; + + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; - cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; + seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); + } + + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + unsigned long long val = 0; + + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_read_events(mi, i); + seq_printf(m, "total_%s %llu\n", + mem_cgroup_events_names[i], val); + } + + for (i = 0; i < NR_LRU_LISTS; i++) { + unsigned long long val = 0; + + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; + seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); } #ifdef CONFIG_DEBUG_VM { int nid, zid; struct mem_cgroup_per_zone *mz; + struct zone_reclaim_stat *rstat; unsigned long recent_rotated[2] = {0, 0}; unsigned long recent_scanned[2] = {0, 0}; for_each_online_node(nid) for (zid = 0; zid < MAX_NR_ZONES; zid++) { mz = mem_cgroup_zoneinfo(memcg, nid, zid); + rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += - mz->reclaim_stat.recent_rotated[0]; - recent_rotated[1] += - mz->reclaim_stat.recent_rotated[1]; - recent_scanned[0] += - mz->reclaim_stat.recent_scanned[0]; - recent_scanned[1] += - mz->reclaim_stat.recent_scanned[1]; + recent_rotated[0] += rstat->recent_rotated[0]; + recent_rotated[1] += rstat->recent_rotated[1]; + recent_scanned[0] += rstat->recent_scanned[0]; + recent_scanned[1] += rstat->recent_scanned[1]; } - cb->fill(cb, 
"recent_rotated_anon", recent_rotated[0]); - cb->fill(cb, "recent_rotated_file", recent_rotated[1]); - cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); - cb->fill(cb, "recent_scanned_file", recent_scanned[1]); + seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); + seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); + seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); + seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); } #endif return 0; } -static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) +static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); return mem_cgroup_swappiness(memcg); } -static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, - u64 val) +static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup *parent; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - if (val > 100) + if (val > 100 || !parent) return -EINVAL; - if (cgrp->parent == NULL) - return -EINVAL; - - parent = mem_cgroup_from_cont(cgrp->parent); - - cgroup_lock(); + mutex_lock(&memcg_create_mutex); /* If under hierarchy, only empty-root can set this value */ - if ((parent->use_hierarchy) || - (memcg->use_hierarchy && !list_empty(&cgrp->children))) { - cgroup_unlock(); + if ((parent->use_hierarchy) || memcg_has_children(memcg)) { + mutex_unlock(&memcg_create_mutex); return -EINVAL; } memcg->swappiness = val; - cgroup_unlock(); + mutex_unlock(&memcg_create_mutex); return 0; } @@ -4291,7 +5481,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) usage = mem_cgroup_usage(memcg, swap); /* - * current_threshold points to threshold just below usage. + * current_threshold points to threshold just below or equal to usage. * If it's not true, a threshold was crossed after last * call of __mem_cgroup_threshold(). 
*/ @@ -4340,7 +5530,13 @@ static int compare_thresholds(const void *a, const void *b) const struct mem_cgroup_threshold *_a = a; const struct mem_cgroup_threshold *_b = b; - return _a->threshold - _b->threshold; + if (_a->threshold > _b->threshold) + return 1; + + if (_a->threshold < _b->threshold) + return -1; + + return 0; } static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) @@ -4360,13 +5556,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int mem_cgroup_usage_register_event(struct cgroup *cgrp, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -4417,14 +5611,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, /* Find current threshold */ new->current_threshold = -1; for (i = 0; i < size; i++) { - if (new->entries[i].threshold < usage) { + if (new->entries[i].threshold <= usage) { /* * new->current_threshold will not be used until * rcu_assign_pointer(), so it's safe to increment * it here. */ ++new->current_threshold; - } + } else + break; } /* Free old spare buffer and save old primary buffer as spare */ @@ -4442,13 +5637,23 @@ unlock: return ret; } -static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, - struct cftype *cft, struct eventfd_ctx *eventfd) +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - int type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -4493,7 +5698,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, continue; new->entries[j] = thresholds->primary->entries[i]; - if (new->entries[j].threshold < usage) { + if (new->entries[j].threshold <= usage) { /* * new->current_threshold will not be used * until rcu_assign_pointer(), so it's safe to increment @@ -4521,14 +5726,23 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } -static int mem_cgroup_oom_register_event(struct cgroup *cgrp, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); +} + +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *event; - int type = 
MEMFILE_TYPE(cft->private); - BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; @@ -4546,14 +5760,10 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, return 0; } -static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, - struct cftype *cft, struct eventfd_ctx *eventfd) +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *ev, *tmp; - int type = MEMFILE_TYPE(cft->private); - - BUG_ON(type != _OOM_TYPE); spin_lock(&memcg_oom_lock); @@ -4567,125 +5777,357 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, spin_unlock(&memcg_oom_lock); } -static int mem_cgroup_oom_control_read(struct cgroup *cgrp, - struct cftype *cft, struct cgroup_map_cb *cb) +static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - - cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); - if (atomic_read(&memcg->under_oom)) - cb->fill(cb, "under_oom", 1); - else - cb->fill(cb, "under_oom", 0); + seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); + seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); return 0; } -static int mem_cgroup_oom_control_write(struct cgroup *cgrp, +static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup *parent; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!cgrp->parent || !((val == 0) || (val == 1))) + if (!parent || !((val == 0) || (val == 1))) return -EINVAL; - parent = mem_cgroup_from_cont(cgrp->parent); - - cgroup_lock(); + mutex_lock(&memcg_create_mutex); /* oom-kill-disable is a flag for subhierarchy. 
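 * It may therefore only be changed while the memcg has no children and is not under a hierarchical parent, so every descendant created later inherits one consistent value (enforced by the check below).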
*/ - if ((parent->use_hierarchy) || - (memcg->use_hierarchy && !list_empty(&cgrp->children))) { - cgroup_unlock(); + if ((parent->use_hierarchy) || memcg_has_children(memcg)) { + mutex_unlock(&memcg_create_mutex); return -EINVAL; } memcg->oom_kill_disable = val; if (!val) memcg_oom_recover(memcg); - cgroup_unlock(); + mutex_unlock(&memcg_create_mutex); return 0; } -#ifdef CONFIG_NUMA -static const struct file_operations mem_control_numa_stat_file_operations = { - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int mem_control_numa_stat_open(struct inode *unused, struct file *file) +#ifdef CONFIG_MEMCG_KMEM +static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { - struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; + int ret; + + memcg->kmemcg_id = -1; + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; - file->f_op = &mem_control_numa_stat_file_operations; - return single_open(file, mem_control_numa_stat_show, cont); + return mem_cgroup_sockets_init(memcg, ss); +} + +static void memcg_destroy_kmem(struct mem_cgroup *memcg) +{ + mem_cgroup_sockets_destroy(memcg); } -#endif /* CONFIG_NUMA */ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM -static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) { + if (!memcg_kmem_is_active(memcg)) + return; + /* - * Part of this would be better living in a separate allocation - * function, leaving us with just the cgroup tree population work. - * We, however, depend on state such as network's proto_list that - * is only initialized after cgroup creation. I found the less - * cumbersome way to deal with it to defer it all to populate time + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contains objects from various + * processes. As we refrain from taking a reference for every + * such allocation, we have to be careful when doing uncharge + * (see memcg_uncharge_kmem) and here during offlining. + * + * The idea is that only the _last_ uncharge which sees + * the dead memcg will drop the last reference. An additional + * reference is taken here before the group is marked dead + * and is then paired with a css_put during uncharge resp. here. + * + * Although this might sound strange as this path is called from + * css_offline() when the reference might have dropped down to 0 + * and shouldn't be incremented anymore (css_tryget would fail), + * we do not have other options because of the kmem allocations' + * lifetime. */ - return mem_cgroup_sockets_init(cont, ss); -}; + css_get(&memcg->css); -static void kmem_cgroup_destroy(struct cgroup *cont) -{ - mem_cgroup_sockets_destroy(cont); + memcg_kmem_mark_dead(memcg); + + if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) + return; + + if (memcg_kmem_test_and_clear_dead(memcg)) + css_put(&memcg->css); } #else -static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) +static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { return 0; } -static void kmem_cgroup_destroy(struct cgroup *cont) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) +{ +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) { } #endif +/* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered. It tries to support fully configurable + * events for each user. 
Such a level of flexibility is completely + * unnecessary, especially in light of the planned unified hierarchy. + * + * Please deprecate this and replace it with something simpler if at all + * possible. + */ + +/* + * Unregister event and free resources. + * + * Gets called from a workqueue. + */ +static void memcg_event_remove(struct work_struct *work) +{ + struct mem_cgroup_event *event = + container_of(work, struct mem_cgroup_event, remove); + struct mem_cgroup *memcg = event->memcg; + + remove_wait_queue(event->wqh, &event->wait); + + event->unregister_event(memcg, event->eventfd); + + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd, 1); + + eventfd_ctx_put(event->eventfd); + kfree(event); + css_put(&memcg->css); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int memcg_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct mem_cgroup_event *event = + container_of(wait, struct mem_cgroup_event, wait); + struct mem_cgroup *memcg = event->memcg; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + /* + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will clean up + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. + */ + spin_lock(&memcg->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but memcg_event_remove() + * may sleep, so we have to call it from a workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); + } + + return 0; +} + +static void memcg_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct mem_cgroup_event *event = + container_of(pt, struct mem_cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * DO NOT USE IN NEW FILES. + * + * Parse the input and register a new cgroup event handler. + * + * Input must be in format '<event_fd> <control_fd> <args>'. + * Interpretation of args is defined by the control file implementation. 
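+ * E.g. (illustrative numbers) writing "13 7 4194304" registers eventfd 13 against the control file open on fd 7; for memory.usage_in_bytes the trailing argument is parsed as a 4MiB usage threshold.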
+ */ +static int memcg_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, char *buffer) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event; + struct cgroup_subsys_state *cfile_css; + unsigned int efd, cfd; + struct fd efile; + struct fd cfile; + const char *name; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + event->memcg = memcg; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, memcg_event_wake); + INIT_WORK(&event->remove, memcg_event_remove); + + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; + } + + event->eventfd = eventfd_ctx_fileget(efile.file); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto out_put_efile; + } + + cfile = fdget(cfd); + if (!cfile.file) { + ret = -EBADF; + goto out_put_eventfd; + } + + /* the process needs read permission on the control file */ + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = inode_permission(file_inode(cfile.file), MAY_READ); + if (ret < 0) + goto out_put_cfile; + + /* + * Determine the event callbacks and set them in @event. This used + * to be done via struct cftype but cgroup core no longer knows + * about these events. The following is crude but the whole thing + * is for compatibility anyway. + * + * DO NOT ADD NEW FILES. + */ + name = cfile.file->f_dentry->d_name.name; + + if (!strcmp(name, "memory.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else if (!strcmp(name, "memory.oom_control")) { + event->register_event = mem_cgroup_oom_register_event; + event->unregister_event = mem_cgroup_oom_unregister_event; + } else if (!strcmp(name, "memory.pressure_level")) { + event->register_event = vmpressure_register_event; + event->unregister_event = vmpressure_unregister_event; + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { + event->register_event = memsw_cgroup_usage_register_event; + event->unregister_event = memsw_cgroup_usage_unregister_event; + } else { + ret = -EINVAL; + goto out_put_cfile; + } + + /* + * Verify that @cfile belongs to @css. Also, remaining events are + * automatically removed on cgroup destruction but the removal is + * asynchronous, so take an extra ref on @css. 
+ */ + cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, + &memory_cgrp_subsys); + ret = -EINVAL; + if (IS_ERR(cfile_css)) + goto out_put_cfile; + if (cfile_css != css) { + css_put(cfile_css); + goto out_put_cfile; + } + + ret = event->register_event(memcg, event->eventfd, buffer); + if (ret) + goto out_put_css; + + efile.file->f_op->poll(efile.file, &event->pt); + + spin_lock(&memcg->event_list_lock); + list_add(&event->list, &memcg->event_list); + spin_unlock(&memcg->event_list_lock); + + fdput(cfile); + fdput(efile); + + return 0; + +out_put_css: + css_put(css); +out_put_cfile: + fdput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fdput(efile); +out_kfree: + kfree(event); + + return ret; +} + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), - .read_u64 = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, + .read_u64 = mem_cgroup_read_u64, }, { .name = "max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read_u64 = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), .write_string = mem_cgroup_write, - .read_u64 = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), .write_string = mem_cgroup_write, - .read_u64 = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read_u64 = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "stat", - .read_map = mem_control_stat_show, + .seq_show = memcg_stat_show, }, { .name = "force_empty", @@ -4693,10 +6135,17 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "use_hierarchy", + .flags = CFTYPE_INSANE, .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, { + .name = "cgroup.event_control", /* XXX: for compat */ + .write_string = memcg_write_event_control, + .flags = CFTYPE_NO_PREFIX, + .mode = S_IWUGO, + }, + { .name = "swappiness", .read_u64 = mem_cgroup_swappiness_read, .write_u64 = mem_cgroup_swappiness_write, @@ -4708,69 +6157,85 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "oom_control", - .read_map = mem_cgroup_oom_control_read, + .seq_show = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, - .register_event = mem_cgroup_oom_register_event, - .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, + { + .name = "pressure_level", + }, #ifdef CONFIG_NUMA { .name = "numa_stat", - .open = mem_control_numa_stat_open, - .mode = S_IRUGO, + .seq_show = memcg_numa_stat_show, + }, +#endif +#ifdef CONFIG_MEMCG_KMEM + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write_string = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.failcnt", + .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), + .trigger = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), + .trigger = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, +#ifdef 
CONFIG_SLABINFO + { + .name = "kmem.slabinfo", + .seq_show = mem_cgroup_slabinfo_read, }, #endif +#endif + { }, /* terminate */ }; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP static struct cftype memsw_cgroup_files[] = { { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), - .read_u64 = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read_u64 = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.limit_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), .write_string = mem_cgroup_write, - .read_u64 = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.failcnt", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read_u64 = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, + { }, /* terminate */ }; - -static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) -{ - if (!do_swap_account) - return 0; - return cgroup_add_files(cont, ss, memsw_cgroup_files, - ARRAY_SIZE(memsw_cgroup_files)); -}; -#else -static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) -{ - return 0; -} #endif - static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; struct mem_cgroup_per_zone *mz; - enum lru_list lru; int zone, tmp = node; /* * This routine is called against possible nodes. @@ -4788,32 +6253,29 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; - for_each_lru(lru) - INIT_LIST_HEAD(&mz->lruvec.lists[lru]); + lruvec_init(&mz->lruvec); mz->usage_in_excess = 0; mz->on_tree = false; mz->memcg = memcg; } - memcg->info.nodeinfo[node] = pn; + memcg->nodeinfo[node] = pn; return 0; } static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->info.nodeinfo[node]); + kfree(memcg->nodeinfo[node]); } static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - int size = sizeof(struct mem_cgroup); + size_t size; - /* Can be very big if MAX_NUMNODES is very big */ - if (size < PAGE_SIZE) - memcg = kzalloc(size, GFP_KERNEL); - else - memcg = vzalloc(size); + size = sizeof(struct mem_cgroup); + size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); + memcg = kzalloc(size, GFP_KERNEL); if (!memcg) return NULL; @@ -4824,35 +6286,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) return memcg; out_free: - if (size < PAGE_SIZE) - kfree(memcg); - else - vfree(memcg); + kfree(memcg); return NULL; } /* - * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. - */ -static void vfree_work(struct work_struct *work) -{ - struct mem_cgroup *memcg; - - memcg = container_of(work, struct mem_cgroup, work_freeing); - vfree(memcg); -} -static void vfree_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, vfree_work); - schedule_work(&memcg->work_freeing); -} - -/* * At destroying mem_cgroup, references from swap_cgroup can remain. 
* (scanning all at force_empty is too costly...) * @@ -4868,36 +6306,25 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) int node; mem_cgroup_remove_from_trees(memcg); - free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); - if (sizeof(struct mem_cgroup) < PAGE_SIZE) - kfree_rcu(memcg, rcu_freeing); - else - call_rcu(&memcg->rcu_freeing, vfree_rcu); -} - -static void mem_cgroup_get(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->refcnt); -} - -static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) -{ - if (atomic_sub_and_test(count, &memcg->refcnt)) { - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - __mem_cgroup_free(memcg); - if (parent) - mem_cgroup_put(parent); - } -} -static void mem_cgroup_put(struct mem_cgroup *memcg) -{ - __mem_cgroup_put(memcg, 1); + /* + * We need to make sure that (at least for now), the jump label + * destruction code runs outside of the cgroup lock. This is because + * get_online_cpus(), which is called from the static_branch update, + * can't be called inside the cgroup_lock. cpusets are the ones + * enforcing this dependency, so if they ever change, we might as well. + * + * schedule_work() will guarantee this happens. Be careful if you need + * to move this code around, and make sure it is outside + * the cgroup_lock. + */ + disarm_static_keys(memcg); + kfree(memcg); } /* @@ -4911,19 +6338,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(parent_mem_cgroup); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP -static void __init enable_swap_cgroup(void) -{ - if (!mem_cgroup_disabled() && really_do_swap_account) - do_swap_account = 1; -} -#else -static void __init enable_swap_cgroup(void) -{ -} -#endif - -static int mem_cgroup_soft_limit_tree_init(void) +static void __init mem_cgroup_soft_limit_tree_init(void) { struct mem_cgroup_tree_per_node *rtpn; struct mem_cgroup_tree_per_zone *rtpz; @@ -4934,8 +6349,7 @@ static int mem_cgroup_soft_limit_tree_init(void) if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); - if (!rtpn) - goto err_cleanup; + BUG_ON(!rtpn); soft_limit_tree.rb_tree_per_node[node] = rtpn; @@ -4945,23 +6359,12 @@ static int mem_cgroup_soft_limit_tree_init(void) spin_lock_init(&rtpz->lock); } } - return 0; - -err_cleanup: - for_each_node(node) { - if (!soft_limit_tree.rb_tree_per_node[node]) - break; - kfree(soft_limit_tree.rb_tree_per_node[node]); - soft_limit_tree.rb_tree_per_node[node] = NULL; - } - return 1; - } static struct cgroup_subsys_state * __ref -mem_cgroup_create(struct cgroup *cont) +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct mem_cgroup *memcg, *parent; + struct mem_cgroup *memcg; long error = -ENOMEM; int node; @@ -4974,85 +6377,166 @@ mem_cgroup_create(struct cgroup *cont) goto free_out; /* root ? 
*/ - if (cont->parent == NULL) { - int cpu; - enable_swap_cgroup(); - parent = NULL; - if (mem_cgroup_soft_limit_tree_init()) - goto free_out; + if (parent_css == NULL) { root_mem_cgroup = memcg; - for_each_possible_cpu(cpu) { - struct memcg_stock_pcp *stock = - &per_cpu(memcg_stock, cpu); - INIT_WORK(&stock->work, drain_local_stock); - } - hotcpu_notifier(memcg_cpu_hotplug_callback, 0); - } else { - parent = mem_cgroup_from_cont(cont->parent); - memcg->use_hierarchy = parent->use_hierarchy; - memcg->oom_kill_disable = parent->oom_kill_disable; - } - - if (parent && parent->use_hierarchy) { - res_counter_init(&memcg->res, &parent->res); - res_counter_init(&memcg->memsw, &parent->memsw); - /* - * We increment refcnt of the parent to ensure that we can - * safely access it on res_counter_charge/uncharge. - * This refcnt will be decremented when freeing this - * mem_cgroup(see mem_cgroup_put). - */ - mem_cgroup_get(parent); - } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); } + memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); - - if (parent) - memcg->swappiness = mem_cgroup_swappiness(parent); - atomic_set(&memcg->refcnt, 1); memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); + vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->event_list); + spin_lock_init(&memcg->event_list_lock); + return &memcg->css; + free_out: __mem_cgroup_free(memcg); return ERR_PTR(error); } -static int mem_cgroup_pre_destroy(struct cgroup *cont) +static int +mem_cgroup_css_online(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); + + if (css->cgroup->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; + + if (!parent) + return 0; + + mutex_lock(&memcg_create_mutex); + + memcg->use_hierarchy = parent->use_hierarchy; + memcg->oom_kill_disable = parent->oom_kill_disable; + memcg->swappiness = mem_cgroup_swappiness(parent); + + if (parent->use_hierarchy) { + res_counter_init(&memcg->res, &parent->res); + res_counter_init(&memcg->memsw, &parent->memsw); + res_counter_init(&memcg->kmem, &parent->kmem); + + /* + * No need to take a reference to the parent because cgroup + * core guarantees its existence. + */ + } else { + res_counter_init(&memcg->res, NULL); + res_counter_init(&memcg->memsw, NULL); + res_counter_init(&memcg->kmem, NULL); + /* + * Deeper hierarchy with use_hierarchy == false doesn't make + * much sense, so let the cgroup subsystem know about this + * unfortunate state in our controller. + */ + if (parent != root_mem_cgroup) + memory_cgrp_subsys.broken_hierarchy = true; + } + mutex_unlock(&memcg_create_mutex); - return mem_cgroup_force_empty(memcg, false); + return memcg_init_kmem(memcg, &memory_cgrp_subsys); } -static void mem_cgroup_destroy(struct cgroup *cont) +/* + * Announce to all parents that a group from their hierarchy is gone. + */ +static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *parent = memcg; - kmem_cgroup_destroy(cont); + while ((parent = parent_mem_cgroup(parent))) + mem_cgroup_iter_invalidate(parent); - mem_cgroup_put(memcg); + /* + * If the root memcg is not hierarchical we have to check it + * explicitly. 
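+ * (Invalidation forces mem_cgroup_iter() callers to restart their walk instead of resuming from a cached position inside the removed subtree.)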
+ */ + if (!root_mem_cgroup->use_hierarchy) + mem_cgroup_iter_invalidate(root_mem_cgroup); } -static int mem_cgroup_populate(struct cgroup_subsys *ss, - struct cgroup *cont) +static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { - int ret; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event, *tmp; + struct cgroup_subsys_state *iter; - ret = cgroup_add_files(cont, ss, mem_cgroup_files, - ARRAY_SIZE(mem_cgroup_files)); + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace. + */ + spin_lock(&memcg->event_list_lock); + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { + list_del_init(&event->list); + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); - if (!ret) - ret = register_memsw_files(cont, ss); + kmem_cgroup_css_offline(memcg); - if (!ret) - ret = register_kmem_files(cont, ss); + mem_cgroup_invalidate_reclaim_iterators(memcg); - return ret; + /* + * This requires that offlining is serialized. Right now that is + * guaranteed because css_killed_work_fn() holds the cgroup_mutex. + */ + css_for_each_descendant_post(iter, css) + mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); + + mem_cgroup_destroy_all_caches(memcg); + vmpressure_cleanup(&memcg->vmpressure); +} + +static void mem_cgroup_css_free(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* + * XXX: css_offline() would be where we should reparent all + * memory to prepare the cgroup for destruction. However, + * memcg does not do css_tryget() and res_counter charging + * under the same RCU lock region, which means that charging + * could race with offlining. Offlining only happens to + * cgroups with no tasks in them but charges can show up + * without any tasks from the swapin path when the target + * memcg is looked up from the swapout record and not from the + * current task as it usually is. A race like this can leak + * charges and put pages with stale cgroup pointers into + * circulation: + * + * #0 #1 + * lookup_swap_cgroup_id() + * rcu_read_lock() + * mem_cgroup_lookup() + * css_tryget() + * rcu_read_unlock() + * disable css_tryget() + * call_rcu() + * offline_css() + * reparent_charges() + * res_counter_charge() + * css_put() + * css_free() + * pc->mem_cgroup = dead memcg + * add page to lru + * + * The bulk of the charges are still moved in offline_css() to + * avoid pinning a lot of pages in case a long-term reference + * like a swapout record is deferring the css_free() to long + * after offlining. 
But this makes sure we catch any charges + * made after offlining: + */ + mem_cgroup_reparent_charges(memcg); + + memcg_destroy_kmem(memcg); + __mem_cgroup_free(memcg); } #ifdef CONFIG_MMU @@ -5099,8 +6583,7 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, - GFP_KERNEL, 1, &memcg, false); + ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); if (ret) /* mem_cgroup_clear_mc() will do uncharge later */ return ret; @@ -5147,7 +6630,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return NULL; if (PageAnon(page)) { /* we don't move shared anon */ - if (!move_anon() || page_mapcount(page) > 2) + if (!move_anon()) return NULL; } else if (!move_file()) /* we ignore mapcount for file pages */ @@ -5158,32 +6641,37 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, return page; } +#ifdef CONFIG_SWAP static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, swp_entry_t *entry) { - int usage_count; struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent); if (!move_anon() || non_swap_entry(ent)) return NULL; - usage_count = mem_cgroup_count_swap_user(ent, &page); - if (usage_count > 1) { /* we don't move shared anon */ - if (page) - put_page(page); - return NULL; - } + /* + * Because lookup_swap_cache() updates some statistics counters, + * we call find_get_page() with swapper_space directly. + */ + page = find_get_page(swap_address_space(ent), ent.val); if (do_swap_account) entry->val = ent.val; return page; } +#else +static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + return NULL; +} +#endif static struct page *mc_handle_file_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, swp_entry_t *entry) { struct page *page = NULL; - struct inode *inode; struct address_space *mapping; pgoff_t pgoff; @@ -5192,7 +6680,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, if (!move_file()) return NULL; - inode = vma->vm_file->f_path.dentry->d_inode; mapping = vma->vm_file->f_mapping; if (pte_none(ptent)) pgoff = linear_page_index(vma, addr); @@ -5200,16 +6687,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task (page-faulted). */ - page = find_get_page(mapping, pgoff); - #ifdef CONFIG_SWAP /* shmem/tmpfs may report page out on swap: account for that too. 
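 * A swapped-out tmpfs page leaves a radix-tree exceptional entry encoding its swp_entry_t in place of the struct page, hence find_get_entry() below and the second lookup in the swap address space.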
*/ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - if (do_swap_account) - *entry = swap; - page = find_get_page(&swapper_space, swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + if (do_swap_account) + *entry = swp; + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif return page; } @@ -5248,7 +6739,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, } /* There is a swap entry and a page doesn't exist or isn't charged */ if (ent.val && !ret && - css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { + mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; @@ -5270,7 +6761,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, enum mc_target_type ret = MC_TARGET_NONE; page = pmd_page(pmd); - VM_BUG_ON(!page || !PageHead(page)); + VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!move_anon()) return ret; pc = lookup_page_cgroup(page); @@ -5299,10 +6790,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@ -5357,6 +6848,7 @@ static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; + int i; /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { @@ -5377,7 +6869,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) res_counter_uncharge(&mc.from->memsw, PAGE_SIZE * mc.moved_swap); - __mem_cgroup_put(mc.from, mc.moved_swap); + + for (i = 0; i < mc.moved_swap; i++) + css_put(&mc.from->css); if (!mem_cgroup_is_root(mc.to)) { /* @@ -5387,7 +6881,7 @@ static void __mem_cgroup_clear_mc(void) res_counter_uncharge(&mc.to->res, PAGE_SIZE * mc.moved_swap); } - /* we've already done mem_cgroup_get(mc.to) */ + /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); @@ -5412,14 +6906,21 @@ static void mem_cgroup_clear_mc(void) mem_cgroup_end_move(from); } -static int mem_cgroup_can_attach(struct cgroup *cgroup, +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + unsigned long move_charge_at_immigrate; - if (memcg->move_charge_at_immigrate) { + /* + * We are now committed to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. 
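+ * (A concurrent write to memory.move_charge_at_immigrate therefore only affects migrations that start later, never the one already inside ->can_attach().)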
+ */ + move_charge_at_immigrate = memcg->move_charge_at_immigrate; + if (move_charge_at_immigrate) { struct mm_struct *mm; struct mem_cgroup *from = mem_cgroup_from_task(p); @@ -5439,6 +6940,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, spin_lock(&mc.lock); mc.from = from; mc.to = memcg; + mc.immigrate_flags = move_charge_at_immigrate; spin_unlock(&mc.lock); /* We set mc.moving_task later */ @@ -5451,7 +6953,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); @@ -5480,9 +6982,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * to be unlocked in __split_huge_page_splitting(), where the main * part of thp split is not executed yet. */ - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); @@ -5491,8 +6993,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, if (!isolate_lru_page(page)) { pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, - pc, mc.from, mc.to, - false)) { + pc, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } @@ -5500,7 +7001,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } put_page(page); } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@ -5522,7 +7023,7 @@ retry: goto put; pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(page, 1, pc, - mc.from, mc.to, false)) { + mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -5533,8 +7034,7 @@ put: /* get_mctgt_type() gets the page */ break; case MC_TARGET_SWAP: ent = target.ent; - if (!mem_cgroup_move_swap_account(ent, - mc.from, mc.to, false)) { + if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { mc.precharge--; /* we fixup refcnts and charges later. */ mc.moved_swap++; @@ -5601,7 +7101,7 @@ retry: up_read(&mm->mmap_sem); } -static void mem_cgroup_move_task(struct cgroup *cont, +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); @@ -5610,46 +7110,58 @@ static void mem_cgroup_move_task(struct cgroup *cont, if (mm) { if (mc.to) mem_cgroup_move_charge(mm); - put_swap_token(mm); mmput(mm); } if (mc.to) mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup *cgroup, +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup *cont, +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { } #endif -struct cgroup_subsys mem_cgroup_subsys = { - .name = "memory", - .subsys_id = mem_cgroup_subsys_id, - .create = mem_cgroup_create, - .pre_destroy = mem_cgroup_pre_destroy, - .destroy = mem_cgroup_destroy, - .populate = mem_cgroup_populate, +/* + * Cgroup retains root cgroups across [un]mount cycles making it necessary + * to verify sane_behavior flag on each mount attempt. 
+ */ +static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) +{ + /* + * use_hierarchy is forced with sane_behavior. cgroup core + * guarantees that @root doesn't have any children, so turning it + * on for the root memcg is enough. + */ + if (cgroup_sane_behavior(root_css->cgroup)) + mem_cgroup_from_css(root_css)->use_hierarchy = true; +} + +struct cgroup_subsys memory_cgrp_subsys = { + .css_alloc = mem_cgroup_css_alloc, + .css_online = mem_cgroup_css_online, + .css_offline = mem_cgroup_css_offline, + .css_free = mem_cgroup_css_free, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, + .bind = mem_cgroup_bind, + .base_cftypes = mem_cgroup_files, .early_init = 0, - .use_id = 1, }; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP static int __init enable_swap_account(char *s) { - /* consider enabled if no parameter or 1 is given */ if (!strcmp(s, "1")) really_do_swap_account = 1; else if (!strcmp(s, "0")) @@ -5658,4 +7170,39 @@ static int __init enable_swap_account(char *s) } __setup("swapaccount=", enable_swap_account); +static void __init memsw_file_init(void) +{ + WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); +} + +static void __init enable_swap_cgroup(void) +{ + if (!mem_cgroup_disabled() && really_do_swap_account) { + do_swap_account = 1; + memsw_file_init(); + } +} + +#else +static void __init enable_swap_cgroup(void) +{ +} #endif + +/* + * subsys_initcall() for memory controller. + * + * Some parts like hotcpu_notifier() have to be initialized from this context + * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically + * everything that doesn't depend on a specific mem_cgroup structure should + * be initialized from here. + */ +static int __init mem_cgroup_init(void) +{ + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); + enable_swap_cgroup(); + mem_cgroup_soft_limit_tree_init(); + memcg_stock_init(); + return 0; +} +subsys_initcall(mem_cgroup_init); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 97cc2733551a..9ccef39a9de2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0; int sysctl_memory_failure_recovery __read_mostly = 1; -atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); +atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) @@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p) * can only guarantee that the page either belongs to the memcg tasks, or is * a freed page. 
*/ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP u64 hwpoison_filter_memcg; EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); static int hwpoison_filter_task(struct page *p) @@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p) return -EINVAL; css = mem_cgroup_css(mem); - /* root_mem_cgroup has NULL dentries */ - if (!css->cgroup->dentry) - return -EINVAL; - - ino = css->cgroup->dentry->d_inode->i_ino; + ino = cgroup_ino(css->cgroup); css_put(css); - if (ino != hwpoison_filter_memcg) + if (!ino || ino != hwpoison_filter_memcg) return -EINVAL; return 0; @@ -206,7 +202,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; if ((flags & MF_ACTION_REQUIRED) && t == current) { si.si_code = BUS_MCEERR_AR; @@ -248,10 +244,12 @@ void shake_page(struct page *p, int access) */ if (access) { int nr; + int nid = page_to_nid(p); do { struct shrink_control shrink = { .gfp_mask = GFP_KERNEL, }; + node_set(nid, shrink.nodes_to_scan); nr = shrink_slab(&shrink, 1000, 1000); if (page_count(p) == 1) @@ -345,14 +343,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * Also when FAIL is set do a force kill because something went * wrong earlier. */ -static void kill_procs(struct list_head *to_kill, int doit, int trapno, +static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, int fail, struct page *page, unsigned long pfn, int flags) { struct to_kill *tk, *next; list_for_each_entry_safe (tk, next, to_kill, nd) { - if (doit) { + if (forcekill) { /* * In case something went wrong with munmapping * make sure the process doesn't catch the @@ -400,18 +398,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; + pgoff_t pgoff; - av = page_lock_anon_vma(page); + av = page_lock_anon_vma_read(page); if (av == NULL) /* Not actually mapped anymore */ return; + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; if (!task_early_kill(tsk)) continue; - list_for_each_entry(vmac, &av->head, same_anon_vma) { + anon_vma_interval_tree_foreach(vmac, &av->rb_root, + pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; @@ -420,7 +421,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma(av); + page_unlock_anon_vma_read(av); } /* @@ -431,7 +432,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, { struct vm_area_struct *vma; struct task_struct *tsk; - struct prio_tree_iter iter; struct address_space *mapping = page->mapping; mutex_lock(&mapping->i_mmap_mutex); @@ -442,7 +442,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, if (!task_early_kill(tsk)) continue; - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* * Send early kill signal to tasks where a vma covers @@ -607,7 +607,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) } /* - * Dirty cache page page + * Dirty pagecache page * Issues: when the error hit a hole page the error is not properly * propagated. 
*/ @@ -779,16 +779,16 @@ static struct page_state { { compound, compound, "huge", me_huge_page }, #endif - { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, - { sc|dirty, sc, "swapcache", me_swapcache_clean }, + { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, + { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, - { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, - { unevict, unevict, "unevictable LRU", me_pagecache_clean}, + { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, + { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, - { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, - { mlock, mlock, "mlocked LRU", me_pagecache_clean }, + { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, + { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, - { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, /* @@ -810,14 +810,14 @@ static struct page_state { #undef slab #undef reserved +/* + * "Dirty/Clean" indication is not 100% accurate due to the possibility of + * setting PG_dirty outside page lock. See also comment above set_page_dirty(). + */ static void action_result(unsigned long pfn, char *msg, int result) { - struct page *page = pfn_to_page(pfn); - - printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", - pfn, - PageDirty(page) ? "dirty " : "", - msg, action_name[result]); + pr_err("MCE %#lx: %s page recovery: %s\n", + pfn, msg, action_name[result]); } static int page_action(struct page_state *ps, struct page *p, @@ -852,14 +852,14 @@ static int page_action(struct page_state *ps, struct page *p, * the pages and send SIGBUS to the processes if the data was dirty. */ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, - int trapno, int flags) + int trapno, int flags, struct page **hpagep) { enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); int ret; - int kill = 1; - struct page *hpage = compound_head(p); + int kill = 1, forcekill; + struct page *hpage = *hpagep; struct page *ppage; if (PageReserved(p) || PageSlab(p)) @@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * be called inside page lock (it's recommended but not enforced). */ mapping = page_mapping(hpage); - if (!PageDirty(hpage) && mapping && + if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && mapping_cap_writeback_dirty(mapping)) { if (page_mkclean(hpage)) { SetPageDirty(hpage); @@ -934,6 +934,21 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, BUG_ON(!PageHWPoison(p)); return SWAP_FAIL; } + /* + * We pinned the head page for hwpoison handling, + * now we split the thp and we are interested in + * the hwpoisoned raw page, so move the refcount + * to it. Similarly, page lock is shifted. + */ + if (hpage != p) { + if (!(flags & MF_COUNT_INCREASED)) { + put_page(hpage); + get_page(p); + } + lock_page(p); + unlock_page(hpage); + *hpagep = p; + } /* THP is split, so ppage should be the real poisoned page. 
*/ ppage = p; } @@ -950,27 +965,23 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(ppage, &tokill); - if (hpage != ppage) - lock_page(ppage); - ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(ppage)); - if (hpage != ppage) - unlock_page(ppage); - /* * Now that the dirty bit has been propagated to the * struct page and all unmaps done we can decide if * killing is needed or not. Only kill when the page - * was dirty, otherwise the tokill list is merely + * was dirty or the process is not restartable, + * otherwise the tokill list is merely * freed. When there was a problem unmapping earlier * use a more forceful uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs(&tokill, !!PageDirty(ppage), trapno, + forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + kill_procs(&tokill, forcekill, trapno, ret != SWAP_SUCCESS, p, pfn, flags); return ret; @@ -979,7 +990,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_trans_order(hpage); + int nr_pages = 1 << compound_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -987,7 +998,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_trans_order(hpage); + int nr_pages = 1 << compound_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -1017,6 +1028,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) struct page *hpage; int res; unsigned int nr_pages; + unsigned long page_flags; if (!sysctl_memory_failure_recovery) panic("Memory failure from trap %d on page %lx", trapno, pfn); @@ -1035,8 +1047,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_trans_order(hpage); - atomic_long_add(nr_pages, &mce_bad_pages); + /* + * Currently errors on hugetlbfs pages are measured in hugepage units, + * so nr_pages should be 1 << compound_order. OTOH when errors are on + * transparent hugepages, they are supposed to be split and error + * measurement is done in normal page units. So nr_pages should be one + * in this case. + */ + if (PageHuge(p)) + nr_pages = 1 << compound_order(hpage); + else /* normal page or thp */ + nr_pages = 1; + atomic_long_add(nr_pages, &num_poisoned_pages); /* * We need/can do nothing about count=0 pages. @@ -1059,15 +1081,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } else if (PageHuge(hpage)) { /* - * Check "just unpoisoned", "filter hit", and - * "race with other subpage." + * Check "filter hit" and "race with other subpage." */ lock_page(hpage); - if (!PageHWPoison(hpage) - || (hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != hpage && TestSetPageHWPoison(hpage))) { - atomic_long_sub(nr_pages, &mce_bad_pages); - return 0; + if (PageHWPoison(hpage)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); + return 0; + } } set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); @@ -1097,8 +1120,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * shake_page could have turned it free. 
@@ -1097,8 +1120,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
	 * shake_page could have turned it free.
	 */
	if (is_free_buddy_page(p)) {
-		action_result(pfn, "free buddy, 2nd try",
-				DELAYED);
+		if (flags & MF_COUNT_INCREASED)
+			action_result(pfn, "free buddy", DELAYED);
+		else
+			action_result(pfn, "free buddy, 2nd try", DELAYED);
		return 0;
	}
	action_result(pfn, "non LRU", IGNORED);
@@ -1115,16 +1140,27 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
	lock_page(hpage);
 
	/*
+	 * We use page flags to determine what action should be taken, but
+	 * the flags can be modified by the error containment action.  One
+	 * example is an mlocked page, where PG_mlocked is cleared by
+	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
+	 * correctly, we save a copy of the page flags at this time.
+	 */
+	page_flags = p->flags;
+
+	/*
	 * unpoison always clear PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+		atomic_long_sub(nr_pages, &num_poisoned_pages);
+		put_page(hpage);
		res = 0;
		goto out;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &mce_bad_pages);
+			atomic_long_sub(nr_pages, &num_poisoned_pages);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
@@ -1155,8 +1191,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
+	 *
+	 * When the raw error page is thp tail page, hpage points to the raw
+	 * page after thp split.
	 */
-	if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
+	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
+	    != SWAP_SUCCESS) {
		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
		res = -EBUSY;
		goto out;
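
The hunk that follows implements the two-pass lookup described by the page_flags comment above: scan error_states against the live flags first and, if only the catch-all entry (mask == 0) matched, retry with the snapshot taken before unmapping, ORed with the live dirty bit. A minimal userspace model of that flow, again with illustrative flag bits:

#include <stdio.h>

/* Illustrative flag bits -- not the kernel's actual values. */
#define DIRTY	(1UL << 0)
#define MLOCKED	(1UL << 1)

struct state { unsigned long mask, res; const char *msg; };

static const struct state states[] = {
	{ MLOCKED | DIRTY, MLOCKED | DIRTY, "dirty mlocked LRU" },
	{ MLOCKED | DIRTY, MLOCKED,	    "clean mlocked LRU" },
	{ 0, 0, "unknown page state" },	/* catch-all: mask == 0 */
};

static const struct state *lookup(unsigned long flags)
{
	const struct state *ps;

	for (ps = states; ; ps++)
		if ((flags & ps->mask) == ps->res)
			break;
	return ps;
}

int main(void)
{
	unsigned long saved = MLOCKED;	/* snapshot taken before unmapping */
	unsigned long live = 0;		/* PG_mlocked cleared by rmap removal */
	const struct state *ps = lookup(live);

	if (!ps->mask)	/* only the catch-all matched: retry with snapshot */
		ps = lookup(saved | (live & DIRTY));
	printf("%s\n", ps->msg);	/* "clean mlocked LRU", not "unknown" */
	return 0;
}
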
@@ -1172,12 +1212,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
	}
 
	res = -EBUSY;
-	for (ps = error_states;; ps++) {
-		if ((p->flags & ps->mask) == ps->res) {
-			res = page_action(ps, p, pfn);
+	/*
+	 * The first check uses the current page flags which may not have any
+	 * relevant information. The second check with the saved page flags is
+	 * carried out only if the first check can't determine the page status.
+	 */
+	for (ps = error_states;; ps++)
+		if ((p->flags & ps->mask) == ps->res)
			break;
-		}
-	}
+
+	page_flags |= (p->flags & (1UL << PG_dirty));
+
+	if (!ps->mask)
+		for (ps = error_states;; ps++)
+			if ((page_flags & ps->mask) == ps->res)
+				break;
+	res = page_action(ps, p, pfn);
 out:
	unlock_page(hpage);
	return res;
@@ -1231,10 +1281,10 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags)
 
	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
-	if (kfifo_put(&mf_cpu->fifo, &entry))
+	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
-		pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
+		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
@@ -1255,7 +1305,10 @@ static void memory_failure_work_func(struct work_struct *work)
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
-		memory_failure(entry.pfn, entry.trapno, entry.flags);
+		if (entry.flags & MF_SOFT_OFFLINE)
+			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+		else
+			memory_failure(entry.pfn, entry.trapno, entry.flags);
	}
 }
 
@@ -1305,7 +1358,17 @@ int unpoison_memory(unsigned long pfn)
		return 0;
	}
 
-	nr_pages = 1 << compound_trans_order(page);
+	/*
+	 * unpoison_memory() can encounter thp only when the thp is being
+	 * worked by memory_failure() and the page lock is not held yet.
+	 * In such case, we yield to memory_failure() and make unpoison fail.
+	 */
+	if (!PageHuge(page) && PageTransHuge(page)) {
+		pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
+		return 0;
+	}
+
+	nr_pages = 1 << compound_order(page);
 
	if (!get_page_unless_zero(page)) {
		/*
@@ -1319,7 +1382,7 @@ int unpoison_memory(unsigned long pfn)
			return 0;
		}
		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &mce_bad_pages);
+			atomic_long_dec(&num_poisoned_pages);
		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
		return 0;
	}
@@ -1333,7 +1396,7 @@ int unpoison_memory(unsigned long pfn)
	 */
	if (TestClearPageHWPoison(page)) {
		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
-		atomic_long_sub(nr_pages, &mce_bad_pages);
+		atomic_long_sub(nr_pages, &num_poisoned_pages);
		freeit = 1;
		if (PageHuge(page))
			clear_page_hwpoison_huge_page(page);
@@ -1341,7 +1404,7 @@ int unpoison_memory(unsigned long pfn)
	unlock_page(page);
 
	put_page(page);
-	if (freeit)
+	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_page(page);
 
	return 0;
@@ -1364,7 +1427,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise not.
 */
-static int get_any_page(struct page *p, unsigned long pfn, int flags)
+static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 {
	int ret;
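
One easy-to-miss fix above is the pr_err() format string: %#lx already emits the 0x prefix, so the old "0x%#lx" printed a doubled "0x0x" prefix. A two-line userspace check of the same printf semantics:

#include <stdio.h>

int main(void)
{
	unsigned long pfn = 0x1a2b;

	printf("%#lx\n", pfn);		/* 0x1a2b - the '#' flag adds 0x   */
	printf("0x%#lx\n", pfn);	/* 0x0x1a2b - the doubled-prefix bug */
	return 0;
}
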
@@ -1372,40 +1435,49 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
		return 1;
 
	/*
-	 * The lock_memory_hotplug prevents a race with memory hotplug.
-	 * This is a big hammer, a better would be nicer.
-	 */
-	lock_memory_hotplug();
-
-	/*
-	 * Isolate the page, so that it doesn't get reallocated if it
-	 * was free.
-	 */
-	set_migratetype_isolate(p);
-	/*
	 * When the target page is a free hugepage, just remove it
	 * from free hugepage list.
	 */
	if (!get_page_unless_zero(compound_head(p))) {
		if (PageHuge(p)) {
-			pr_info("get_any_page: %#lx free huge page\n", pfn);
-			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+			pr_info("%s: %#lx free huge page\n", __func__, pfn);
+			ret = 0;
		} else if (is_free_buddy_page(p)) {
-			pr_info("get_any_page: %#lx free buddy page\n", pfn);
-			/* Set hwpoison bit while page is still isolated */
-			SetPageHWPoison(p);
+			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
-			pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
-				pfn, p->flags);
+			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
+				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
-	unset_migratetype_isolate(p);
-	unlock_memory_hotplug();
+	return ret;
+}
+
+static int get_any_page(struct page *page, unsigned long pfn, int flags)
+{
+	int ret = __get_any_page(page, pfn, flags);
+
+	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
+		/*
+		 * Try to free it.
+		 */
+		put_page(page);
+		shake_page(page, 1);
+
+		/*
+		 * Did it turn free?
+		 */
+		ret = __get_any_page(page, pfn, 0);
+		if (!PageLRU(page)) {
+			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
+				pfn, page->flags);
+			return -EIO;
+		}
+	}
	return ret;
 }
 
@@ -1416,117 +1488,68 @@ static int soft_offline_huge_page(struct page *page, int flags)
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);
 
-	ret = get_any_page(page, pfn, flags);
-	if (ret < 0)
-		return ret;
-	if (ret == 0)
-		goto done;
-
+	/*
+	 * This double-check of PageHWPoison is to avoid the race with
+	 * memory_failure(). See also comment in __soft_offline_page().
+	 */
+	lock_page(hpage);
	if (PageHWPoison(hpage)) {
+		unlock_page(hpage);
		put_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
+	unlock_page(hpage);
 
	/* Keep page count to indicate a given hugepage is isolated. */
-
-	list_add(&hpage->lru, &pagelist);
-	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
-				true);
+	list_move(&hpage->lru, &pagelist);
+	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
-		struct page *page1, *page2;
-		list_for_each_entry_safe(page1, page2, &pagelist, lru)
-			put_page(page1);
-
		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
			pfn, ret, page->flags);
+		/*
+		 * We know that soft_offline_huge_page() tries to migrate
+		 * only one hugepage pointed to by hpage, so we need not
+		 * run through the pagelist here.
+		 */
+		putback_active_hugepage(hpage);
		if (ret > 0)
			ret = -EIO;
-		return ret;
+	} else {
+		/* overcommit hugetlb page will be freed to buddy */
+		if (PageHuge(page)) {
+			set_page_hwpoison_huge_page(hpage);
+			dequeue_hwpoisoned_huge_page(hpage);
+			atomic_long_add(1 << compound_order(hpage),
+					&num_poisoned_pages);
+		} else {
+			SetPageHWPoison(page);
+			atomic_long_inc(&num_poisoned_pages);
+		}
	}
-done:
-	if (!PageHWPoison(hpage))
-		atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
-	set_page_hwpoison_huge_page(hpage);
-	dequeue_hwpoisoned_huge_page(hpage);
-	/* keep elevated page count for bad page */
	return ret;
 }
 
-/**
- * soft_offline_page - Soft offline a page.
- * @page: page to offline
- * @flags: flags. Same as memory_failure().
- *
- * Returns 0 on success, otherwise negated errno.
- *
- * Soft offline a page, by migration or invalidation,
- * without killing anything. This is for the case when
- * a page is not corrupted yet (so it's still valid to access),
- * but has had a number of corrected errors and is better taken
- * out.
- *
- * The actual policy on when to do that is maintained by
- * user space.
- *
- * This should never impact any application or cause data loss,
- * however it might take some time.
- *
- * This is not a 100% solution for all memory, but tries to be
- * ``good enough'' for the majority of memory.
- */
-int soft_offline_page(struct page *page, int flags)
+static int __soft_offline_page(struct page *page, int flags)
 {
	int ret;
	unsigned long pfn = page_to_pfn(page);
 
-	if (PageHuge(page))
-		return soft_offline_huge_page(page, flags);
-
-	ret = get_any_page(page, pfn, flags);
-	if (ret < 0)
-		return ret;
-	if (ret == 0)
-		goto done;
-
	/*
-	 * Page cache page we can handle?
+	 * Check PageHWPoison again inside page lock because PageHWPoison
+	 * is set by memory_failure() outside page lock. Note that
+	 * memory_failure() also double-checks PageHWPoison inside page lock,
+	 * so there's no race between soft_offline_page() and memory_failure().
	 */
-	if (!PageLRU(page)) {
-		/*
-		 * Try to free it.
-		 */
-		put_page(page);
-		shake_page(page, 1);
-
-		/*
-		 * Did it turn free?
-		 */
-		ret = get_any_page(page, pfn, 0);
-		if (ret < 0)
-			return ret;
-		if (ret == 0)
-			goto done;
-	}
-	if (!PageLRU(page)) {
-		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
-			pfn, page->flags);
-		return -EIO;
-	}
-
	lock_page(page);
	wait_on_page_writeback(page);
-
-	/*
-	 * Synchronized using the page lock with memory_failure()
-	 */
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
-
	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
@@ -1539,9 +1562,10 @@ int soft_offline_page(struct page *page, int flags)
	 */
	if (ret == 1) {
		put_page(page);
-		ret = 0;
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
-		goto done;
+		SetPageHWPoison(page);
+		atomic_long_inc(&num_poisoned_pages);
+		return 0;
	}
 
	/*
@@ -1558,27 +1582,120 @@ int soft_offline_page(struct page *page, int flags)
	if (!ret) {
		LIST_HEAD(pagelist);
		inc_zone_page_state(page, NR_ISOLATED_ANON +
-					page_is_file_cache(page));
+				    page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
-							0, MIGRATE_SYNC);
+					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
-			putback_lru_pages(&pagelist);
+			if (!list_empty(&pagelist)) {
+				list_del(&page->lru);
+				dec_zone_page_state(page, NR_ISOLATED_ANON +
+						page_is_file_cache(page));
+				putback_lru_page(page);
+			}
+
			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
+		} else {
+			/*
+			 * After page migration succeeds, the source page can
+			 * be trapped in pagevec and actual freeing is delayed.
+			 * Freeing code works differently based on PG_hwpoison,
+			 * so there's a race. We need to make sure that the
+			 * source page should be freed back to buddy before
+			 * setting PG_hwpoison.
+			 */
+			if (!is_free_buddy_page(page))
+				lru_add_drain_all();
+			if (!is_free_buddy_page(page))
+				drain_all_pages();
+			SetPageHWPoison(page);
+			if (!is_free_buddy_page(page))
+				pr_info("soft offline: %#lx: page leaked\n",
+					pfn);
+			atomic_long_inc(&num_poisoned_pages);
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
-	if (ret)
-		return ret;
+	return ret;
+}
+
+/**
+ * soft_offline_page - Soft offline a page.
+ * @page: page to offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success, otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
+ */
+int soft_offline_page(struct page *page, int flags)
+{
+	int ret;
+	unsigned long pfn = page_to_pfn(page);
+	struct page *hpage = compound_head(page);
+
+	if (PageHWPoison(page)) {
+		pr_info("soft offline: %#lx page already poisoned\n", pfn);
+		return -EBUSY;
+	}
+	if (!PageHuge(page) && PageTransHuge(hpage)) {
+		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+			pr_info("soft offline: %#lx: failed to split THP\n",
+				pfn);
+			return -EBUSY;
+		}
+	}
+
+	/*
+	 * The lock_memory_hotplug prevents a race with memory hotplug.
+	 * This is a big hammer, a better fix would be nicer.
+	 */
+	lock_memory_hotplug();
 
-done:
-	atomic_long_add(1, &mce_bad_pages);
-	SetPageHWPoison(page);
-	/* keep elevated page count for bad page */
+	/*
+	 * Isolate the page, so that it doesn't get reallocated if it
+	 * was free. This flag should be kept set until the source page
+	 * is freed and PG_hwpoison on it is set.
+	 */
+	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+		set_migratetype_isolate(page, true);
+
+	ret = get_any_page(page, pfn, flags);
+	unlock_memory_hotplug();
+	if (ret > 0) { /* for in-use pages */
+		if (PageHuge(page))
+			ret = soft_offline_huge_page(page, flags);
+		else
+			ret = __soft_offline_page(page, flags);
+	} else if (ret == 0) { /* for free pages */
+		if (PageHuge(page)) {
+			set_page_hwpoison_huge_page(hpage);
+			dequeue_hwpoisoned_huge_page(hpage);
+			atomic_long_add(1 << compound_order(hpage),
+					&num_poisoned_pages);
+		} else {
+			SetPageHWPoison(page);
+			atomic_long_inc(&num_poisoned_pages);
+		}
+	}
+	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
	return ret;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 6105f475fa86..037b812a9531 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,10 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
+#include <linux/string.h>
+#include <linux/dma-debug.h>
+#include <linux/debugfs.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -67,6 +71,10 @@
 
 #include "internal.h"
 
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
+#endif
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -76,7 +84,6 @@ EXPORT_SYMBOL(max_mapnr);
 EXPORT_SYMBOL(mem_map);
 #endif
 
-unsigned long num_physpages;
 /*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, then end
@@ -86,7 +93,6 @@ unsigned long num_physpages;
 */
 void * high_memory;
 
-EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 
 /*
@@ -182,10 +188,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
		return 1;
	}
 
+	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+		return 0;
+
	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
	if (!batch)
		return 0;
 
+	tlb->batch_count++;
	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;
@@ -201,37 +211,39 @@ static int tlb_next_batch(struct mmu_gather *tlb)
 * tear-down from @mm. The @fullmm argument is used when @mm is without
 * users and we're going to destroy the full address space (exit/execve).
 */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
 {
	tlb->mm = mm;
 
-	tlb->fullmm = fullmm;
+	/* Is it from 0 to ~0? */
+	tlb->fullmm     = !(start | (end+1));
+	tlb->need_flush_all = 0;
+	tlb->start	= start;
+	tlb->end	= end;
	tlb->need_flush = 0;
-	tlb->fast_mode  = (num_possible_cpus() == 1);
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;
+	tlb->batch_count = 0;
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb->batch = NULL;
 #endif
 }
 
-void tlb_flush_mmu(struct mmu_gather *tlb)
+static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
-	struct mmu_gather_batch *batch;
-
-	if (!tlb->need_flush)
-		return;
	tlb->need_flush = 0;
	tlb_flush(tlb);
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
	tlb_table_flush(tlb);
 #endif
+}
 
-	if (tlb_fast_mode(tlb))
-		return;
+static void tlb_flush_mmu_free(struct mmu_gather *tlb)
+{
+	struct mmu_gather_batch *batch;
 
	for (batch = &tlb->local; batch; batch = batch->next) {
		free_pages_and_swap_cache(batch->pages, batch->nr);
@@ -240,6 +252,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
	tlb->active = &tlb->local;
 }
 
+void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+	if (!tlb->need_flush)
+		return;
+	tlb_flush_mmu_tlbonly(tlb);
+	tlb_flush_mmu_free(tlb);
+}
+
 /* tlb_finish_mmu
 *	Called at the end of the shootdown operation to free up any resources
 *	that were required.
@@ -272,11 +292,6 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 
	VM_BUG_ON(!tlb->need_flush);
 
-	if (tlb_fast_mode(tlb)) {
-		free_page_and_swap_cache(page);
-		return 1; /* avoid calling tlb_flush_mmu() */
-	}
-
	batch = tlb->active;
	batch->pages[batch->nr++] = page;
	if (batch->nr == batch->max) {
@@ -284,7 +299,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
			return 0;
		batch = tlb->active;
	}
-	VM_BUG_ON(batch->nr > batch->max);
+	VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 
	return batch->max - batch->nr;
 }
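
The new fullmm test above is a compact bit trick: start | (end + 1) is zero exactly when start == 0 and end == ~0UL (the + 1 wraps to 0), i.e. only for a whole-address-space teardown at exit/execve. A quick userspace check of the identity it relies on:

#include <stdio.h>

/* True only for the full (0, ~0UL) range: start | (end + 1) is zero
 * exactly when start == 0 and end == ~0UL, since end + 1 wraps to 0. */
static int fullmm(unsigned long start, unsigned long end)
{
	return !(start | (end + 1));
}

int main(void)
{
	printf("%d\n", fullmm(0, ~0UL));		/* 1: exit/execve teardown */
	printf("%d\n", fullmm(0, 0x7fffffffUL));	/* 0: partial range */
	printf("%d\n", fullmm(0x1000, ~0UL));		/* 0: partial range */
	return 0;
}
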
@@ -369,30 +384,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 
 /*
- * If a p?d_bad entry is found while walking page tables, report
- * the error, before resetting entry to p?d_none.  Usually (but
- * very seldom) called out from the p?d_none_or_clear_bad macros.
- */
-
-void pgd_clear_bad(pgd_t *pgd)
-{
-	pgd_ERROR(*pgd);
-	pgd_clear(pgd);
-}
-
-void pud_clear_bad(pud_t *pud)
-{
-	pud_ERROR(*pud);
-	pud_clear(pud);
-}
-
-void pmd_clear_bad(pmd_t *pmd)
-{
-	pmd_ERROR(*pmd);
-	pmd_clear(pmd);
-}
-
-/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
@@ -402,7 +393,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
-	tlb->mm->nr_ptes--;
+	atomic_long_dec(&tlb->mm->nr_ptes);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -473,8 +464,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 
 /*
 * This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
 */
 void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
@@ -572,6 +561,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long address)
 {
+	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm, address);
	int wait_split_huge_page;
	if (!new)
@@ -592,15 +582,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 
-	spin_lock(&mm->page_table_lock);
+	ptl = pmd_lock(mm, pmd);
	wait_split_huge_page = 0;
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
-		mm->nr_ptes++;
+		atomic_long_inc(&mm->nr_ptes);
		pmd_populate(mm, pmd, new);
		new = NULL;
	} else if (unlikely(pmd_trans_splitting(*pmd)))
		wait_split_huge_page = 1;
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	if (wait_split_huge_page)
@@ -691,7 +681,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
		current->comm,
		(long long)pte_val(pte), (long long)pmd_val(*pmd));
	if (page)
-		dump_page(page);
+		dump_page(page, "bad pte");
	printk(KERN_ALERT
		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -699,34 +689,20 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
	 */
	if (vma->vm_ops)
-		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
-				(unsigned long)vma->vm_ops->fault);
-	if (vma->vm_file && vma->vm_file->f_op)
-		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
-				(unsigned long)vma->vm_file->f_op->mmap);
+		printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
+		       vma->vm_ops->fault);
+	if (vma->vm_file)
+		printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
+		       vma->vm_file->f_op->mmap);
	dump_stack();
-	add_taint(TAINT_BAD_PAGE);
+	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 }
 
-static inline int is_cow_mapping(vm_flags_t flags)
+static inline bool is_cow_mapping(vm_flags_t flags)
 {
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
 
-#ifndef is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
-	return pfn == zero_pfn;
-}
-#endif
-
-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
-	return zero_pfn;
-}
-#endif
-
 /*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
@@ -871,6 +847,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
			 */
			make_migration_entry_read(&entry);
			pte = swp_entry_to_pte(entry);
+			if (pte_swp_soft_dirty(*src_pte))
+				pte = pte_swp_mksoft_dirty(pte);
			set_pte_at(src_mm, addr, src_pte, pte);
		}
	}
@@ -1035,6 +1013,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+	bool is_cow;
	int ret;
 
	/*
@@ -1043,7 +1024,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
+	if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
+			       VM_PFNMAP | VM_MIXEDMAP))) {
		if (!vma->anon_vma)
			return 0;
	}
@@ -1051,12 +1033,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
-	if (unlikely(is_pfn_mapping(vma))) {
+	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
-		ret = track_pfn_vma_copy(vma);
+		ret = track_pfn_copy(vma);
		if (ret)
			return ret;
	}
@@ -1067,8 +1049,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
-	if (is_cow_mapping(vma->vm_flags))
-		mmu_notifier_invalidate_range_start(src_mm, addr, end);
+	is_cow = is_cow_mapping(vma->vm_flags);
+	mmun_start = addr;
+	mmun_end   = end;
+	if (is_cow)
+		mmu_notifier_invalidate_range_start(src_mm, mmun_start,
+						    mmun_end);
 
	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
@@ -1084,9 +1070,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
 
-	if (is_cow_mapping(vma->vm_flags))
-		mmu_notifier_invalidate_range_end(src_mm,
-						  vma->vm_start, end);
+	if (is_cow)
+		mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
	return ret;
 }
 
@@ -1142,25 +1127,31 @@ again:
			continue;
		if (unlikely(details) && details->nonlinear_vma
		    && linear_page_index(details->nonlinear_vma,
-					addr) != page->index)
-			set_pte_at(mm, addr, pte,
-				   pgoff_to_pte(page->index));
+					addr) != page->index) {
+			pte_t ptfile = pgoff_to_pte(page->index);
+			if (pte_soft_dirty(ptent))
+				pte_file_mksoft_dirty(ptfile);
+			set_pte_at(mm, addr, pte, ptfile);
+		}
		if (PageAnon(page))
			rss[MM_ANONPAGES]--;
		else {
-			if (pte_dirty(ptent))
+			if (pte_dirty(ptent)) {
+				force_flush = 1;
				set_page_dirty(page);
+			}
			if (pte_young(ptent) &&
-			    likely(!VM_SequentialReadHint(vma)))
+			    likely(!(vma->vm_flags & VM_SEQ_READ)))
				mark_page_accessed(page);
			rss[MM_FILEPAGES]--;
		}
		page_remove_rmap(page);
		if (unlikely(page_mapcount(page) < 0))
			print_bad_pte(vma, addr, ptent, page);
-		force_flush = !__tlb_remove_page(tlb, page);
-		if (force_flush)
+		if (unlikely(!__tlb_remove_page(tlb, page))) {
+			force_flush = 1;
			break;
+		}
		continue;
	}
	/*
@@ -1195,16 +1186,34 @@ again:
	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();
 
+	/* Do the actual TLB flush before dropping ptl */
+	if (force_flush) {
+		unsigned long old_end;
+
+		/*
+		 * Flush the TLB just for the previous segment,
+		 * then update the range to be the remaining
+		 * TLB range.
+		 */
+		old_end = tlb->end;
+		tlb->end = addr;
+
+		tlb_flush_mmu_tlbonly(tlb);
+
+		tlb->start = addr;
+		tlb->end = old_end;
+	}
	pte_unmap_unlock(start_pte, ptl);
 
	/*
-	 * mmu_gather ran out of room to batch pages, we break out of
-	 * the PTE lock to avoid doing the potential expensive TLB invalidate
-	 * and page-free while holding it.
+	 * If we forced a TLB flush (either due to running out of
+	 * batch buffers or because we needed to flush dirty TLB
+	 * entries before releasing the ptl), free the batched
+	 * memory too. Restart if we didn't do everything.
	 */
	if (force_flush) {
		force_flush = 0;
-		tlb_flush_mmu(tlb);
+		tlb_flush_mmu_free(tlb);
+
		if (addr != end)
			goto again;
	}
@@ -1225,8 +1234,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
-				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
-				split_huge_page_pmd(vma->vm_mm, pmd);
+#ifdef CONFIG_DEBUG_VM
+				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
+					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+						__func__, addr, end,
+						vma->vm_start,
+						vma->vm_end);
+					BUG();
+				}
+#endif
+				split_huge_page_pmd(vma, addr, pmd);
			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
				goto next;
			/* fall through */
@@ -1295,7 +1312,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
 
 static void unmap_single_vma(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long *nr_accounted,
+		unsigned long end_addr,
		struct zap_details *details)
 {
	unsigned long start = max(vma->vm_start, start_addr);
@@ -1307,11 +1324,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
	if (end <= vma->vm_start)
		return;
 
-	if (vma->vm_flags & VM_ACCOUNT)
-		*nr_accounted += (end - start) >> PAGE_SHIFT;
+	if (vma->vm_file)
+		uprobe_munmap(vma, start, end);
 
-	if (unlikely(is_pfn_mapping(vma)))
-		untrack_pfn_vma(vma, 0, 0);
+	if (unlikely(vma->vm_flags & VM_PFNMAP))
+		untrack_pfn(vma, 0, 0);
 
	if (start != end) {
		if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1319,15 +1336,18 @@ static void unmap_single_vma(struct mmu_gather *tlb,
			 * It is undesirable to test vma->vm_file as it
			 * should be non-null for valid hugetlb area.
			 * However, vm_file will be NULL in the error
-			 * cleanup path of do_mmap_pgoff. When
+			 * cleanup path of mmap_region. When
			 * hugetlbfs ->mmap method fails,
-			 * do_mmap_pgoff() nullifies vma->vm_file
+			 * mmap_region() nullifies vma->vm_file
			 * before calling this function to clean up.
			 * Since no pte has actually been setup, it is
			 * safe to do nothing in this case.
			 */
-			if (vma->vm_file)
-				unmap_hugepage_range(vma, start, end, NULL);
+			if (vma->vm_file) {
+				mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
+				mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+			}
		} else
			unmap_page_range(tlb, vma, start, end, details);
	}
@@ -1339,8 +1359,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
- * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
- * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Unmap all pages in the vma list.
 *
@@ -1355,40 +1373,40 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 */
 void unmap_vmas(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long *nr_accounted,
-		struct zap_details *details)
+		unsigned long end_addr)
 {
	struct mm_struct *mm = vma->vm_mm;
 
	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
-		unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted,
-				 details);
+		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
 }
 
 /**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
+ * @start: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Caller must protect the VMA list
 */
-void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range(struct vm_area_struct *vma, unsigned long start,
		unsigned long size, struct zap_details *details)
 {
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
-	unsigned long end = address + size;
-	unsigned long nr_accounted = 0;
+	unsigned long end = start + size;
 
	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, 0);
+	tlb_gather_mmu(&tlb, mm, start, end);
	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
-	tlb_finish_mmu(&tlb, address, end);
+	mmu_notifier_invalidate_range_start(mm, start, end);
+	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+		unmap_single_vma(&tlb, vma, start, end, details);
+	mmu_notifier_invalidate_range_end(mm, start, end);
+	tlb_finish_mmu(&tlb, start, end);
 }
 
 /**
@@ -1406,13 +1424,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	unsigned long end = address + size;
-	unsigned long nr_accounted = 0;
 
	lru_add_drain();
-	tlb_gather_mmu(&tlb, mm, 0);
+	tlb_gather_mmu(&tlb, mm, address, end);
	update_hiwater_rss(mm);
	mmu_notifier_invalidate_range_start(mm, address, end);
-	unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details);
+	unmap_single_vma(&tlb, vma, address, end, details);
	mmu_notifier_invalidate_range_end(mm, address, end);
	tlb_finish_mmu(&tlb, address, end);
 }
@@ -1441,10 +1458,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
 /**
- * follow_page - look up a page descriptor from a user-virtual address
+ * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
+ * @page_mask: on output, *page_mask is set according to the size of the page
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
-			unsigned int flags)
+struct page *follow_page_mask(struct vm_area_struct *vma,
+			      unsigned long address, unsigned int flags,
+			      unsigned int *page_mask)
 {
	pgd_t *pgd;
	pud_t *pud;
@@ -1463,6 +1482,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;
 
+	*page_mask = 0;
+
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
@@ -1478,7 +1499,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
	if (pud_none(*pud))
		goto no_page_table;
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
-		BUG_ON(flags & FOLL_GET);
+		if (flags & FOLL_GET)
+			goto out;
		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
		goto out;
	}
@@ -1489,28 +1511,43 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
	if (pmd_none(*pmd))
		goto no_page_table;
	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
-		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+		if (flags & FOLL_GET) {
+			/*
+			 * Refcounts on tail pages are not well-defined and
+			 * shouldn't be taken. The caller should handle a NULL
+			 * return when trying to follow tail pages.
+			 */
+			if (PageHead(page))
+				get_page(page);
+			else {
+				page = NULL;
+				goto out;
+			}
+		}
		goto out;
	}
+	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+		goto no_page_table;
	if (pmd_trans_huge(*pmd)) {
		if (flags & FOLL_SPLIT) {
-			split_huge_page_pmd(mm, pmd);
+			split_huge_page_pmd(vma, address, pmd);
			goto split_fallthrough;
		}
-		spin_lock(&mm->page_table_lock);
+		ptl = pmd_lock(mm, pmd);
		if (likely(pmd_trans_huge(*pmd))) {
			if (unlikely(pmd_trans_splitting(*pmd))) {
-				spin_unlock(&mm->page_table_lock);
+				spin_unlock(ptl);
				wait_split_huge_page(vma->anon_vma, pmd);
			} else {
-				page = follow_trans_huge_pmd(mm, address,
+				page = follow_trans_huge_pmd(vma, address,
							     pmd, flags);
-				spin_unlock(&mm->page_table_lock);
+				spin_unlock(ptl);
+				*page_mask = HPAGE_PMD_NR - 1;
				goto out;
			}
		} else
-			spin_unlock(&mm->page_table_lock);
+			spin_unlock(ptl);
		/* fall through */
	}
 split_fallthrough:
@@ -1520,7 +1557,25 @@ split_fallthrough:
	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 
	pte = *ptep;
-	if (!pte_present(pte))
+	if (!pte_present(pte)) {
+		swp_entry_t entry;
+		/*
+		 * KSM's break_ksm() relies upon recognizing a ksm page
+		 * even while it is being migrated, so for that case we
+		 * need migration_entry_wait().
+		 */
+		if (likely(!(flags & FOLL_MIGRATION)))
+			goto no_page;
+		if (pte_none(pte) || pte_file(pte))
+			goto no_page;
+		entry = pte_to_swp_entry(pte);
+		if (!is_migration_entry(entry))
+			goto no_page;
+		pte_unmap_unlock(ptep, ptl);
+		migration_entry_wait(mm, pmd, address);
+		goto split_fallthrough;
+	}
+	if ((flags & FOLL_NUMA) && pte_numa(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;
@@ -1559,12 +1614,12 @@ split_fallthrough:
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();	 /* push cached pages to LRU */
			/*
-			 * Because we lock page here and migration is
-			 * blocked by the pte's page reference, we need
-			 * only check for file-cache page truncation.
+			 * Because we lock page here, and migration is
+			 * blocked by the pte's page reference, and we
+			 * know the page is still mapped, we don't even
+			 * need to check for file-cache page truncation.
			 */
-			if (page->mapping)
-				mlock_vma_page(page);
+			mlock_vma_page(page);
			unlock_page(page);
		}
	}
@@ -1652,27 +1707,32 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int nr_pages, unsigned int gup_flags,
-		     struct page **pages, struct vm_area_struct **vmas,
-		     int *nonblocking)
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, unsigned long nr_pages,
+		unsigned int gup_flags, struct page **pages,
+		struct vm_area_struct **vmas, int *nonblocking)
 {
-	int i;
+	long i;
	unsigned long vm_flags;
+	unsigned int page_mask;
 
-	if (nr_pages <= 0)
+	if (!nr_pages)
		return 0;
 
	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
 
	/*
-	 * Require read or write permissions.
-	 * If FOLL_FORCE is set, we only require the "MAY" flags.
+	 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+	 * would be called on PROT_NONE ranges. We must never invoke
+	 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+	 * page faults would unprotect the PROT_NONE ranges if
+	 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+	 * bitflag. So to avoid that, don't set FOLL_NUMA if
+	 * FOLL_FORCE is set.
	 */
-	vm_flags  = (gup_flags & FOLL_WRITE) ?
-			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	vm_flags &= (gup_flags & FOLL_FORCE) ?
-			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	if (!(gup_flags & FOLL_FORCE))
+		gup_flags |= FOLL_NUMA;
+
	i = 0;
 
	do {
@@ -1688,7 +1748,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
			/* user gate pages are read-only */
			if (gup_flags & FOLL_WRITE)
-				return i ? : -EFAULT;
+				goto efault;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
@@ -1698,12 +1758,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
-				return i ? : -EFAULT;
+				goto efault;
			VM_BUG_ON(pmd_trans_huge(*pmd));
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
-				return i ? : -EFAULT;
+				goto efault;
			}
			vma = get_gate_vma(mm);
			if (pages) {
@@ -1716,20 +1776,53 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
					page = pte_page(*pte);
				else {
					pte_unmap(pte);
-					return i ? : -EFAULT;
+					goto efault;
				}
				}
				pages[i] = page;
				get_page(page);
			}
			pte_unmap(pte);
+			page_mask = 0;
			goto next_page;
		}
 
-		if (!vma ||
-		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
-		    !(vm_flags & vma->vm_flags))
-			return i ? : -EFAULT;
+		if (!vma)
+			goto efault;
+		vm_flags = vma->vm_flags;
+		if (vm_flags & (VM_IO | VM_PFNMAP))
+			goto efault;
 
+		if (gup_flags & FOLL_WRITE) {
+			if (!(vm_flags & VM_WRITE)) {
+				if (!(gup_flags & FOLL_FORCE))
+					goto efault;
+				/*
+				 * We used to let the write,force case do COW
+				 * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so
+				 * ptrace could set a breakpoint in a read-only
+				 * mapping of an executable, without corrupting
+				 * the file (yet only when that file had been
+				 * opened for writing!). Anon pages in shared
+				 * mappings are surprising: now just reject it.
+				 */
+				if (!is_cow_mapping(vm_flags)) {
+					WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+					goto efault;
+				}
+			}
+		} else {
+			if (!(vm_flags & VM_READ)) {
+				if (!(gup_flags & FOLL_FORCE))
+					goto efault;
+				/*
+				 * Is there actually any vma we can reach here
+				 * which does not have VM_MAYREAD set?
+				 */
+				if (!(vm_flags & VM_MAYREAD))
+					goto efault;
+			}
+		}
 
		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
@@ -1740,6 +1833,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		do {
			struct page *page;
			unsigned int foll_flags = gup_flags;
+			unsigned int page_increm;
 
			/*
			 * If we have a pending SIGKILL, don't keep faulting
@@ -1749,7 +1843,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
				return i ? i : -ERESTARTSYS;
			cond_resched();
-			while (!(page = follow_page_mask(vma, start,
-						foll_flags, &page_mask))) {
+			while (!(page = follow_page_mask(vma, start,
+						foll_flags, &page_mask))) {
				int ret;
				unsigned int fault_flags = 0;
@@ -1781,7 +1876,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
						return -EFAULT;
				}
				if (ret & VM_FAULT_SIGBUS)
-					return i ? i : -EFAULT;
+					goto efault;
				BUG();
			}
@@ -1823,16 +1918,24 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
+				page_mask = 0;
			}
 next_page:
-			if (vmas)
+			if (vmas) {
				vmas[i] = vma;
-			i++;
-			start += PAGE_SIZE;
-			nr_pages--;
+				page_mask = 0;
+			}
+			page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+			if (page_increm > nr_pages)
+				page_increm = nr_pages;
+			i += page_increm;
+			start += page_increm * PAGE_SIZE;
+			nr_pages -= page_increm;
		} while (nr_pages && start < vma->vm_end);
	} while (nr_pages);
 
	return i;
+efault:
+	return i ? : -EFAULT;
 }
 EXPORT_SYMBOL(__get_user_pages);
 
@@ -1867,12 +1970,17 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags)
 {
	struct vm_area_struct *vma;
+	vm_flags_t vm_flags;
	int ret;
 
	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start)
		return -EFAULT;
 
+	vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
+	if (!(vm_flags & vma->vm_flags))
+		return -EFAULT;
+
	ret = handle_mm_fault(mm, vma, address, fault_flags);
	if (ret & VM_FAULT_ERROR) {
		if (ret & VM_FAULT_OOM)
@@ -1900,9 +2008,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to by the caller
- * @force:	whether to force write access even if user mapping is
- *		readonly. This will result in the page being COWed even
- *		in MAP_SHARED mappings. You do not want this.
+ * @force:	whether to force access even when user mapping is currently
+ *		protected (but never forces write access to shared mapping).
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
@@ -1943,9 +2050,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 *
 * See also get_user_pages_fast, for performance critical applications.
 */
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		unsigned long start, int nr_pages, int write, int force,
-		struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, unsigned long nr_pages, int write,
+		int force, struct page **pages, struct vm_area_struct **vmas)
 {
	int flags = FOLL_TOUCH;
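
The page_increm arithmetic introduced above lets GUP step over a whole compound page per iteration: follow_page_mask() reports the page's mask (e.g. 511 for a 2MB THP), and 1 + (~(start >> PAGE_SHIFT) & page_mask) yields the number of base pages from start to the end of that compound page. A userspace check of the arithmetic, with PAGE_SHIFT assumed to be 12:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Pages from 'start' to the end of the compound page it lies in. */
static unsigned long page_increm(unsigned long start, unsigned int page_mask)
{
	return 1 + (~(start >> PAGE_SHIFT) & page_mask);
}

int main(void)
{
	/* 2MB THP = 512 base pages, mask 511; start 5 pages into it */
	printf("%lu\n", page_increm(5UL << PAGE_SHIFT, 511));	/* 507 */
	/* ordinary 4KB page: mask 0, always advance by one */
	printf("%lu\n", page_increm(5UL << PAGE_SHIFT, 0));	/* 1 */
	return 0;
}
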
@@ -2068,6 +2175,11 @@ out:
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
+ *
+ * Usually this function is called from f_op->mmap() handler
+ * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
+ * Caller must set VM_MIXEDMAP on vma if it wants to call this
+ * function from other places, for example from page-fault handler.
 */
 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
@@ -2076,7 +2188,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
-	vma->vm_flags |= VM_INSERTPAGE;
+	if (!(vma->vm_flags & VM_MIXEDMAP)) {
+		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+		BUG_ON(vma->vm_flags & VM_PFNMAP);
+		vma->vm_flags |= VM_MIXEDMAP;
+	}
	return insert_page(vma, addr, page, vma->vm_page_prot);
 }
 EXPORT_SYMBOL(vm_insert_page);
@@ -2115,7 +2231,7 @@ out:
 * @addr:	target user address of this page
 * @pfn:	source kernel pfn
 *
- * Similar to vm_inert_page, this allows drivers to insert individual pages
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
@@ -2145,14 +2261,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
-	if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
+	if (track_pfn_insert(vma, &pgprot, pfn))
		return -EINVAL;
 
	ret = insert_pfn(vma, addr, pfn, pgprot);
 
-	if (ret)
-		untrack_pfn_vma(vma, pfn, PAGE_SIZE);
-
	return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn);
@@ -2273,37 +2386,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
-	 *   VM_RESERVED is specified all over the place, because
-	 *	in 2.4 it kept swapout's vma scan off this vma; but
-	 *	in 2.6 the LRU scan won't even find its pages, so this
-	 *	flag means no more than count its pages in reserved_vm,
-	 *	and omit it from core dump, even when VM_IO turned off.
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
+	 *   VM_DONTEXPAND
+	 *      Disable vma merging and expanding with mremap().
+	 *   VM_DONTDUMP
+	 *      Omit vma from core dump, even when VM_IO turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
+	 * See vm_normal_page() for details.
	 */
-	if (addr == vma->vm_start && end == vma->vm_end) {
+	if (is_cow_mapping(vma->vm_flags)) {
+		if (addr != vma->vm_start || end != vma->vm_end)
+			return -EINVAL;
		vma->vm_pgoff = pfn;
-		vma->vm_flags |= VM_PFN_AT_MMAP;
-	} else if (is_cow_mapping(vma->vm_flags))
-		return -EINVAL;
-
-	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+	}
 
-	err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
-	if (err) {
-		/*
-		 * To indicate that track_pfn related cleanup is not
-		 * needed from higher level routine calling unmap_vmas
-		 */
-		vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
-		vma->vm_flags &= ~VM_PFN_AT_MMAP;
+	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
+	if (err)
		return -EINVAL;
-	}
+
+	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
 
	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
@@ -2318,12 +2424,59 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
	} while (pgd++, addr = next, addr != end);
 
	if (err)
-		untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
+		untrack_pfn(vma, pfn, PAGE_ALIGN(size));
 
	return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
+/**
+ * vm_iomap_memory - remap memory to userspace
+ * @vma: user vma to map to
+ * @start: start of area
+ * @len: size of area
+ *
+ * This is a simplified io_remap_pfn_range() for common driver use. The
+ * driver just needs to give us the physical memory range to be mapped,
+ * we'll figure out the rest from the vma information.
+ *
+ * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
+ * whatever write-combining details or similar.
+ */
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
+{
+	unsigned long vm_len, pfn, pages;
+
+	/* Check that the physical memory area passed in looks valid */
+	if (start + len < start)
+		return -EINVAL;
+	/*
+	 * You *really* shouldn't map things that aren't page-aligned,
+	 * but we've historically allowed it because IO memory might
+	 * just have smaller alignment.
+	 */
+	len += start & ~PAGE_MASK;
+	pfn = start >> PAGE_SHIFT;
+	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
+	if (pfn + pages < pfn)
+		return -EINVAL;
+
+	/* We start the mapping 'vm_pgoff' pages into the area */
+	if (vma->vm_pgoff > pages)
+		return -EINVAL;
+	pfn += vma->vm_pgoff;
+	pages -= vma->vm_pgoff;
+
+	/* Can we fit all of the mapping? */
+	vm_len = vma->vm_end - vma->vm_start;
+	if (vm_len >> PAGE_SHIFT > pages)
+		return -EINVAL;
+
+	/* Ok, let it rip */
+	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_iomap_memory);
+
 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data)
@@ -2452,6 +2605,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
+	debug_dma_assert_idle(src);
+
	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
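
For context, a hedged sketch of how a driver's mmap handler might use the vm_iomap_memory() helper added above. It is not compilable outside a kernel tree, my_dev and its fields are hypothetical, and the pgprot tweak is just the optional step the function's own NOTE mentions:

/* All names below (my_dev, phys_base, region_len) are hypothetical. */
struct my_dev {
	phys_addr_t phys_base;		/* start of the device's memory region */
	unsigned long region_len;	/* length of that region in bytes */
};

static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct my_dev *dev = file->private_data;

	/* optionally adjust caching first, as the NOTE above suggests */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	/* the helper validates vm_pgoff and the vma length for us */
	return vm_iomap_memory(vma, dev->phys_base, dev->region_len);
}
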
@@ -2477,6 +2632,38 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 }
 
 /*
+ * Notify the address space that the page is about to become writable so that
+ * it can prohibit this or wait for the page to get into an appropriate state.
+ *
+ * We do this without the lock held, so that it can sleep if it needs to.
+ */
+static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+	       unsigned long address)
+{
+	struct vm_fault vmf;
+	int ret;
+
+	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+	vmf.pgoff = page->index;
+	vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+	vmf.page = page;
+
+	ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+		return ret;
+	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
+		lock_page(page);
+		if (!page->mapping) {
+			unlock_page(page);
+			return 0; /* retry */
+		}
+		ret |= VM_FAULT_LOCKED;
+	} else
+		VM_BUG_ON_PAGE(!PageLocked(page), page);
+	return ret;
+}
+
+/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
@@ -2499,11 +2686,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		spinlock_t *ptl, pte_t orig_pte)
	__releases(ptl)
 {
-	struct page *old_page, *new_page;
+	struct page *old_page, *new_page = NULL;
	pte_t entry;
	int ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;
+	unsigned long mmun_start = 0;	/* For mmu_notifiers */
+	unsigned long mmun_end = 0;	/* For mmu_notifiers */
 
	old_page = vm_normal_page(vma, address, orig_pte);
	if (!old_page) {
@@ -2556,42 +2745,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
	 * get_user_pages(.write=1, .force=1).
	 */
	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
-		struct vm_fault vmf;
		int tmp;
-
-		vmf.virtual_address = (void __user *)(address &
-								PAGE_MASK);
-		vmf.pgoff = old_page->index;
-		vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
-		vmf.page = old_page;
-
-		/*
-		 * Notify the address space that the page is about to
-		 * become writable so that it can prohibit this or wait
-		 * for the page to get into an appropriate state.
-		 *
-		 * We do this without the lock held, so that it can
-		 * sleep if it needs to.
-		 */
		page_cache_get(old_page);
		pte_unmap_unlock(page_table, ptl);
-
-		tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
-		if (unlikely(tmp &
-			     (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
-			ret = tmp;
-			goto unwritable_page;
+		tmp = do_page_mkwrite(vma, old_page, address);
+		if (unlikely(!tmp || (tmp &
+				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+			page_cache_release(old_page);
+			return tmp;
		}
-		if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
-			lock_page(old_page);
-			if (!old_page->mapping) {
-				ret = 0; /* retry the fault */
-				unlock_page(old_page);
-				goto unwritable_page;
-			}
-		} else
-			VM_BUG_ON(!PageLocked(old_page));
-
		/*
		 * Since we dropped the lock we need to revalidate
		 * the PTE as someone else may have changed it.  If
@@ -2611,6 +2773,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		get_page(dirty_page);
 
 reuse:
+		/*
+		 * Clear the pages cpupid information as the existing
+		 * information potentially belongs to a now completely
+		 * unrelated process.
+		 */
+		if (old_page)
+			page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
+
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2628,11 +2798,14 @@ reuse:
		 * bit after it clear all dirty ptes, but before a racing
		 * do_wp_page installs a dirty pte.
		 *
-		 * __do_fault is protected similarly.
+		 * do_shared_fault is protected similarly.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
-			set_page_dirty_balance(dirty_page, page_mkwrite);
+			set_page_dirty_balance(dirty_page);
+			/* file_update_time outside page_lock */
+			if (vma->vm_file)
+				file_update_time(vma->vm_file);
		}
		put_page(dirty_page);
		if (page_mkwrite) {
@@ -2650,10 +2823,6 @@ reuse:
		}
	}
 
-	/* file_update_time outside page_lock */
-	if (vma->vm_file)
-		file_update_time(vma->vm_file);
-
	return ret;
 }
 
@@ -2679,9 +2848,13 @@ gotten:
	}
	__SetPageUptodate(new_page);
 
-	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
+	if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
		goto oom_free_new;
 
+	mmun_start  = address & PAGE_MASK;
+	mmun_end    = mmun_start + PAGE_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
	/*
	 * Re-check the pte - we dropped the lock
	 */
@@ -2748,6 +2921,8 @@ gotten:
		page_cache_release(new_page);
 unlock:
	pte_unmap_unlock(page_table, ptl);
+	if (mmun_end > mmun_start)
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	if (old_page) {
		/*
		 * Don't let another task, with possibly unlocked vma,
@@ -2764,18 +2939,9 @@ unlock:
 oom_free_new:
	page_cache_release(new_page);
 oom:
-	if (old_page) {
-		if (page_mkwrite) {
-			unlock_page(old_page);
-			page_cache_release(old_page);
-		}
+	if (old_page)
		page_cache_release(old_page);
-	}
 
	return VM_FAULT_OOM;
-
-unwritable_page:
-	page_cache_release(old_page);
-	return ret;
 }
 
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2785,18 +2951,17 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
 }
 
-static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
+static inline void unmap_mapping_range_tree(struct rb_root *root,
					    struct zap_details *details)
 {
	struct vm_area_struct *vma;
-	struct prio_tree_iter iter;
	pgoff_t vba, vea, zba, zea;
 
-	vma_prio_tree_foreach(vma, &iter, root,
+	vma_interval_tree_foreach(vma, root,
			details->first_index, details->last_index) {
 
		vba = vma->vm_pgoff;
-		vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
+		vea = vba + vma_pages(vma) - 1;
		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
		zba = details->first_index;
		if (zba < vba)
@@ -2823,7 +2988,7 @@ static inline void unmap_mapping_range_list(struct list_head *head,
	 * across *all* the pages in each nonlinear VMA, not just the pages
	 * whose virtual address lies outside the file truncation point.
	 */
-	list_for_each_entry(vma, head, shared.vm_set.list) {
+	list_for_each_entry(vma, head, shared.nonlinear) {
		details->nonlinear_vma = vma;
		unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
	}
@@ -2867,7 +3032,7 @@ void unmap_mapping_range(struct address_space *mapping,
 
	mutex_lock(&mapping->i_mmap_mutex);
-	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
@@ -2885,7 +3050,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned int flags, pte_t orig_pte)
 {
	spinlock_t *ptl;
-	struct page *page, *swapcache = NULL;
+	struct page *page, *swapcache;
	swp_entry_t entry;
	pte_t pte;
	int locked;
@@ -2911,7 +3076,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry);
	if (!page) {
-		grab_swap_token(mm); /* Contend for token _before_ read-in */
		page = swapin_readahead(entry,
					GFP_HIGHUSER_MOVABLE, vma, address);
		if (!page) {
@@ -2937,10 +3101,13 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+		swapcache = page;
		goto out_release;
	}
 
+	swapcache = page;
	locked = lock_page_or_retry(page, mm, flags);
+
	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
	if (!locked) {
		ret |= VM_FAULT_RETRY;
@@ -2956,16 +3123,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
	if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
		goto out_page;
 
-	if (ksm_might_need_to_copy(page, vma, address)) {
-		swapcache = page;
-		page = ksm_does_need_to_copy(page, vma, address);
-
-		if (unlikely(!page)) {
-			ret = VM_FAULT_OOM;
-			page = swapcache;
-			swapcache = NULL;
-			goto out_page;
-		}
+	page = ksm_might_need_to_copy(page, vma, address);
+	if (unlikely(!page)) {
+		ret = VM_FAULT_OOM;
+		page = swapcache;
+		goto out_page;
	}
 
	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -3009,8 +3171,13 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
		exclusive = 1;
	}
	flush_icache_page(vma, page);
+	if (pte_swp_soft_dirty(orig_pte))
+		pte = pte_mksoft_dirty(pte);
	set_pte_at(mm, address, page_table, pte);
-	do_page_add_anon_rmap(page, vma, address, exclusive);
+	if (page == swapcache)
+		do_page_add_anon_rmap(page, vma, address, exclusive);
+	else /* ksm created a completely new copy */
+		page_add_new_anon_rmap(page, vma, address);
	/* It's better to call commit-charge after rmap is established */
	mem_cgroup_commit_charge_swapin(page, ptr);
 
@@ -3018,7 +3185,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);
	unlock_page(page);
-	if (swapcache) {
+	if (page != swapcache) {
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
@@ -3051,7 +3218,7 @@ out_page:
	unlock_page(page);
 out_release:
	page_cache_release(page);
-	if (swapcache) {
+	if (page != swapcache) {
		unlock_page(swapcache);
		page_cache_release(swapcache);
	}
@@ -3127,9 +3294,14 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		goto oom;
__SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ __SetPageUptodate(page); - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -3160,53 +3332,11 @@ oom: return VM_FAULT_OOM; } -/* - * __do_fault() tries to create a new page mapping. It aggressively - * tries to share with existing pages, but makes a separate copy if - * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid - * the next page fault. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte neither mapped nor locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - */ -static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int __do_fault(struct vm_area_struct *vma, unsigned long address, + pgoff_t pgoff, unsigned int flags, struct page **page) { - pte_t *page_table; - spinlock_t *ptl; - struct page *page; - struct page *cow_page; - pte_t entry; - int anon = 0; - struct page *dirty_page = NULL; struct vm_fault vmf; int ret; - int page_mkwrite = 0; - - /* - * If we do COW later, allocate page befor taking lock_page() - * on the file cache page. This will reduce lock holding time. - */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; - - cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!cow_page) - return VM_FAULT_OOM; - - if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { - page_cache_release(cow_page); - return VM_FAULT_OOM; - } - } else - cow_page = NULL; vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; @@ -3214,147 +3344,304 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | - VM_FAULT_RETRY))) - goto uncharge_out; + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - ret = VM_FAULT_HWPOISON; - goto uncharge_out; + page_cache_release(vmf.page); + return VM_FAULT_HWPOISON; } - /* - * For consistency in subsequent calls, make the faulted page always - * locked. - */ if (unlikely(!(ret & VM_FAULT_LOCKED))) lock_page(vmf.page); else - VM_BUG_ON(!PageLocked(vmf.page)); + VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + + *page = vmf.page; + return ret; +} + +/** + * do_set_pte - setup new PTE entry for given page and add reverse page mapping. + * + * @vma: virtual memory area + * @address: user virtual address + * @page: page to map + * @pte: pointer to target page table entry + * @write: true, if new entry is writable + * @anon: true, if it's anonymous page + * + * Caller must hold page table lock relevant for @pte. + * + * Target users are the page handler itself and implementations of + * vm_ops->map_pages.
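+ *
+ * A minimal sketch of a caller, for illustration only (the real users
+ * are the do_*_fault() handlers below; races and error handling are
+ * omitted):
+ *
+ *	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ *	if (pte_none(*pte))
+ *		do_set_pte(vma, address, page, pte, false, false);
+ *	pte_unmap_unlock(pte, ptl);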
+ */ +void do_set_pte(struct vm_area_struct *vma, unsigned long address, + struct page *page, pte_t *pte, bool write, bool anon) +{ + pte_t entry; + + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) + entry = pte_mksoft_dirty(entry); + if (anon) { + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + } else { + inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + page_add_file_rmap(page); + } + set_pte_at(vma->vm_mm, address, pte, entry); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); +} + +#define FAULT_AROUND_ORDER 4 + +#ifdef CONFIG_DEBUG_FS +static unsigned int fault_around_order = FAULT_AROUND_ORDER; + +static int fault_around_order_get(void *data, u64 *val) +{ + *val = fault_around_order; + return 0; +} + +static int fault_around_order_set(void *data, u64 val) +{ + BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); + if (1UL << val > PTRS_PER_PTE) + return -EINVAL; + fault_around_order = val; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, + fault_around_order_get, fault_around_order_set, "%llu\n"); + +static int __init fault_around_debugfs(void) +{ + void *ret; + + ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, + &fault_around_order_fops); + if (!ret) + pr_warn("Failed to create fault_around_order in debugfs\n"); + return 0; +} +late_initcall(fault_around_debugfs); + +static inline unsigned long fault_around_pages(void) +{ + return 1UL << fault_around_order; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); +} +#else +static inline unsigned long fault_around_pages(void) +{ + unsigned long nr_pages; + + nr_pages = 1UL << FAULT_AROUND_ORDER; + BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); + return nr_pages; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); +} +#endif + +static void do_fault_around(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pgoff_t pgoff, unsigned int flags) +{ + unsigned long start_addr; + pgoff_t max_pgoff; + struct vm_fault vmf; + int off; + + start_addr = max(address & fault_around_mask(), vma->vm_start); + off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + pte -= off; + pgoff -= off; /* + * max_pgoff is either end of page table or end of vma + * or fault_around_pages() from pgoff, depending on what is nearest.
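+ *
+ * For example, with 4K pages, PTRS_PER_PTE == 512 and the default
+ * order 4 (fault_around_pages() == 16), a fault at 0x10a03000 in a
+ * vma starting below that address gives start_addr == 0x10a00000 and
+ * off == 3, so the window covers 0x10a00000..0x10a0ffff unless the
+ * vma or the page table ends first.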
*/ - page = vmf.page; - if (flags & FAULT_FLAG_WRITE) { - if (!(vma->vm_flags & VM_SHARED)) { - page = cow_page; - anon = 1; - copy_user_highpage(page, vmf.page, address, vma); - __SetPageUptodate(page); - } else { - /* - * If the page will be shareable, see if the backing - * address space wants to know that the page is about - * to become writable - */ - if (vma->vm_ops->page_mkwrite) { - int tmp; - - unlock_page(page); - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - ret = tmp; - goto unwritable_page; - } - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(page); - if (!page->mapping) { - ret = 0; /* retry the fault */ - unlock_page(page); - goto unwritable_page; - } - } else - VM_BUG_ON(!PageLocked(page)); - page_mkwrite = 1; - } - } - + max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + PTRS_PER_PTE - 1; + max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, + pgoff + fault_around_pages() - 1); + + /* Check if it makes any sense to call ->map_pages */ + while (!pte_none(*pte)) { + if (++pgoff > max_pgoff) + return; + start_addr += PAGE_SIZE; + if (start_addr >= vma->vm_end) + return; + pte++; } - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + vmf.virtual_address = (void __user *) start_addr; + vmf.pte = pte; + vmf.pgoff = pgoff; + vmf.max_pgoff = max_pgoff; + vmf.flags = flags; + vma->vm_ops->map_pages(vma, &vmf); +} + +static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + spinlock_t *ptl; + pte_t *pte; + int ret = 0; /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if FAULT_FLAG_WRITE is set, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. + * Let's call ->map_pages() first and use ->fault() as fallback + * if page by the offset is not ready to be mapped (cold cache or + * something). */ - /* Only go through if we didn't race with anybody else... 
*/ - if (likely(pte_same(*page_table, orig_pte))) { - flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); - if (flags & FAULT_FLAG_WRITE) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (anon) { - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - } else { - inc_mm_counter_fast(mm, MM_FILEPAGES); - page_add_file_rmap(page); - if (flags & FAULT_FLAG_WRITE) { - dirty_page = page; - get_page(dirty_page); - } - } - set_pte_at(mm, address, page_table, entry); + if (vma->vm_ops->map_pages) { + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + do_fault_around(vma, address, pte, pgoff, flags); + if (!pte_same(*pte, orig_pte)) + goto unlock_out; + pte_unmap_unlock(pte, ptl); + } - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, page_table); - } else { - if (cow_page) - mem_cgroup_uncharge_page(cow_page); - if (anon) - page_cache_release(page); - else - anon = 1; /* no anon but release faulted_page */ + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; } + do_set_pte(vma, address, fault_page, pte, false, false); + unlock_page(fault_page); +unlock_out: + pte_unmap_unlock(pte, ptl); + return ret; +} - pte_unmap_unlock(page_table, ptl); +static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page, *new_page; + spinlock_t *ptl; + pte_t *pte; + int ret; - if (dirty_page) { - struct address_space *mapping = page->mapping; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; - if (set_page_dirty(dirty_page)) - page_mkwrite = 1; - unlock_page(dirty_page); - put_page(dirty_page); - if (page_mkwrite && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + return VM_FAULT_OOM; - /* file_update_time outside page_lock */ - if (vma->vm_file) - file_update_time(vma->vm_file); - } else { - unlock_page(vmf.page); - if (anon) - page_cache_release(vmf.page); + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { + page_cache_release(new_page); + return VM_FAULT_OOM; } - return ret; + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + goto uncharge_out; -unwritable_page: - page_cache_release(page); + copy_user_highpage(new_page, fault_page, address, vma); + __SetPageUptodate(new_page); + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + goto uncharge_out; + } + do_set_pte(vma, address, new_page, pte, true, true); + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); return ret; uncharge_out: - /* fs's fault handler get error */ - if (cow_page) { - mem_cgroup_uncharge_page(cow_page); - page_cache_release(cow_page); + mem_cgroup_uncharge_page(new_page); + page_cache_release(new_page); + return ret; +} + +static int 
do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + struct address_space *mapping; + spinlock_t *ptl; + pte_t *pte; + int dirtied = 0; + int ret, tmp; + + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + /* + * Check if the backing address space wants to know that the page is + * about to become writable + */ + if (vma->vm_ops->page_mkwrite) { + unlock_page(fault_page); + tmp = do_page_mkwrite(vma, fault_page, address); + if (unlikely(!tmp || + (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(fault_page); + return tmp; + } } + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; + } + do_set_pte(vma, address, fault_page, pte, true, false); + pte_unmap_unlock(pte, ptl); + + if (set_page_dirty(fault_page)) + dirtied = 1; + mapping = fault_page->mapping; + unlock_page(fault_page); + if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + /* file_update_time outside page_lock */ + if (vma->vm_file && !vma->vm_ops->page_mkwrite) + file_update_time(vma->vm_file); + return ret; } @@ -3366,7 +3653,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pte_unmap(page_table); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + if (!(flags & FAULT_FLAG_WRITE)) + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + if (!(vma->vm_flags & VM_SHARED)) + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -3398,7 +3691,103 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } pgoff = pte_to_pgoff(orig_pte); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + if (!(flags & FAULT_FLAG_WRITE)) + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + if (!(vma->vm_flags & VM_SHARED)) + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +} + +static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, + unsigned long addr, int page_nid, + int *flags) +{ + get_page(page); + + count_vm_numa_event(NUMA_HINT_FAULTS); + if (page_nid == numa_node_id()) { + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + *flags |= TNF_FAULT_LOCAL; + } + + return mpol_misplaced(page, vma, addr); +} + +static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +{ + struct page *page = NULL; + spinlock_t *ptl; + int page_nid = -1; + int last_cpupid; + int target_nid; + bool migrated = false; + int flags = 0; + + /* + * The "pte" at this point cannot be used safely without + * validation through pte_unmap_same(). It's of NUMA type but + * the pfn may be screwed if the read is non atomic. 
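+ * (E.g. on 32-bit PAE a pte is two machine words, so an unlocked
+ * read can observe a half-updated entry.)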
+ * + * ptep_modify_prot_start is not called as this is clearing + * the _PAGE_NUMA bit and it is not really expected that there + * would be concurrent hardware modifications to the PTE. + */ + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*ptep, pte))) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + + pte = pte_mknonnuma(pte); + set_pte_at(mm, addr, ptep, pte); + update_mmu_cache(vma, addr, ptep); + + page = vm_normal_page(vma, addr, pte); + if (!page) { + pte_unmap_unlock(ptep, ptl); + return 0; + } + BUG_ON(is_zero_pfn(page_to_pfn(page))); + + /* + * Avoid grouping on DSO/COW pages in specific and RO pages + * in general, RO pages shouldn't hurt as much anyway since + * they can be in shared cache state. + */ + if (!pte_write(pte)) + flags |= TNF_NO_GROUP; + + /* + * Flag if the page is shared between multiple address spaces. This + * is later used when determining whether to group tasks together + */ + if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) + flags |= TNF_SHARED; + + last_cpupid = page_cpupid_last(page); + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); + pte_unmap_unlock(ptep, ptl); + if (target_nid == -1) { + put_page(page); + goto out; + } + + /* Migrate to the requested node */ + migrated = migrate_misplaced_page(page, vma, target_nid); + if (migrated) { + page_nid = target_nid; + flags |= TNF_MIGRATED; + } + +out: + if (page_nid != -1) + task_numa_fault(last_cpupid, page_nid, 1, flags); + return 0; } /* @@ -3414,7 +3803,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -int handle_pte_fault(struct mm_struct *mm, +static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *pte, pmd_t *pmd, unsigned int flags) { @@ -3439,6 +3828,9 @@ int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } + if (pte_numa(entry)) + return do_numa_page(mm, vma, address, entry, pte, pmd); + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) @@ -3470,22 +3862,14 @@ unlock: /* * By the time we get here, we already hold the mm semaphore */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - __set_current_state(TASK_RUNNING); - - count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(mm, PGFAULT); - - /* do counter updates before entering really critical section. 
*/ - check_sync_rss_stat(current); - if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); @@ -3497,28 +3881,55 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pmd) return VM_FAULT_OOM; if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + int ret = VM_FAULT_FALLBACK; if (!vma->vm_ops) - return do_huge_pmd_anonymous_page(mm, vma, address, - pmd, flags); + ret = do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; } else { pmd_t orig_pmd = *pmd; + int ret; + barrier(); if (pmd_trans_huge(orig_pmd)) { - if (flags & FAULT_FLAG_WRITE && - !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) - return do_huge_pmd_wp_page(mm, vma, address, - pmd, orig_pmd); - return 0; + unsigned int dirty = flags & FAULT_FLAG_WRITE; + + /* + * If the pmd is splitting, return and retry + * the fault. Alternative: wait until the split + * is done, and goto retry. + */ + if (pmd_trans_splitting(orig_pmd)) + return 0; + + if (pmd_numa(orig_pmd)) + return do_huge_pmd_numa_page(mm, vma, address, + orig_pmd, pmd); + + if (dirty && !pmd_write(orig_pmd)) { + ret = do_huge_pmd_wp_page(mm, vma, address, pmd, + orig_pmd); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + huge_pmd_set_accessed(mm, vma, address, pmd, + orig_pmd, dirty); + return 0; + } } } + /* THP should already have been handled */ + BUG_ON(pmd_numa(*pmd)); + /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) + if (unlikely(pmd_none(*pmd)) && + unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) @@ -3534,6 +3945,43 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + int ret; + + __set_current_state(TASK_RUNNING); + + count_vm_event(PGFAULT); + mem_cgroup_count_vm_event(mm, PGFAULT); + + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + + /* + * Enable the memcg OOM handling for faults triggered in user + * space. Kernel faults are handled more gracefully. + */ + if (flags & FAULT_FLAG_USER) + mem_cgroup_oom_enable(); + + ret = __handle_mm_fault(mm, vma, address, flags); + + if (flags & FAULT_FLAG_USER) { + mem_cgroup_oom_disable(); + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. + */ + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + mem_cgroup_oom_synchronize(false); + } + + return ret; +} + #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory.
@@ -3587,30 +4035,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -int make_pages_present(unsigned long addr, unsigned long end) -{ - int ret, len, write; - struct vm_area_struct * vma; - - vma = find_vma(current->mm, addr); - if (!vma) - return -ENOMEM; - /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. - */ - write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; - BUG_ON(addr >= end); - BUG_ON(end > vma->vm_end); - len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - if (ret < 0) - return ret; - return ret == len ? 0 : -EFAULT; -} - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) @@ -3776,6 +4200,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, return len; } +EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* @@ -3898,24 +4323,21 @@ void print_vma_addr(char *prefix, unsigned long ip) struct file *f = vma->vm_file; char *buf = (char *)__get_free_page(GFP_KERNEL); if (buf) { - char *p, *s; + char *p; p = d_path(&f->f_path, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; - s = strrchr(p, '/'); - if (s) - p = s+1; - printk("%s%s[%lx+%lx]", prefix, p, + printk("%s%s[%lx+%lx]", prefix, kbasename(p), vma->vm_start, vma->vm_end - vma->vm_start); free_page((unsigned long)buf); } } - up_read(&current->mm->mmap_sem); + up_read(&mm->mmap_sem); } -#ifdef CONFIG_PROVE_LOCKING +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) void might_fault(void) { /* @@ -3927,13 +4349,17 @@ void might_fault(void) if (segment_eq(get_fs(), KERNEL_DS)) return; - might_sleep(); /* * it would be nicer only to annotate paths which are not under * pagefault_disable, however that requires a larger audit and * providing helpers like get_user_atomic.
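* (Callers like copy_to_user() run might_fault() so that the
* might_lock_read() below lets lockdep flag a mmap_sem deadlock
* without needing the unlucky timing to actually happen.)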
*/ - if (!in_atomic() && current->mm) + if (in_atomic()) + return; + + __might_sleep(__FILE__, __LINE__, 0); + + if (current->mm) might_lock_read(&current->mm->mmap_sem); } EXPORT_SYMBOL(might_fault); @@ -4009,3 +4435,30 @@ void copy_user_huge_page(struct page *dst, struct page *src, } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS + +static struct kmem_cache *page_ptl_cachep; + +void __init ptlock_cache_init(void) +{ + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, + SLAB_PANIC, NULL); +} + +bool ptlock_alloc(struct page *page) +{ + spinlock_t *ptl; + + ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); + if (!ptl) + return false; + page->ptl = ptl; + return true; +} + +void ptlock_free(struct page *page) +{ + kmem_cache_free(page_ptl_cachep, page->ptl); +} +#endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6629fafd6ce4..a650db29606f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -9,7 +9,6 @@ #include <linux/swap.h> #include <linux/interrupt.h> #include <linux/pagemap.h> -#include <linux/bootmem.h> #include <linux/compiler.h> #include <linux/export.h> #include <linux/pagevec.h> @@ -29,6 +28,9 @@ #include <linux/suspend.h> #include <linux/mm_inline.h> #include <linux/firmware-map.h> +#include <linux/stop_machine.h> +#include <linux/hugetlb.h> +#include <linux/memblock.h> #include <asm/tlbflush.h> @@ -50,14 +52,10 @@ DEFINE_MUTEX(mem_hotplug_mutex); void lock_memory_hotplug(void) { mutex_lock(&mem_hotplug_mutex); - - /* for exclusive hibernation if CONFIG_HIBERNATION=y */ - lock_system_sleep(); } void unlock_memory_hotplug(void) { - unlock_system_sleep(); mutex_unlock(&mem_hotplug_mutex); } @@ -74,8 +72,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->end = start + size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) { - printk("System RAM resource %llx - %llx cannot be added\n", - (unsigned long long)res->start, (unsigned long long)res->end); + pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); res = NULL; } @@ -92,9 +89,8 @@ static void release_memory_resource(struct resource *res) } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -#ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type) +void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { page->lru.next = (struct list_head *) type; SetPagePrivate(page); @@ -102,9 +98,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, atomic_inc(&page->_count); } -/* reference to __meminit __free_pages_bootmem is valid - * so use __ref to tell modpost not to generate a warning */ -void __ref put_page_bootmem(struct page *page) +void put_page_bootmem(struct page *page) { unsigned long type; @@ -116,20 +110,18 @@ void __ref put_page_bootmem(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); - __free_pages_bootmem(page, 0); + free_reserved_page(page); } - } +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE +#ifndef CONFIG_SPARSEMEM_VMEMMAP static void register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long *usemap, mapsize, section_nr, i; struct mem_section *ms; struct page *page, *memmap; - if (!pfn_valid(start_pfn)) - return; - section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -157,6 +149,32 @@ static void
register_page_bootmem_info_section(unsigned long start_pfn) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } +#else /* CONFIG_SPARSEMEM_VMEMMAP */ +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long *usemap, mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + + if (!pfn_valid(start_pfn)) + return; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); + + usemap = __nr_to_section(section_nr)->pageblock_flags; + page = virt_to_page(usemap); + + mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ void register_page_bootmem_info_node(struct pglist_data *pgdat) { @@ -173,7 +191,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) zone = &pgdat->node_zones[0]; for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone->wait_table) { + if (zone_is_initialized(zone)) { nr_pages = zone->wait_table_hash_nr_entries * sizeof(wait_queue_head_t); nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; @@ -185,14 +203,21 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) } pfn = pgdat->node_start_pfn; - end_pfn = pfn + pgdat->node_spanned_pages; - - /* register_section info */ - for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) - register_page_bootmem_info_section(pfn); + end_pfn = pgdat_end_pfn(pgdat); + /* register section info */ + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + /* + * Some platforms can assign the same pfn to multiple nodes - on + * node0 as well as nodeN. To avoid registering a pfn against + * multiple nodes we check that this pfn does not already + * reside in some other nodes. 
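+ * (pfn_to_nid() reports the node that actually owns the pfn, so each
+ * section ends up registered against exactly one node.)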
+ */ + if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) + register_page_bootmem_info_section(pfn); + } } -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) @@ -201,8 +226,8 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writelock(zone); - old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - if (start_pfn < zone->zone_start_pfn) + old_zone_end_pfn = zone_end_pfn(zone); + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - @@ -211,13 +236,138 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writeunlock(zone); } +static void resize_zone(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + zone_span_writelock(zone); + + if (end_pfn - start_pfn) { + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } else { + /* + * keep it consistent with free_area_init_core(): + * if spanned_pages == 0, then keep start_pfn == 0 too + */ + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + } + + zone_span_writeunlock(zone); +} + +static void fix_zone_id(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + enum zone_type zid = zone_idx(zone); + int nid = zone->zone_pgdat->node_id; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) + set_page_links(pfn_to_page(pfn), zid, nid, pfn); +} + +/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or + * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ +static int __ref ensure_zone_is_initialized(struct zone *zone, + unsigned long start_pfn, unsigned long num_pages) +{ + if (!zone_is_initialized(zone)) + return init_currently_empty_zone(zone, start_pfn, num_pages, + MEMMAP_HOTPLUG); + return 0; +} + +static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + int ret; + unsigned long flags; + unsigned long z1_start_pfn; + + ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); + if (ret) + return ret; + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are higher than @z2 */ + if (end_pfn > zone_end_pfn(z2)) + goto out_fail; + /* the move out part must be at the leftmost of @z2 */ + if (start_pfn > z2->zone_start_pfn) + goto out_fail; + /* must include/overlap */ + if (end_pfn <= z2->zone_start_pfn) + goto out_fail; + + /* use start_pfn for z1's start_pfn if z1 is empty */ + if (!zone_is_empty(z1)) + z1_start_pfn = z1->zone_start_pfn; + else + z1_start_pfn = start_pfn; + + resize_zone(z1, z1_start_pfn, end_pfn); + resize_zone(z2, end_pfn, zone_end_pfn(z2)); + + pgdat_resize_unlock(z1->zone_pgdat, &flags); + + fix_zone_id(z1, start_pfn, end_pfn); + + return 0; +out_fail: + pgdat_resize_unlock(z1->zone_pgdat, &flags); + return -1; +} + +static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + int ret; + unsigned long flags; + unsigned long z2_end_pfn; + + ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); + if (ret) + return ret; + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are lower than @z1 */ + if (z1->zone_start_pfn > start_pfn) + goto out_fail; + /* the move out part must be at the rightmost of @z1 */ + if
(zone_end_pfn(z1) > end_pfn) + goto out_fail; + /* must include/overlap */ + if (start_pfn >= zone_end_pfn(z1)) + goto out_fail; + + /* use end_pfn for z2's end_pfn if z2 is empty */ + if (!zone_is_empty(z2)) + z2_end_pfn = zone_end_pfn(z2); + else + z2_end_pfn = end_pfn; + + resize_zone(z1, z1->zone_start_pfn, start_pfn); + resize_zone(z2, start_pfn, z2_end_pfn); + + pgdat_resize_unlock(z1->zone_pgdat, &flags); + + fix_zone_id(z2, start_pfn, end_pfn); + + return 0; +out_fail: + pgdat_resize_unlock(z1->zone_pgdat, &flags); + return -1; +} + static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long old_pgdat_end_pfn = - pgdat->node_start_pfn + pgdat->node_spanned_pages; + unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); - if (start_pfn < pgdat->node_start_pfn) + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - @@ -231,16 +381,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) int nid = pgdat->node_id; int zone_type; unsigned long flags; + int ret; zone_type = zone - pgdat->node_zones; - if (!zone->wait_table) { - int ret; + ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); + if (ret) + return ret; - ret = init_currently_empty_zone(zone, phys_start_pfn, - nr_pages, MEMMAP_HOTPLUG); - if (ret) - return ret; - } pgdat_resize_lock(zone->zone_pgdat, &flags); grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, @@ -254,13 +401,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) static int __meminit __add_section(int nid, struct zone *zone, unsigned long phys_start_pfn) { - int nr_pages = PAGES_PER_SECTION; int ret; if (pfn_valid(phys_start_pfn)) return -EEXIST; - ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); + ret = sparse_add_one_section(zone, phys_start_pfn); if (ret < 0) return ret; @@ -273,36 +419,6 @@ static int __meminit __add_section(int nid, struct zone *zone, return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } -#ifdef CONFIG_SPARSEMEM_VMEMMAP -static int __remove_section(struct zone *zone, struct mem_section *ms) -{ - /* - * XXX: Freeing memmap with vmemmap is not implement yet. - * This should be removed later. - */ - return -EBUSY; -} -#else -static int __remove_section(struct zone *zone, struct mem_section *ms) -{ - unsigned long flags; - struct pglist_data *pgdat = zone->zone_pgdat; - int ret = -EINVAL; - - if (!valid_section(ms)) - return ret; - - ret = unregister_memory_section(ms); - if (ret) - return ret; - - pgdat_resize_lock(pgdat, &flags); - sparse_remove_one_section(zone, ms); - pgdat_resize_unlock(pgdat, &flags); - return 0; -} -#endif - /* * Reasonably generic function for adding memory.
It is * expected that archs that support memory hotplug will @@ -336,6 +452,230 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } EXPORT_SYMBOL_GPL(__add_pages); +#ifdef CONFIG_MEMORY_HOTREMOVE +/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ +static int find_smallest_section_pfn(int nid, struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mem_section *ms; + + for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(start_pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (unlikely(pfn_to_nid(start_pfn) != nid)) + continue; + + if (zone && zone != page_zone(pfn_to_page(start_pfn))) + continue; + + return start_pfn; + } + + return 0; +} + +/* find the biggest valid pfn in the range [start_pfn, end_pfn). */ +static int find_biggest_section_pfn(int nid, struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mem_section *ms; + unsigned long pfn; + + /* pfn is the end pfn of a memory section. */ + pfn = end_pfn - 1; + for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (unlikely(pfn_to_nid(pfn) != nid)) + continue; + + if (zone && zone != page_zone(pfn_to_page(pfn))) + continue; + + return pfn; + } + + return 0; +} + +static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ + unsigned long zone_end_pfn = z; + unsigned long pfn; + struct mem_section *ms; + int nid = zone_to_nid(zone); + + zone_span_writelock(zone); + if (zone_start_pfn == start_pfn) { + /* + * If the section is the smallest section in the zone, we need + * to shrink zone->zone_start_pfn and zone->spanned_pages. + * In this case, we find the second smallest valid mem_section + * for shrinking the zone. + */ + pfn = find_smallest_section_pfn(nid, zone, end_pfn, + zone_end_pfn); + if (pfn) { + zone->zone_start_pfn = pfn; + zone->spanned_pages = zone_end_pfn - pfn; + } + } else if (zone_end_pfn == end_pfn) { + /* + * If the section is the biggest section in the zone, we need + * to shrink zone->spanned_pages. + * In this case, we find the second biggest valid mem_section + * for shrinking the zone. + */ + pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, + start_pfn); + if (pfn) + zone->spanned_pages = pfn - zone_start_pfn + 1; + } + + /* + * If the section is neither the biggest nor the smallest mem_section + * in the zone, it only creates a hole in the zone. In this case we + * need not change the zone span, but the zone may now consist of + * nothing but holes, so check whether any valid section is left.
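+ *
+ * (E.g. when the section being removed was the only valid section
+ * left in the zone, the scan below finds nothing and the span is
+ * reset to zero.)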
*/ + pfn = zone_start_pfn; + for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (page_zone(pfn_to_page(pfn)) != zone) + continue; + + /* If the section is current section, it continues the loop */ + if (start_pfn == pfn) + continue; + + /* If we find valid section, we have nothing to do */ + zone_span_writeunlock(zone); + return; + } + + /* The zone has no valid section */ + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + zone_span_writeunlock(zone); +} + +static void shrink_pgdat_span(struct pglist_data *pgdat, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pgdat_start_pfn = pgdat->node_start_pfn; + unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ + unsigned long pgdat_end_pfn = p; + unsigned long pfn; + struct mem_section *ms; + int nid = pgdat->node_id; + + if (pgdat_start_pfn == start_pfn) { + /* + * If the section is the smallest section in the pgdat, we need + * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. + * In this case, we find the second smallest valid mem_section + * for shrinking the pgdat. + */ + pfn = find_smallest_section_pfn(nid, NULL, end_pfn, + pgdat_end_pfn); + if (pfn) { + pgdat->node_start_pfn = pfn; + pgdat->node_spanned_pages = pgdat_end_pfn - pfn; + } + } else if (pgdat_end_pfn == end_pfn) { + /* + * If the section is the biggest section in the pgdat, we need + * to shrink pgdat->node_spanned_pages. + * In this case, we find the second biggest valid mem_section + * for shrinking the pgdat. + */ + pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, + start_pfn); + if (pfn) + pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; + } + + /* + * If the section is neither the biggest nor the smallest mem_section + * in the pgdat, it only creates a hole in the pgdat. In this case we + * need not change the pgdat span, but the pgdat may now consist of + * nothing but holes, so check whether any valid section is left.
+ */ + pfn = pgdat_start_pfn; + for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (pfn_to_nid(pfn) != nid) + continue; + + /* If the section is current section, it continues the loop */ + if (start_pfn == pfn) + continue; + + /* If we find valid section, we have nothing to do */ + return; + } + + /* The pgdat has no valid section */ + pgdat->node_start_pfn = 0; + pgdat->node_spanned_pages = 0; +} + +static void __remove_zone(struct zone *zone, unsigned long start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + int zone_type; + unsigned long flags; + + zone_type = zone - pgdat->node_zones; + + pgdat_resize_lock(zone->zone_pgdat, &flags); + shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); + shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); + pgdat_resize_unlock(zone->zone_pgdat, &flags); +} + +static int __remove_section(struct zone *zone, struct mem_section *ms) +{ + unsigned long start_pfn; + int scn_nr; + int ret = -EINVAL; + + if (!valid_section(ms)) + return ret; + + ret = unregister_memory_section(ms); + if (ret) + return ret; + + scn_nr = __section_nr(ms); + start_pfn = section_nr_to_pfn(scn_nr); + __remove_zone(zone, start_pfn); + + sparse_remove_one_section(zone, ms); + return 0; +} + /** * __remove_pages() - remove sections of pages from a zone * @zone: zone from which pages need to be removed @@ -350,8 +690,10 @@ EXPORT_SYMBOL_GPL(__add_pages); int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { - unsigned long i, ret = 0; + unsigned long i; int sections_to_remove; + resource_size_t start, size; + int ret = 0; /* * We can only remove entire sections @@ -359,11 +701,19 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); + start = phys_start_pfn << PAGE_SHIFT; + size = nr_pages * PAGE_SIZE; + ret = release_mem_region_adjustable(&iomem_resource, start, size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } + sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - release_mem_region(pfn << PAGE_SHIFT, - PAGES_PER_SECTION << PAGE_SHIFT); ret = __remove_section(zone, __pfn_to_section(pfn)); if (ret) break; @@ -371,6 +721,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, return ret; } EXPORT_SYMBOL_GPL(__remove_pages); +#endif /* CONFIG_MEMORY_HOTREMOVE */ int set_online_page_callback(online_page_callback_t callback) { @@ -408,29 +759,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback); void __online_page_set_limits(struct page *page) { - unsigned long pfn = page_to_pfn(page); - - if (pfn >= num_physpages) - num_physpages = pfn + 1; } EXPORT_SYMBOL_GPL(__online_page_set_limits); void __online_page_increment_counters(struct page *page) { - totalram_pages++; - -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages++; -#endif + adjust_managed_page_count(page, 1); } EXPORT_SYMBOL_GPL(__online_page_increment_counters); void __online_page_free(struct page *page) { - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } EXPORT_SYMBOL_GPL(__online_page_free); @@ -457,9 +797,101 @@ static int online_pages_range(unsigned long 
start_pfn, unsigned long nr_pages, return 0; } +#ifdef CONFIG_MOVABLE_NODE +/* + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have + * normal memory. + */ +static bool can_online_high_movable(struct zone *zone) +{ + return true; +} +#else /* CONFIG_MOVABLE_NODE */ +/* ensure every online node has NORMAL memory */ +static bool can_online_high_movable(struct zone *zone) +{ + return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); +} +#endif /* CONFIG_MOVABLE_NODE */ -int __ref online_pages(unsigned long pfn, unsigned long nr_pages) +/* check which state of node_states will be changed when onlining memory */ +static void node_states_check_changes_online(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) { + int nid = zone_to_nid(zone); + enum zone_type zone_last = ZONE_NORMAL; + + /* + * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_NORMAL, + * set zone_last to ZONE_NORMAL. + * + * If we don't have HIGHMEM nor movable node, + * node_states[N_NORMAL_MEMORY] contains nodes which have zones of + * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + */ + if (N_MEMORY == N_NORMAL_MEMORY) + zone_last = ZONE_MOVABLE; + + /* + * if the memory to be onlined is in a zone of 0...zone_last, and + * the zones of 0...zone_last don't have memory before onlining, we + * will need to set the node to node_states[N_NORMAL_MEMORY] after + * the memory is online. + */ + if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) + arg->status_change_nid_normal = nid; + else + arg->status_change_nid_normal = -1; + +#ifdef CONFIG_HIGHMEM + /* + * If we have movable node, node_states[N_HIGH_MEMORY] + * contains nodes which have zones of 0...ZONE_HIGHMEM, + * set zone_last to ZONE_HIGHMEM. + * + * If we don't have movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_MOVABLE, + * set zone_last to ZONE_MOVABLE. + */ + zone_last = ZONE_HIGHMEM; + if (N_MEMORY == N_HIGH_MEMORY) + zone_last = ZONE_MOVABLE; + + if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) + arg->status_change_nid_high = nid; + else + arg->status_change_nid_high = -1; +#else + arg->status_change_nid_high = arg->status_change_nid_normal; +#endif + + /* + * if the node doesn't have memory before onlining, we will need to + * set the node to node_states[N_MEMORY] after the memory + * is online. + */ + if (!node_state(nid, N_MEMORY)) + arg->status_change_nid = nid; + else + arg->status_change_nid = -1; +} + +static void node_states_set_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_set_state(node, N_NORMAL_MEMORY); + + if (arg->status_change_nid_high >= 0) + node_set_state(node, N_HIGH_MEMORY); + + node_set_state(node, N_MEMORY); +} + + +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) +{ + unsigned long flags; unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; @@ -468,13 +900,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) struct memory_notify arg; lock_memory_hotplug(); + /* + * This doesn't need a lock to do pfn_to_page(). + * The section can't be removed here because of the + * memory_block->state_mutex.
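+ * (Our caller in the memory_block layer holds that mutex across the
+ * online operation, which is what pins the section here.)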
+ */ + zone = page_zone(pfn_to_page(pfn)); + + if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && + !can_online_high_movable(zone)) { + unlock_memory_hotplug(); + return -EINVAL; + } + + if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { + if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { + unlock_memory_hotplug(); + return -EINVAL; + } + } + if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { + if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { + unlock_memory_hotplug(); + return -EINVAL; + } + } + + /* Previous code may have changed the zone of the pfn range */ + zone = page_zone(pfn_to_page(pfn)); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; - arg.status_change_nid = -1; + node_states_check_changes_online(nr_pages, zone, &arg); - nid = page_to_nid(pfn_to_page(pfn)); - if (node_present_pages(nid) == 0) - arg.status_change_nid = nid; + nid = pfn_to_nid(pfn); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); @@ -484,46 +943,51 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) return ret; } /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); - /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. * So, zonelist must be updated after online. */ mutex_lock(&zonelists_mutex); - if (!populated_zone(zone)) + if (!populated_zone(zone)) { need_zonelists_rebuild = 1; + build_all_zonelists(NULL, zone); + } ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + if (need_zonelists_rebuild) + zone_pcp_reset(zone); mutex_unlock(&zonelists_mutex); - printk(KERN_DEBUG "online_pages %lx at %lx failed\n", - nr_pages, pfn); + printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", + (unsigned long long) pfn << PAGE_SHIFT, + (((unsigned long long) pfn + nr_pages) + << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); unlock_memory_hotplug(); return ret; } zone->present_pages += onlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages += onlined_pages; - if (need_zonelists_rebuild) - build_all_zonelists(zone); - else - zone_pcp_update(zone); + pgdat_resize_unlock(zone->zone_pgdat, &flags); + + if (onlined_pages) { + node_states_set_node(zone_to_nid(zone), &arg); + if (need_zonelists_rebuild) + build_all_zonelists(NULL, NULL); + else + zone_pcp_update(zone); + } mutex_unlock(&zonelists_mutex); init_per_zone_wmark_min(); - if (onlined_pages) { + if (onlined_pages) kswapd_run(zone_to_nid(zone)); - node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); - } vm_total_pages = nr_free_pagecache_pages(); @@ -545,11 +1009,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) unsigned long zholes_size[MAX_NR_ZONES] = {0}; unsigned long start_pfn = start >> PAGE_SHIFT; - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - return NULL; + pgdat = NODE_DATA(nid); + if (!pgdat) { + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) + return NULL; - arch_refresh_nodedata(nid, pgdat); + arch_refresh_nodedata(nid, pgdat); + } /* we can use NODE_DATA(nid) from here */ @@ -561,7 +1028,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) * to access not-initialized zonelist, build here.
*/ mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(pgdat, NULL); mutex_unlock(&zonelists_mutex); return pgdat; @@ -575,17 +1042,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } -/* +/** + * try_online_node - online a node if offlined + * * called by cpu_up() to online a node without onlined memory. */ -int mem_online_node(int nid) +int try_online_node(int nid) { pg_data_t *pgdat; int ret; + if (node_online(nid)) + return 0; + lock_memory_hotplug(); pgdat = hotadd_new_pgdat(nid, 0); if (!pgdat) { + pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; goto out; } @@ -593,32 +1066,65 @@ int mem_online_node(int nid) ret = register_one_node(nid); BUG_ON(ret); + if (pgdat->node_zonelists->_zonerefs->zone == NULL) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } + out: unlock_memory_hotplug(); return ret; } +static int check_hotplug_memory_range(u64 start, u64 size) +{ + u64 start_pfn = start >> PAGE_SHIFT; + u64 nr_pages = size >> PAGE_SHIFT; + + /* Memory range must be aligned with section */ + if ((start_pfn & ~PAGE_SECTION_MASK) || + (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { + pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", + (unsigned long long)start, + (unsigned long long)size); + return -EINVAL; + } + + return 0; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; - int new_pgdat = 0; + bool new_pgdat; + bool new_node; struct resource *res; int ret; - lock_memory_hotplug(); + ret = check_hotplug_memory_range(start, size); + if (ret) + return ret; res = register_memory_resource(start, size); ret = -EEXIST; if (!res) - goto out; + return ret; - if (!node_online(nid)) { + { /* Stupid hack to suppress address-never-null warning */ + void *p = NODE_DATA(nid); + new_pgdat = !p; + } + + lock_memory_hotplug(); + + new_node = !node_online(nid); + if (new_node) { pgdat = hotadd_new_pgdat(nid, start); ret = -ENOMEM; if (!pgdat) - goto out; - new_pgdat = 1; + goto error; } /* call arch's memory hotadd */ @@ -630,7 +1136,7 @@ int __ref add_memory(int nid, u64 start, u64 size) /* we online node here. we can't roll back from here. */ node_set_online(nid); - if (new_pgdat) { + if (new_node) { ret = register_one_node(nid); /* * If sysfs file of new node can't create, cpu on the node @@ -649,8 +1155,7 @@ error: /* rollback pgdat allocation and others */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); - if (res) - release_memory_resource(res); + release_memory_resource(res); out: unlock_memory_hotplug(); @@ -733,10 +1238,12 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) } /* - * Scanning pfn is much easier than scanning lru list. - * Scan pfn from start to end and Find LRU page. + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages + * and hugepages). We scan pfn because it's much easier than scanning over + * linked list. This function returns the pfn of the first found movable + * page if it's found, otherwise 0. 
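+ *
+ * Note that a huge page counts as movable only while it is active; the
+ * scan skips over any other huge page as one whole compound page.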
*/ -static unsigned long scan_lru_pages(unsigned long start, unsigned long end) +static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { unsigned long pfn; struct page *page; @@ -745,18 +1252,18 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) page = pfn_to_page(pfn); if (PageLRU(page)) return pfn; + if (PageHuge(page)) { + if (is_hugepage_active(page)) + return pfn; + else + pfn = round_up(pfn + 1, + 1 << compound_order(page)) - 1; + } } } return 0; } -static struct page * -hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) -{ - /* This should be improooooved!! */ - return alloc_page(GFP_HIGHUSER_MOVABLE); -} - #define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) @@ -772,6 +1279,19 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); + + if (PageHuge(page)) { + struct page *head = compound_head(page); + pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; + if (compound_order(head) > PFN_SECTION_SHIFT) { + ret = -EBUSY; + break; + } + if (isolate_huge_page(page, &source)) + move_pages -= 1 << compound_order(head); + continue; + } + if (!get_page_unless_zero(page)) continue; /* @@ -790,7 +1310,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) #ifdef CONFIG_DEBUG_VM printk(KERN_ALERT "removing pfn %lx from LRU failed\n", pfn); - dump_page(page); + dump_page(page, "failed to remove from LRU"); #endif put_page(page); /* Because we don't have big zone->lock. we should @@ -804,14 +1324,18 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } if (!list_empty(&source)) { if (not_managed) { - putback_lru_pages(&source); + putback_movable_pages(&source); goto out; } - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, - true, MIGRATE_SYNC); + + /* + * alloc_migrate_target should be improooooved!! + * migrate_pages returns # of failed pages. + */ + ret = migrate_pages(&source, alloc_migrate_target, 0, + MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) - putback_lru_pages(&source); + putback_movable_pages(&source); } out: return ret; @@ -844,7 +1368,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, { int ret; long offlined = *(long *)data; - ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); + ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); offlined = nr_pages; if (!ret) *(long *)data += offlined; @@ -864,16 +1388,173 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -static int __ref offline_pages(unsigned long start_pfn, +#ifdef CONFIG_MOVABLE_NODE +/* + * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have + * normal memory. 
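+ * (All of such a node's memory is then in ZONE_MOVABLE, which by
+ * construction holds nothing the kernel itself depends on.)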
+ */ +static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) +{ + return true; +} +#else /* CONFIG_MOVABLE_NODE */ +/* ensure the node has NORMAL memory if it is still online */ +static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt; + + for (zt = 0; zt <= ZONE_NORMAL; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + + if (present_pages > nr_pages) + return true; + + present_pages = 0; + for (; zt <= ZONE_MOVABLE; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + + /* + * we can't offline the last normal memory until all + * higher memory is offlined. + */ + return present_pages == 0; +} +#endif /* CONFIG_MOVABLE_NODE */ + +static int __init cmdline_parse_movable_node(char *p) +{ +#ifdef CONFIG_MOVABLE_NODE + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at a very early time. We + * cannot prevent this anyway. So on NUMA systems, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers one node could have double-digit + * gigabytes of memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try our best to keep + * the kernel away from hotpluggable memory. + */ + memblock_set_bottom_up(true); + movable_node_enabled = true; +#else + pr_warn("movable_node option not supported\n"); +#endif + return 0; +} +early_param("movable_node", cmdline_parse_movable_node); + +/* check which state of node_states will be changed when offlining memory */ +static void node_states_check_changes_offline(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt, zone_last = ZONE_NORMAL; + + /* + * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_NORMAL, + * set zone_last to ZONE_NORMAL. + * + * If we don't have HIGHMEM nor movable node, + * node_states[N_NORMAL_MEMORY] contains nodes which have zones of + * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + */ + if (N_MEMORY == N_NORMAL_MEMORY) + zone_last = ZONE_MOVABLE; + + /* + * check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offlined is in a zone of 0...zone_last, + * and it is the last present memory there, 0...zone_last will + * become empty after offlining, thus we can determine that we + * will need to clear the node from node_states[N_NORMAL_MEMORY]. + */ + for (zt = 0; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + arg->status_change_nid_normal = zone_to_nid(zone); + else + arg->status_change_nid_normal = -1; + +#ifdef CONFIG_HIGHMEM + /* + * If we have movable node, node_states[N_HIGH_MEMORY] + * contains nodes which have zones of 0...ZONE_HIGHMEM, + * set zone_last to ZONE_HIGHMEM.
+ *
+ * If we don't have movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_MOVABLE,
+ * set zone_last to ZONE_MOVABLE.
+ */
+ zone_last = ZONE_HIGHMEM;
+ if (N_MEMORY == N_HIGH_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ for (; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ arg->status_change_nid_high = zone_to_nid(zone);
+ else
+ arg->status_change_nid_high = -1;
+#else
+ arg->status_change_nid_high = arg->status_change_nid_normal;
+#endif
+
+ /*
+ * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
+ */
+ zone_last = ZONE_MOVABLE;
+
+ /*
+ * check whether node_states[N_HIGH_MEMORY] will be changed.
+ * If we try to offline the last present @nr_pages from the node,
+ * we can determine that we will need to clear the node from
+ * node_states[N_HIGH_MEMORY].
+ */
+ for (; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (nr_pages >= present_pages)
+ arg->status_change_nid = zone_to_nid(zone);
+ else
+ arg->status_change_nid = -1;
+}
+
+static void node_states_clear_node(int node, struct memory_notify *arg)
+{
+ if (arg->status_change_nid_normal >= 0)
+ node_clear_state(node, N_NORMAL_MEMORY);
+
+ if ((N_MEMORY != N_NORMAL_MEMORY) &&
+ (arg->status_change_nid_high >= 0))
+ node_clear_state(node, N_HIGH_MEMORY);
+
+ if ((N_MEMORY != N_HIGH_MEMORY) &&
+ (arg->status_change_nid >= 0))
+ node_clear_state(node, N_MEMORY);
+}
+
+static int __ref __offline_pages(unsigned long start_pfn,
 unsigned long end_pfn, unsigned long timeout)
 {
 unsigned long pfn, nr_pages, expire;
 long offlined_pages;
 int ret, drain, retry_max, node;
+ unsigned long flags;
 struct zone *zone;
 struct memory_notify arg;
- BUG_ON(start_pfn >= end_pfn);
 /* at least, alignment against pageblock is necessary */
 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
 return -EINVAL;
@@ -890,16 +1571,19 @@ static int __ref offline_pages(unsigned long start_pfn,
 node = zone_to_nid(zone);
 nr_pages = end_pfn - start_pfn;

+ ret = -EINVAL;
+ if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
+ goto out;
+
 /* set above range as isolated */
- ret = start_isolate_page_range(start_pfn, end_pfn);
+ ret = start_isolate_page_range(start_pfn, end_pfn,
+ MIGRATE_MOVABLE, true);
 if (ret)
 goto out;

 arg.start_pfn = start_pfn;
 arg.nr_pages = nr_pages;
- arg.status_change_nid = -1;
- if (nr_pages >= node_present_pages(node))
- arg.status_change_nid = node;
+ node_states_check_changes_offline(nr_pages, zone, &arg);

 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
 ret = notifier_to_errno(ret);
@@ -925,8 +1609,8 @@ repeat:
 drain_all_pages();
 }

- pfn = scan_lru_pages(start_pfn, end_pfn);
- if (pfn) { /* We have page on LRU */
+ pfn = scan_movable_pages(start_pfn, end_pfn);
+ if (pfn) { /* We have movable pages */
 ret = do_migrate_range(pfn, end_pfn);
 if (!ret) {
 drain = 1;
@@ -940,11 +1624,16 @@ repeat:
 goto repeat;
 }
 }
- /* drain all zone's lru pagevec, this is asyncronous... */
+ /* drain all zone's lru pagevec, this is asynchronous... */
 lru_add_drain_all();
 yield();
- /* drain pcp pages , this is synchrouns. */
+ /* drain pcp pages, this is synchronous. */
 drain_all_pages();
+ /*
+ * dissolve free hugepages in the memory block before actually
+ * offlining, in order to keep hugetlbfs's object counting consistent.
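
In practice this offline path is reached by writing to a memory block's state file in sysfs. A hedged userspace sketch; memory32 is a placeholder block name, and root privileges plus CONFIG_MEMORY_HOTREMOVE=y are assumed:

#include <stdio.h>

int main(void)
{
    /* pick a real block from /sys/devices/system/memory; 32 is invented */
    FILE *f = fopen("/sys/devices/system/memory/memory32/state", "w");

    if (!f) {
        perror("fopen");
        return 1;
    }
    /* the kernel then isolates, migrates and offlines the block's pages */
    if (fputs("offline", f) == EOF || fclose(f) != 0) {
        perror("offline");
        return 1;
    }
    return 0;
}
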
+ */ + dissolve_free_huge_pages(start_pfn, end_pfn); /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) { @@ -952,22 +1641,32 @@ repeat: goto failed_removal; } printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); - /* Ok, all of our target is islaoted. + /* Ok, all of our target is isolated. We cannot do rollback at this point. */ offline_isolated_pages(start_pfn, end_pfn); /* reset pagetype flags and makes migrate type to be MOVABLE */ - undo_isolate_page_range(start_pfn, end_pfn); + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ + adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); zone->present_pages -= offlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages -= offlined_pages; - totalram_pages -= offlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); init_per_zone_wmark_min(); - if (!node_present_pages(node)) { - node_clear_state(node, N_HIGH_MEMORY); + if (!populated_zone(zone)) { + zone_pcp_reset(zone); + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } else + zone_pcp_update(zone); + + node_states_clear_node(node, &arg); + if (arg.status_change_nid >= 0) kswapd_stop(node); - } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); @@ -977,29 +1676,240 @@ repeat: return 0; failed_removal: - printk(KERN_INFO "memory offlining %lx to %lx failed\n", - start_pfn, end_pfn); + printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ - undo_isolate_page_range(start_pfn, end_pfn); + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); out: unlock_memory_hotplug(); return ret; } -int remove_memory(u64 start, u64 size) +int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +/** + * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) + * @start_pfn: start pfn of the memory range + * @end_pfn: end pfn of the memory range + * @arg: argument passed to func + * @func: callback for each memory section walked + * + * This function walks through all present mem sections in range + * [start_pfn, end_pfn) and call func on each mem section. + * + * Returns the return value of func. + */ +int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, + void *arg, int (*func)(struct memory_block *, void *)) +{ + struct memory_block *mem = NULL; + struct mem_section *section; + unsigned long pfn, section_nr; + int ret; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + section_nr = pfn_to_section_nr(pfn); + if (!present_section_nr(section_nr)) + continue; + + section = __nr_to_section(section_nr); + /* same memblock? 
*/ + if (mem) + if ((section_nr >= mem->start_section_nr) && + (section_nr <= mem->end_section_nr)) + continue; + + mem = find_memory_block_hinted(section, mem); + if (!mem) + continue; + + ret = func(mem, arg); + if (ret) { + kobject_put(&mem->dev.kobj); + return ret; + } + } + + if (mem) + kobject_put(&mem->dev.kobj); + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) +{ + int ret = !is_memblock_offlined(mem); + + if (unlikely(ret)) { + phys_addr_t beginpa, endpa; + + beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); + endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; + pr_warn("removing memory fails, because memory " + "[%pa-%pa] is onlined\n", + &beginpa, &endpa); + } + + return ret; +} + +static int check_cpu_on_node(pg_data_t *pgdat) { - unsigned long start_pfn, end_pfn; + int cpu; + + for_each_present_cpu(cpu) { + if (cpu_to_node(cpu) == pgdat->node_id) + /* + * the cpu on this node isn't removed, and we can't + * offline this node. + */ + return -EBUSY; + } - start_pfn = PFN_DOWN(start); - end_pfn = start_pfn + PFN_DOWN(size); - return offline_pages(start_pfn, end_pfn, 120 * HZ); + return 0; } -#else -int remove_memory(u64 start, u64 size) + +static void unmap_cpu_on_node(pg_data_t *pgdat) { - return -EINVAL; +#ifdef CONFIG_ACPI_NUMA + int cpu; + + for_each_possible_cpu(cpu) + if (cpu_to_node(cpu) == pgdat->node_id) + numa_clear_node(cpu); +#endif +} + +static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) +{ + int ret; + + ret = check_cpu_on_node(pgdat); + if (ret) + return ret; + + /* + * the node will be offlined when we come here, so we can clear + * the cpu_to_node() now. + */ + + unmap_cpu_on_node(pgdat); + return 0; +} + +/** + * try_offline_node + * + * Offline a node if all memory sections and cpus of the node are removed. + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call. + */ +void try_offline_node(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = pgdat->node_start_pfn; + unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; + unsigned long pfn; + struct page *pgdat_page = virt_to_page(pgdat); + int i; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + + if (!present_section_nr(section_nr)) + continue; + + if (pfn_to_nid(pfn) != nid) + continue; + + /* + * some memory sections of this node are not removed, and we + * can't offline node now. + */ + return; + } + + if (check_and_unmap_cpu_on_node(pgdat)) + return; + + /* + * all memory/cpu of this node are removed, we can offline this + * node now. + */ + node_set_offline(nid); + unregister_one_node(nid); + + if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) + /* node data is allocated from boot memory */ + return; + + /* free waittable in each zone */ + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + + /* + * wait_table may be allocated from boot memory, + * here only free if it's allocated by vmalloc. + */ + if (is_vmalloc_addr(zone->wait_table)) + vfree(zone->wait_table); + } + + /* + * Since there is no way to guarentee the address of pgdat/zone is not + * on stack of any kernel threads or used by other kernel objects + * without reference counting or other symchronizing method, do not + * reset node_data and free pgdat here. 
Just reset it to 0 and reuse + * the memory when the node is online again. + */ + memset(pgdat, 0, sizeof(*pgdat)); +} +EXPORT_SYMBOL(try_offline_node); + +/** + * remove_memory + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call, as required by + * try_offline_node(). + */ +void __ref remove_memory(int nid, u64 start, u64 size) +{ + int ret; + + BUG_ON(check_hotplug_memory_range(start, size)); + + lock_memory_hotplug(); + + /* + * All memory blocks must be offlined before removing memory. Check + * whether all memory blocks in question are offline and trigger a BUG() + * if this is not the case. + */ + ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, + check_memblock_offlined_cb); + if (ret) { + unlock_memory_hotplug(); + BUG(); + } + + /* remove memmap entry */ + firmware_map_remove(start, start + size, "System RAM"); + + arch_remove_memory(start, size); + + try_offline_node(nid); + + unlock_memory_hotplug(); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ EXPORT_SYMBOL_GPL(remove_memory); +#endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b19569137529..30cc47f8ffa0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -26,7 +26,7 @@ * the allocation to memory nodes instead * * preferred Try a specific node first before normal fallback. - * As a special case node -1 here means do the allocation + * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. @@ -90,6 +90,7 @@ #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> +#include <linux/mmu_notifier.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -117,6 +118,29 @@ static struct mempolicy default_policy = { .flags = MPOL_F_LOCAL, }; +static struct mempolicy preferred_node_policy[MAX_NUMNODES]; + +static struct mempolicy *get_task_policy(struct task_struct *p) +{ + struct mempolicy *pol = p->mempolicy; + + if (!pol) { + int node = numa_node_id(); + + if (node != NUMA_NO_NODE) { + pol = &preferred_node_policy[node]; + /* + * preferred_node_policy is not initialised early in + * boot + */ + if (!pol->mode) + pol = NULL; + } + } + + return pol; +} + static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); /* @@ -140,19 +164,7 @@ static const struct mempolicy_operations { /* Check that the nodemask contains at least one populated zone */ static int is_valid_nodemask(const nodemask_t *nodemask) { - int nd, k; - - for_each_node_mask(nd, *nodemask) { - struct zone *z; - - for (k = 0; k <= policy_zone; k++) { - z = &NODE_DATA(nd)->node_zones[k]; - if (z->present_pages > 0) - return 1; - } - } - - return 0; + return nodes_intersects(*nodemask, node_states[N_MEMORY]); } static inline int mpol_store_user_nodemask(const struct mempolicy *pol) @@ -212,9 +224,9 @@ static int mpol_set_nodemask(struct mempolicy *pol, /* if mode is MPOL_DEFAULT, pol is NULL. This is right. 
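
The task policy that get_task_policy() consults is installed from userspace with set_mempolicy(2). A minimal sketch, assuming node 0 is online; glibc ships no wrapper (libnuma's set_mempolicy() is this syscall), so the raw syscall is used:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define MPOL_PREFERRED 1    /* from <linux/mempolicy.h> */

int main(void)
{
    unsigned long nodemask = 1UL << 0;  /* node 0 */

    if (syscall(SYS_set_mempolicy, MPOL_PREFERRED, &nodemask,
                8 * sizeof(nodemask) + 1) != 0) {
        perror("set_mempolicy");
        return 1;
    }
    puts("future allocations prefer node 0");
    return 0;
}
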
*/ if (pol == NULL) return 0; - /* Check N_HIGH_MEMORY */ + /* Check N_MEMORY */ nodes_and(nsc->mask1, - cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); + cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) @@ -249,12 +261,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, struct mempolicy *policy; pr_debug("setting mode %d flags %d nodes[0] %lx\n", - mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); + mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); - return NULL; /* simply delete any existing policy */ + return NULL; } VM_BUG_ON(!nodes); @@ -269,6 +281,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); } + } else if (mode == MPOL_LOCAL) { + if (!nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -390,7 +406,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && step == 0 && + if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; @@ -460,8 +476,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); -/* Scan through pages checking if pages follow certain conditions. */ -static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +/* + * Scan through pages checking if pages follow certain conditions, + * and move them to the pagelist if they do. + */ +static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -483,9 +502,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* * vm_normal_page() filters out zero pages, but there might * still be PageReserved pages to skip, perhaps in a VDSO. - * And we cannot move PageKsm pages sensibly or safely yet. */ - if (PageReserved(page) || PageKsm(page)) + if (PageReserved(page)) continue; nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) @@ -500,7 +518,36 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, return addr != end; } -static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, + pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, + void *private) +{ +#ifdef CONFIG_HUGETLB_PAGE + int nid; + struct page *page; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); + entry = huge_ptep_get((pte_t *)pmd); + if (!pte_present(entry)) + goto unlock; + page = pte_page(entry); + nid = page_to_nid(page); + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + goto unlock; + /* With MPOL_MF_MOVE, we migrate only unshared hugepage. 
*/ + if (flags & (MPOL_MF_MOVE_ALL) || + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) + isolate_huge_page(page, private); +unlock: + spin_unlock(ptl); +#else + BUG(); +#endif +} + +static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -511,17 +558,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); + if (!pmd_present(*pmd)) + continue; + if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { + queue_pages_hugetlb_pmd_range(vma, pmd, nodes, + flags, private); + continue; + } + split_huge_page_pmd(vma, addr, pmd); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - if (check_pte_range(vma, pmd, addr, next, nodes, + if (queue_pages_pte_range(vma, pmd, addr, next, nodes, flags, private)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } -static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -532,16 +586,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) + continue; if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(vma, pud, addr, next, nodes, + if (queue_pages_pmd_range(vma, pud, addr, next, nodes, flags, private)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } -static inline int check_pgd_range(struct vm_area_struct *vma, +static inline int queue_pages_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -554,20 +610,51 @@ static inline int check_pgd_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(vma, pgd, addr, next, nodes, + if (queue_pages_pud_range(vma, pgd, addr, next, nodes, flags, private)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; } +#ifdef CONFIG_NUMA_BALANCING /* - * Check if all pages in a range are on a set of nodes. - * If pagelist != NULL then isolate pages from the LRU and - * put them on the pagelist. + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core. + */ +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + int nr_updated; + + nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + if (nr_updated) + count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + + return nr_updated; +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + +/* + * Walk through page tables and collect pages to be migrated. 
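
This page-queueing machinery is driven by the mbind(2) syscall. A minimal userspace sketch that binds an anonymous mapping to node 0 and asks the kernel to migrate the pages already faulted in (assumes a NUMA kernel with node 0 online):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#define MPOL_BIND    2          /* from <linux/mempolicy.h> */
#define MPOL_MF_MOVE (1 << 1)

int main(void)
{
    size_t len = 4 << 20;
    unsigned long nodemask = 1UL << 0;
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(p, 0, len);          /* fault the pages in somewhere */

    /* the range walk collects these pages; migrate_pages() moves them */
    if (syscall(SYS_mbind, p, len, MPOL_BIND, &nodemask,
                8 * sizeof(nodemask) + 1, MPOL_MF_MOVE) != 0)
        perror("mbind");
    return 0;
}
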
+ * + * If pages found in a given range are on a set of nodes (determined by + * @nodes and @flags,) it's isolated and queued to the pagelist which is + * passed via @private.) */ static struct vm_area_struct * -check_range(struct mm_struct *mm, unsigned long start, unsigned long end, +queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) { int err; @@ -579,52 +666,75 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + unsigned long endvma = vma->vm_end; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) return ERR_PTR(-EFAULT); if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT); } - if (!is_vm_hugetlb_page(vma) && - ((flags & MPOL_MF_STRICT) || + + if (flags & MPOL_MF_LAZY) { + change_prot_numa(vma, start, endvma); + goto next; + } + + if ((flags & MPOL_MF_STRICT) || ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma)))) { - unsigned long endvma = vma->vm_end; - - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; - err = check_pgd_range(vma, start, endvma, nodes, + vma_migratable(vma))) { + + err = queue_pages_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { first = ERR_PTR(err); break; } } +next: prev = vma; } return first; } -/* Apply policy to a single VMA */ -static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +/* + * Apply policy to a single VMA + * This must be called with the mmap_sem held for writing. + */ +static int vma_replace_policy(struct vm_area_struct *vma, + struct mempolicy *pol) { - int err = 0; - struct mempolicy *old = vma->vm_policy; + int err; + struct mempolicy *old; + struct mempolicy *new; pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_ops, vma->vm_file, vma->vm_ops ? vma->vm_ops->set_policy : NULL); - if (vma->vm_ops && vma->vm_ops->set_policy) + new = mpol_dup(pol); + if (IS_ERR(new)) + return PTR_ERR(new); + + if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); - if (!err) { - mpol_get(new); - vma->vm_policy = new; - mpol_put(old); + if (err) + goto err_out; } + + old = vma->vm_policy; + vma->vm_policy = new; /* protected by mmap_sem */ + mpol_put(old); + + return 0; + err_out: + mpol_put(new); return err; } @@ -664,7 +774,10 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (prev) { vma = prev; next = vma->vm_next; - continue; + if (mpol_equal(vma_policy(vma), new_pol)) + continue; + /* vma_merge() joined vma && vma->next, case 8 */ + goto replace; } if (vma->vm_start != vmstart) { err = split_vma(vma->vm_mm, vma, vmstart, 1); @@ -676,7 +789,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } - err = policy_vma(vma, new_pol); + replace: + err = vma_replace_policy(vma, new_pol); if (err) goto out; } @@ -685,36 +799,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, return err; } -/* - * Update task->flags PF_MEMPOLICY bit: set iff non-default - * mempolicy. Allows more rapid checking of this (combined perhaps - * with other PF_* flag bits) on memory allocation hot code paths. 
- * - * If called from outside this file, the task 'p' should -only- be - * a newly forked child not yet visible on the task list, because - * manipulating the task flags of a visible task is not safe. - * - * The above limitation is why this routine has the funny name - * mpol_fix_fork_child_flag(). - * - * It is also safe to call this with a task pointer of current, - * which the static wrapper mpol_set_task_struct_flag() does, - * for use within this file. - */ - -void mpol_fix_fork_child_flag(struct task_struct *p) -{ - if (p->mempolicy) - p->flags |= PF_MEMPOLICY; - else - p->flags &= ~PF_MEMPOLICY; -} - -static void mpol_set_task_struct_flag(void) -{ - mpol_fix_fork_child_flag(current); -} - /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) @@ -751,7 +835,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); @@ -918,7 +1001,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(page)) + return alloc_huge_page_node(page_hstate(compound_head(page)), + node); + else + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -931,21 +1018,24 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; - struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + /* + * This does not "check" the range but isolates all pages that + * need migration. Between passing in the full user address + * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + */ + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); - if (IS_ERR(vma)) - return PTR_ERR(vma); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, MIGRATE_SYNC); + MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } return err; @@ -957,8 +1047,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * * Returns the number of page that could not be moved. */ -int do_migrate_pages(struct mm_struct *mm, - const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, int flags) { int busy = 0; int err; @@ -970,7 +1060,7 @@ int do_migrate_pages(struct mm_struct *mm, down_read(&mm->mmap_sem); - err = migrate_vmas(mm, from_nodes, to_nodes, flags); + err = migrate_vmas(mm, from, to, flags); if (err) goto out; @@ -1005,14 +1095,34 @@ int do_migrate_pages(struct mm_struct *mm, * moved to an empty node, then there is nothing left worth migrating. 
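
do_migrate_pages() backs the migrate_pages(2) syscall. A minimal userspace sketch that moves the calling process's pages from node 0 to node 1 (assumes both nodes are online):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
    unsigned long from = 1UL << 0, to = 1UL << 1;
    long left = syscall(SYS_migrate_pages, getpid(),
                        8 * sizeof(unsigned long) + 1, &from, &to);

    if (left < 0)
        perror("migrate_pages");
    else
        printf("%ld pages could not be moved\n", left);
    return 0;
}
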
*/ - tmp = *from_nodes; + tmp = *from; while (!nodes_empty(tmp)) { int s,d; - int source = -1; + int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { - d = node_remap(s, *from_nodes, *to_nodes); + + /* + * do_migrate_pages() tries to maintain the relative + * node relationship of the pages established between + * threads and memory areas. + * + * However if the number of source nodes is not equal to + * the number of destination nodes we can not preserve + * this node relative relationship. In that case, skip + * copying memory from a node that is in the destination + * mask. + * + * Example: [2,3,4] -> [3,4,5] moves everything. + * [0-7] - > [3,4,5] moves only 0,1,2,6,7. + */ + + if ((nodes_weight(*from) != nodes_weight(*to)) && + (node_isset(s, *to))) + continue; + + d = node_remap(s, *from, *to); if (s == d) continue; @@ -1023,7 +1133,7 @@ int do_migrate_pages(struct mm_struct *mm, if (!node_isset(dest, tmp)) break; } - if (source == -1) + if (source == NUMA_NO_NODE) break; node_clear(source, tmp); @@ -1060,6 +1170,10 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * vma = vma->vm_next; } + if (PageHuge(page)) { + BUG_ON(!vma); + return alloc_huge_page_noerr(vma, address, 1); + } /* * if !vma, alloc_page_vma() will use task or system default policy */ @@ -1072,8 +1186,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, { } -int do_migrate_pages(struct mm_struct *mm, - const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, int flags) { return -ENOSYS; } @@ -1095,8 +1209,7 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if (flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -1119,6 +1232,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); + if (flags & MPOL_MF_LAZY) + new->flags |= MPOL_F_MOF; + /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1128,7 +1244,7 @@ static long do_mbind(unsigned long start, unsigned long len, pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", start, start + len, mode, mode_flags, - nmask ? nodes_addr(*nmask)[0] : -1); + nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { @@ -1152,27 +1268,29 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) goto mpol_out; - vma = check_range(mm, start, end, nmask, + vma = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - + err = PTR_ERR(vma); /* maybe ... 
*/ + if (!IS_ERR(vma)) err = mbind_range(mm, start, end, new); + if (!err) { + int nr_failed = 0; + if (!list_empty(&pagelist)) { + WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, - false, true); + (unsigned long)vma, + MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); up_write(&mm->mmap_sem); mpol_out: @@ -1334,8 +1452,8 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, * userid as the target process. */ tcred = __task_cred(task); - if (cred->euid != tcred->suid && cred->euid != tcred->uid && - cred->uid != tcred->suid && cred->uid != tcred->uid && + if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && !capable(CAP_SYS_NICE)) { rcu_read_unlock(); err = -EPERM; @@ -1350,7 +1468,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out_put; } - if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { + if (!nodes_subset(*new, node_states[N_MEMORY])) { err = -EINVAL; goto out_put; } @@ -1411,10 +1529,10 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, + compat_ulong_t, addr, compat_ulong_t, flags) { long err; unsigned long __user *nm = NULL; @@ -1441,8 +1559,8 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, return err; } -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode) +COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode) { long err = 0; unsigned long __user *nm = NULL; @@ -1464,9 +1582,9 @@ asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, return sys_set_mempolicy(mode, nm, nr_bits+1); } -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, + compat_ulong_t, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, compat_ulong_t, flags) { long err = 0; unsigned long __user *nm = NULL; @@ -1498,9 +1616,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, * * Returns effective policy for a VMA at specified address. * Falls back to @task or system default policy, as necessary. - * Current or other task's task mempolicy and non-shared vma policies - * are protected by the task's mmap_sem, which must be held for read by - * the caller. + * Current or other task's task mempolicy and non-shared vma policies must be + * protected by task_lock(task) by the caller. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. 
It is the caller's responsibility to free the
@@ -1509,7 +1626,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 struct mempolicy *get_vma_policy(struct task_struct *task,
 struct vm_area_struct *vma, unsigned long addr)
 {
- struct mempolicy *pol = task->mempolicy;
+ struct mempolicy *pol = get_task_policy(task);

 if (vma) {
 if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1517,14 +1634,68 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
 addr);
 if (vpol)
 pol = vpol;
- } else if (vma->vm_policy)
+ } else if (vma->vm_policy) {
 pol = vma->vm_policy;
+
+ /*
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+ * count on these policies which will be dropped by
+ * mpol_cond_put() later
+ */
+ if (mpol_needs_cond_ref(pol))
+ mpol_get(pol);
+ }
 }
 if (!pol)
 pol = &default_policy;
 return pol;
 }
+
+bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
+{
+ struct mempolicy *pol = get_task_policy(task);
+ if (vma) {
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ bool ret = false;
+
+ pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+ if (pol && (pol->flags & MPOL_F_MOF))
+ ret = true;
+ mpol_cond_put(pol);
+
+ return ret;
+ } else if (vma->vm_policy) {
+ pol = vma->vm_policy;
+ }
+ }
+
+ if (!pol)
+ return default_policy.flags & MPOL_F_MOF;
+
+ return pol->flags & MPOL_F_MOF;
+}
+
+static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+{
+ enum zone_type dynamic_policy_zone = policy_zone;
+
+ BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
+
+ /*
+ * if policy->v.nodes has movable memory only,
+ * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
+ *
+ * policy->v.nodes intersects with node_states[N_MEMORY],
+ * so if the following test fails, it implies
+ * policy->v.nodes has movable memory only.
+ */
+ if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+ dynamic_policy_zone = ZONE_MOVABLE;
+
+ return zone >= dynamic_policy_zone;
+}
+
 /*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
@@ -1533,7 +1704,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 /* Lower zones don't get a nodemask applied for MPOL_BIND */
 if (unlikely(policy->mode == MPOL_BIND) &&
- gfp_zone(gfp) >= policy_zone &&
+ apply_policy_zone(policy, gfp_zone(gfp)) &&
 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
 return &policy->v.nodes;

@@ -1584,15 +1755,18 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 /*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
- * @policy must be protected by freeing by the caller. If @policy is
- * the current task's mempolicy, this protection is implicit, as only the
- * task can change it's policy. The system default policy requires no
- * such protection.
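
The interleave helpers referenced in the hunks above map an interleave index onto the n-th set node of the policy nodemask. A toy sketch of that selection, with an invented three-node mask:

#include <stdio.h>

/* toy version of the interleave lookup: the n-th set node in the mask */
static int nth_set_node(unsigned long mask, unsigned int target)
{
    int nid;

    for (nid = 0; nid < 64; nid++)
        if (((mask >> nid) & 1) && target-- == 0)
            return nid;
    return -1;
}

int main(void)
{
    unsigned long mask = 0xd;   /* invented policy mask: nodes 0, 2, 3 */
    unsigned long pgoff = 5;    /* page offset within the mapping */
    unsigned int nnodes = 3;    /* number of set nodes in the mask */

    printf("pgoff %lu -> node %d\n", pgoff, nth_set_node(mask, pgoff % nnodes));
    return 0;
}

Offset 5 modulo three nodes selects the third set node, node 3, so pages cycle deterministically across the mask.
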
*/ -unsigned slab_node(struct mempolicy *policy) +unsigned int mempolicy_slab_node(void) { + struct mempolicy *policy; + int node = numa_mem_id(); + + if (in_interrupt()) + return node; + + policy = current->mempolicy; if (!policy || policy->flags & MPOL_F_LOCAL) - return numa_node_id(); + return node; switch (policy->mode) { case MPOL_PREFERRED: @@ -1612,11 +1786,11 @@ unsigned slab_node(struct mempolicy *policy) struct zonelist *zonelist; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; + zonelist = &NODE_DATA(node)->node_zonelists[0]; (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone ? zone->node : numa_node_id(); + return zone ? zone->node : node; } default: @@ -1631,7 +1805,7 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; int c; - int nid = -1; + int nid = NUMA_NO_NODE; if (!nnodes) return numa_node_id(); @@ -1668,11 +1842,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol, /* * Return the bit number of a random bit set in the nodemask. - * (returns -1 if nodemask is empty) + * (returns NUMA_NO_NODE if nodemask is empty) */ int node_random(const nodemask_t *maskp) { - int w, bit = -1; + int w, bit = NUMA_NO_NODE; w = nodes_weight(*maskp); if (w) @@ -1695,7 +1869,7 @@ int node_random(const nodemask_t *maskp) * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * - * Must be protected by get_mems_allowed() + * Must be protected by read_mems_allowed_begin() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -1854,13 +2028,12 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node) { struct mempolicy *pol; - struct zonelist *zl; struct page *page; unsigned int cpuset_mems_cookie; retry_cpuset: pol = get_vma_policy(current, vma, addr); - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -1868,29 +2041,17 @@ retry_cpuset: nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } - zl = policy_zonelist(gfp, pol, node); - if (unlikely(mpol_needs_cond_ref(pol))) { - /* - * slow path: ref counted shared policy - */ - struct page *page = __alloc_pages_nodemask(gfp, order, - zl, policy_nodemask(gfp, pol)); - __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) - goto retry_cpuset; - return page; - } - /* - * fast path: default or task policy - */ - page = __alloc_pages_nodemask(gfp, order, zl, + page = __alloc_pages_nodemask(gfp, order, + policy_zonelist(gfp, pol, node), policy_nodemask(gfp, pol)); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(mpol_needs_cond_ref(pol))) + __mpol_put(pol); + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } @@ -1916,7 +2077,7 @@ retry_cpuset: */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = get_task_policy(current); struct page *page; unsigned int 
cpuset_mems_cookie; @@ -1924,7 +2085,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) pol = &default_policy; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* * No reference counting needed for current->mempolicy @@ -1937,13 +2098,23 @@ retry_cpuset: policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } EXPORT_SYMBOL(alloc_pages_current); +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + struct mempolicy *pol = mpol_dup(vma_policy(src)); + + if (IS_ERR(pol)) + return PTR_ERR(pol); + dst->vm_policy = pol; + return 0; +} + /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() @@ -1984,28 +2155,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) return new; } -/* - * If *frompol needs [has] an extra ref, copy *frompol to *tompol , - * eliminate the * MPOL_F_* flags that require conditional ref and - * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly - * after return. Use the returned value. - * - * Allows use of a mempolicy for, e.g., multiple allocations with a single - * policy lookup, even if the policy needs/has extra ref on lookup. - * shmem_readahead needs this. - */ -struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, - struct mempolicy *frompol) -{ - if (!mpol_needs_cond_ref(frompol)) - return frompol; - - *tompol = *frompol; - tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ - __mpol_put(frompol); - return tompol; -} - /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { @@ -2116,26 +2265,132 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) return pol; } +static void sp_free(struct sp_node *n) +{ + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); +} + +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page - page to be checked + * @vma - vm area where page mapped + * @addr - virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. + * + * Returns: + * -1 - not misplaced, page is in the right node + * node - node id where the page should be + * + * Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. + */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol; + struct zone *zone; + int curnid = page_to_nid(page); + unsigned long pgoff; + int thiscpu = raw_smp_processor_id(); + int thisnid = cpu_to_node(thiscpu); + int polnid = -1; + int ret = -1; + + BUG_ON(!vma); + + pol = get_vma_policy(current, vma, addr); + if (!(pol->flags & MPOL_F_MOF)) + goto out; + + switch (pol->mode) { + case MPOL_INTERLEAVE: + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); + + pgoff = vma->vm_pgoff; + pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + polnid = offset_il_node(pol, vma, pgoff); + break; + + case MPOL_PREFERRED: + if (pol->flags & MPOL_F_LOCAL) + polnid = numa_node_id(); + else + polnid = pol->v.preferred_node; + break; + + case MPOL_BIND: + /* + * allows binding to multiple nodes. 
+ * use current page if in policy nodemask, + * else select nearest allowed node, if any. + * If no allowed nodes, use current [!misplaced]. + */ + if (node_isset(curnid, pol->v.nodes)) + goto out; + (void)first_zones_zonelist( + node_zonelist(numa_node_id(), GFP_HIGHUSER), + gfp_zone(GFP_HIGHUSER), + &pol->v.nodes, &zone); + polnid = zone->node; + break; + + default: + BUG(); + } + + /* Migrate the page towards the node whose CPU is referencing it */ + if (pol->flags & MPOL_F_MORON) { + polnid = thisnid; + + if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) + goto out; + } + + if (curnid != polnid) + ret = polnid; +out: + mpol_cond_put(pol); + + return ret; +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_free(n); +} + +static void sp_node_init(struct sp_node *node, unsigned long start, + unsigned long end, struct mempolicy *pol) +{ + node->start = start; + node->end = end; + node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { - struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + struct sp_node *n; + struct mempolicy *newpol; + n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; - n->start = start; - n->end = end; - mpol_get(pol); - pol->flags |= MPOL_F_SHARED; /* for unref */ - n->policy = pol; + + newpol = mpol_dup(pol); + if (IS_ERR(newpol)) { + kmem_cache_free(sn_cache, n); + return NULL; + } + newpol->flags |= MPOL_F_SHARED; + sp_node_init(n, start, end, newpol); + return n; } @@ -2143,7 +2398,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { - struct sp_node *n, *new2 = NULL; + struct sp_node *n; + struct sp_node *n_new = NULL; + struct mempolicy *mpol_new = NULL; + int ret = 0; restart: spin_lock(&sp->lock); @@ -2159,16 +2417,16 @@ restart: } else { /* Old policy spanning whole new range. */ if (n->end > end) { - if (!new2) { - spin_unlock(&sp->lock); - new2 = sp_alloc(end, n->end, n->policy); - if (!new2) - return -ENOMEM; - goto restart; - } + if (!n_new) + goto alloc_new; + + *mpol_new = *n->policy; + atomic_set(&mpol_new->refcnt, 1); + sp_node_init(n_new, end, n->end, mpol_new); n->end = start; - sp_insert(sp, new2); - new2 = NULL; + sp_insert(sp, n_new); + n_new = NULL; + mpol_new = NULL; break; } else n->end = start; @@ -2180,11 +2438,26 @@ restart: if (new) sp_insert(sp, new); spin_unlock(&sp->lock); - if (new2) { - mpol_put(new2->policy); - kmem_cache_free(sn_cache, new2); - } - return 0; + ret = 0; + +err_out: + if (mpol_new) + mpol_put(mpol_new); + if (n_new) + kmem_cache_free(sn_cache, n_new); + + return ret; + +alloc_new: + spin_unlock(&sp->lock); + ret = -ENOMEM; + n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); + if (!n_new) + goto err_out; + mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!mpol_new) + goto err_out; + goto restart; } /** @@ -2247,7 +2520,7 @@ int mpol_set_shared_policy(struct shared_policy *info, vma->vm_pgoff, sz, npol ? npol->mode : -1, npol ? npol->flags : -1, - npol ? nodes_addr(npol->v.nodes)[0] : -1); + npol ? 
nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -2256,7 +2529,7 @@ int mpol_set_shared_policy(struct shared_policy *info, } err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); if (err && new) - kmem_cache_free(sn_cache, new); + sp_free(new); return err; } @@ -2273,13 +2546,60 @@ void mpol_free_shared_policy(struct shared_policy *p) while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - rb_erase(&n->nd, &p->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_delete(p, n); } spin_unlock(&p->lock); } +#ifdef CONFIG_NUMA_BALANCING +static int __initdata numabalancing_override; + +static void __init check_numabalancing_enable(void) +{ + bool numabalancing_default = false; + + if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) + numabalancing_default = true; + + /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ + if (numabalancing_override) + set_numabalancing_state(numabalancing_override == 1); + + if (nr_node_ids > 1 && !numabalancing_override) { + pr_info("%s automatic NUMA balancing. " + "Configure with numa_balancing= or the " + "kernel.numa_balancing sysctl", + numabalancing_default ? "Enabling" : "Disabling"); + set_numabalancing_state(numabalancing_default); + } +} + +static int __init setup_numabalancing(char *str) +{ + int ret = 0; + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + numabalancing_override = 1; + ret = 1; + } else if (!strcmp(str, "disable")) { + numabalancing_override = -1; + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse numa_balancing=\n"); + + return ret; +} +__setup("numa_balancing=", setup_numabalancing); +#else +static inline void __init check_numabalancing_enable(void) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { @@ -2295,13 +2615,22 @@ void __init numa_policy_init(void) sizeof(struct sp_node), 0, SLAB_PANIC, NULL); + for_each_node(nid) { + preferred_node_policy[nid] = (struct mempolicy) { + .refcnt = ATOMIC_INIT(1), + .mode = MPOL_PREFERRED, + .flags = MPOL_F_MOF | MPOL_F_MORON, + .v = { .preferred_node = nid, }, + }; + } + /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ @@ -2321,6 +2650,8 @@ void __init numa_policy_init(void) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); + + check_numabalancing_enable(); } /* Reset policy of current process to default */ @@ -2334,44 +2665,34 @@ void numa_default_policy(void) */ /* - * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag - * Used only for mpol_parse_str() and mpol_to_str() + * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. */ -#define MPOL_LOCAL MPOL_MAX static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", - [MPOL_LOCAL] = "local" + [MPOL_LOCAL] = "local", }; #ifdef CONFIG_TMPFS /** - * mpol_parse_str - parse string to mempolicy + * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 
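
The numa_balancing= boot option parsed above has a runtime twin in the kernel.numa_balancing sysctl. A minimal userspace sketch that enables it (assumes CONFIG_NUMA_BALANCING=y and root):

#include <stdio.h>

int main(void)
{
    /* the same switch the boot option sets at startup */
    FILE *f = fopen("/proc/sys/kernel/numa_balancing", "w");

    if (!f) {
        perror("fopen");    /* kernel likely lacks CONFIG_NUMA_BALANCING */
        return 1;
    }
    fputs("1", f);          /* "0" disables, like numa_balancing=disable */
    fclose(f);
    return 0;
}
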
* @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. - * @no_context: flag whether to "contextualize" the mempolicy * * Format of input: * <mode>[=<flags>][:<nodelist>] * - * if @no_context is true, save the input nodemask in w.user_nodemask in - * the returned mempolicy. This will be used to "clone" the mempolicy in - * a specific context [cpuset] at a later time. Used to parse tmpfs mpol - * mount option. Note that if 'static' or 'relative' mode flags were - * specified, the input nodemask will already have been saved. Saving - * it again is redundant, but safe. - * * On success, returns 0, else 1 */ -int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) +int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode; - unsigned short uninitialized_var(mode_flags); + unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); @@ -2382,7 +2703,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; - if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) + if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); @@ -2390,12 +2711,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode <= MPOL_LOCAL; mode++) { + for (mode = 0; mode < MPOL_MAX; mode++) { if (!strcmp(str, policy_modes[mode])) { break; } } - if (mode > MPOL_LOCAL) + if (mode >= MPOL_MAX) goto out; switch (mode) { @@ -2416,7 +2737,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) * Default to online nodes with memory if no nodelist */ if (!nodelist) - nodes = node_states[N_HIGH_MEMORY]; + nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* @@ -2459,24 +2780,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (IS_ERR(new)) goto out; - if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; - } else { - int ret; - NODEMASK_SCRATCH(scratch); - if (scratch) { - task_lock(current); - ret = mpol_set_nodemask(new, &nodes, scratch); - task_unlock(current); - } else - ret = -ENOMEM; - NODEMASK_SCRATCH_FREE(scratch); - if (ret) { - mpol_put(new); - goto out; - } - } + /* + * Save nodes for mpol_to_str() to show the tmpfs mount options + * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. + */ + if (mode != MPOL_PREFERRED) + new->v.nodes = nodes; + else if (nodelist) + new->v.preferred_node = first_node(nodes); + else + new->flags |= MPOL_F_LOCAL; + + /* + * Save nodes for contextualization: this will be used to "clone" + * the mempolicy in a specific context [cpuset] at a later time. + */ + new->w.user_nodemask = nodes; + err = 0; out: @@ -2496,67 +2816,46 @@ out: * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted - * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask * - * Convert a mempolicy into a string. - * Returns the number of characters in buffer (if positive) - * or an error (negative) + * Convert @pol into a string. If @buffer is too short, truncate the string. + * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the + * longest flag, "relative", and to display at least a few node ids. 
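
The string mpol_parse_str() consumes arrives as the tmpfs mpol= mount option. A minimal userspace sketch (assumes root, an existing /mnt, and nodes 0-3 online):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    /* <mode>[=<flags>][:<nodelist>], as described above */
    if (mount("tmpfs", "/mnt", "tmpfs", 0, "mpol=interleave:0-3") != 0) {
        perror("mount");
        return 1;
    }
    puts("tmpfs will interleave its pages across nodes 0-3");
    return 0;
}
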
*/ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) +void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; - int l; - nodemask_t nodes; - unsigned short mode; - unsigned short flags = pol ? pol->flags : 0; - - /* - * Sanity check: room for longest mode, flag and some nodes - */ - VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); + nodemask_t nodes = NODE_MASK_NONE; + unsigned short mode = MPOL_DEFAULT; + unsigned short flags = 0; - if (!pol || pol == &default_policy) - mode = MPOL_DEFAULT; - else + if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; + flags = pol->flags; + } switch (mode) { case MPOL_DEFAULT: - nodes_clear(nodes); break; - case MPOL_PREFERRED: - nodes_clear(nodes); if (flags & MPOL_F_LOCAL) - mode = MPOL_LOCAL; /* pseudo-policy */ + mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; - case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: - if (no_context) - nodes = pol->w.user_nodemask; - else - nodes = pol->v.nodes; + nodes = pol->v.nodes; break; - default: - BUG(); + WARN_ON_ONCE(1); + snprintf(p, maxlen, "unknown"); + return; } - l = strlen(policy_modes[mode]); - if (buffer + maxlen < p + l + 1) - return -ENOSPC; - - strcpy(p, policy_modes[mode]); - p += l; + p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = '='; + p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive @@ -2568,10 +2867,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) } if (!nodes_empty(nodes)) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = ':'; + p += snprintf(p, buffer + maxlen - p, ":"); p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); } - return p - buffer; } diff --git a/mm/mempool.c b/mm/mempool.c index d9049811f352..905434f18c97 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy); mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { - return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); + return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data, + GFP_KERNEL, NUMA_NO_NODE); } EXPORT_SYMBOL(mempool_create); mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, int node_id) + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id) { mempool_t *pool; - pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); + pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); if (!pool) return NULL; pool->elements = kmalloc_node(min_nr * sizeof(void *), - GFP_KERNEL, node_id); + gfp_mask, node_id); if (!pool->elements) { kfree(pool); return NULL; @@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, while (pool->curr_nr < pool->min_nr) { void *element; - element = pool->alloc(GFP_KERNEL, pool->pool_data); + element = pool->alloc(gfp_mask, pool->pool_data); if (unlikely(!element)) { mempool_destroy(pool); return NULL; @@ -302,9 +304,9 @@ void mempool_free(void *element, mempool_t *pool) * ensures that there will be frees which return elements to the * pool waking up the waiters. 
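
For context, a mempool keeps a reserve of min_nr preallocated elements, and frees top the reserve back up before anything is returned to the allocator, so forward progress never depends on the allocator under memory pressure. A toy single-threaded userspace analogue (the kernel version adds the locking and waiter wake-ups shown in the surrounding hunks):

#include <stdio.h>
#include <stdlib.h>

struct pool {
    int curr_nr, min_nr;
    void **elements;
    size_t elem_size;
};

static void *pool_alloc(struct pool *p)
{
    void *e = malloc(p->elem_size);     /* pool->alloc() */

    if (!e && p->curr_nr > 0)
        e = p->elements[--p->curr_nr];  /* dip into the reserve */
    return e;
}

static void pool_free(struct pool *p, void *e)
{
    if (p->curr_nr < p->min_nr)
        p->elements[p->curr_nr++] = e;  /* refill the reserve first */
    else
        free(e);
}

int main(void)
{
    void *slots[4];
    struct pool p = { .curr_nr = 0, .min_nr = 4,
                      .elements = slots, .elem_size = 64 };
    void *e = pool_alloc(&p);

    pool_free(&p, e);   /* lands in the reserve, not in free() */
    printf("reserved elements: %d\n", p.curr_nr);
    return 0;
}
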
*/ - if (pool->curr_nr < pool->min_nr) { + if (unlikely(pool->curr_nr < pool->min_nr)) { spin_lock_irqsave(&pool->lock, flags); - if (pool->curr_nr < pool->min_nr) { + if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); wake_up(&pool->wait); diff --git a/mm/migrate.c b/mm/migrate.c index 11072383ae12..bed48809e5d0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -33,10 +33,16 @@ #include <linux/memcontrol.h> #include <linux/syscalls.h> #include <linux/hugetlb.h> +#include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> +#include <linux/balloon_compaction.h> +#include <linux/mmu_notifier.h> #include <asm/tlbflush.h> +#define CREATE_TRACE_POINTS +#include <trace/events/migrate.h> + #include "internal.h" /* @@ -66,19 +72,30 @@ int migrate_prep_local(void) } /* - * Add isolated pages on the list back to the LRU under page lock - * to avoid leaking evictable pages back onto unevictable list. + * Put previously isolated pages back onto the appropriate lists + * from where they were once taken off for compaction/migration. + * + * This function shall be used whenever the isolated pageset has been + * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() + * and isolate_huge_page(). */ -void putback_lru_pages(struct list_head *l) +void putback_movable_pages(struct list_head *l) { struct page *page; struct page *page2; list_for_each_entry_safe(page, page2, l, lru) { + if (unlikely(PageHuge(page))) { + putback_active_hugepage(page); + continue; + } list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - putback_lru_page(page); + if (unlikely(isolated_balloon_page(page))) + balloon_page_putback(page); + else + putback_lru_page(page); } } @@ -90,8 +107,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; swp_entry_t entry; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; @@ -100,21 +115,13 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, addr); if (!ptep) goto out; - ptl = &mm->page_table_lock; + ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep); } else { - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, addr); + if (!pmd) goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, addr); if (pmd_trans_huge(*pmd)) goto out; - if (!pmd_present(*pmd)) - goto out; ptep = pte_offset_map(pmd, addr); @@ -139,13 +146,17 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (pte_swp_soft_dirty(*ptep)) + pte = pte_mksoft_dirty(pte); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); #ifdef CONFIG_HUGETLB_PAGE - if (PageHuge(new)) + if (PageHuge(new)) { pte = pte_mkhuge(pte); + pte = arch_make_huge_pte(pte, vma, new, 0); + } #endif - flush_cache_page(vma, addr, pte_pfn(pte)); + flush_dcache_page(new); set_pte_at(mm, addr, ptep, pte); if (PageHuge(new)) { @@ -167,12 +178,49 @@ out: } /* + * Congratulations to trinity for discovering this bug. + * mm/fremap.c's remap_file_pages() accepts any range within a single vma to + * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then + * replace the specified range by file ptes throughout (maybe populated after). 
+ * If page migration finds a page within that range, while it's still located + * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: + * zap_pte() clears the temporary migration entry before mmap_sem is dropped. + * But if the migrating page is in a part of the vma outside the range to be + * remapped, then it will not be cleared, and remove_migration_ptes() needs to + * deal with it. Fortunately, this part of the vma is of course still linear, + * so we just need to use linear location on the nonlinear list. + */ +static int remove_linear_migration_ptes_from_nonlinear(struct page *page, + struct address_space *mapping, void *arg) +{ + struct vm_area_struct *vma; + /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long addr; + + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (addr >= vma->vm_start && addr < vma->vm_end) + remove_migration_pte(page, vma, addr, arg); + } + return SWAP_AGAIN; +} + +/* * Get rid of all migration entries and replace them by * references to the indicated page. */ static void remove_migration_ptes(struct page *old, struct page *new) { - rmap_walk(new, remove_migration_pte, old); + struct rmap_walk_control rwc = { + .rmap_one = remove_migration_pte, + .arg = old, + .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, + }; + + rmap_walk(new, &rwc); } /* @@ -180,15 +228,14 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) +static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { - pte_t *ptep, pte; - spinlock_t *ptl; + pte_t pte; swp_entry_t entry; struct page *page; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) goto out; @@ -216,6 +263,21 @@ out: pte_unmap_unlock(ptep, ptl); } +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + spinlock_t *ptl = pte_lockptr(mm, pmd); + pte_t *ptep = pte_offset_map(pmd, address); + __migration_entry_wait(mm, ptep, ptl); +} + +void migration_entry_wait_huge(struct vm_area_struct *vma, + struct mm_struct *mm, pte_t *pte) +{ + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); + __migration_entry_wait(mm, pte, ptl); +} + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, @@ -274,18 +336,19 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, * 2 for pages with a mapping * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 
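 * The extra_count argument introduced below is folded into this
 * expected count, so a caller that legitimately holds additional
 * references on the page can still pass the reference check.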
*/ -static int migrate_page_move_mapping(struct address_space *mapping, +int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, - struct buffer_head *head, enum migrate_mode mode) + struct buffer_head *head, enum migrate_mode mode, + int extra_count) { - int expected_count; + int expected_count = 1 + extra_count; void **pslot; if (!mapping) { /* Anonymous page without mapping */ - if (page_count(page) != 1) + if (page_count(page) != expected_count) return -EAGAIN; - return 0; + return MIGRATEPAGE_SUCCESS; } spin_lock_irq(&mapping->tree_lock); @@ -293,7 +356,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + page_has_private(page); + expected_count += 1 + page_has_private(page); if (page_count(page) != expected_count || radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -355,7 +418,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, } spin_unlock_irq(&mapping->tree_lock); - return 0; + return MIGRATEPAGE_SUCCESS; } /* @@ -371,7 +434,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, if (!mapping) { if (page_count(page) != 1) return -EAGAIN; - return 0; + return MIGRATEPAGE_SUCCESS; } spin_lock_irq(&mapping->tree_lock); @@ -398,7 +461,55 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); - return 0; + return MIGRATEPAGE_SUCCESS; +} + +/* + * Gigantic pages are so large that we do not guarantee that page++ pointer + * arithmetic will work across the entire page. We need something more + * specialized. + */ +static void __copy_gigantic_page(struct page *dst, struct page *src, + int nr_pages) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < nr_pages; ) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +static void copy_huge_page(struct page *dst, struct page *src) +{ + int i; + int nr_pages; + + if (PageHuge(src)) { + /* hugetlbfs page */ + struct hstate *h = page_hstate(src); + nr_pages = pages_per_huge_page(h); + + if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { + __copy_gigantic_page(dst, src, nr_pages); + return; + } + } else { + /* thp page */ + BUG_ON(!PageTransHuge(src)); + nr_pages = hpage_nr_pages(src); + } + + for (i = 0; i < nr_pages; i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } } /* @@ -406,7 +517,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, */ void migrate_page_copy(struct page *newpage, struct page *page) { - if (PageHuge(page)) + int cpupid; + + if (PageHuge(page) || PageTransHuge(page)) copy_huge_page(newpage, page); else copy_highpage(newpage, page); @@ -418,7 +531,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageUptodate(page)) SetPageUptodate(newpage); if (TestClearPageActive(page)) { - VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON_PAGE(PageUnevictable(page), page); SetPageActive(newpage); } else if (TestClearPageUnevictable(page)) SetPageUnevictable(newpage); @@ -436,12 +549,25 @@ void migrate_page_copy(struct page *newpage, struct page *page) * is actually a signal that all of the page has become dirty. * Whereas only part of our page may be dirty. 
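 * Swap-backed pages, on the other hand, are not part of any bdi dirty
 * accounting, which is presumably why the plain SetPageDirty() below
 * is sufficient for them.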
*/ - __set_page_dirty_nobuffers(newpage); + if (PageSwapBacked(page)) + SetPageDirty(newpage); + else + __set_page_dirty_nobuffers(newpage); } + /* + * Copy NUMA information to the new page, to prevent over-eager + * future migrations of this same page. + */ + cpupid = page_cpupid_xchg_last(page, -1); + page_cpupid_xchg_last(newpage, cpupid); + mlock_migrate_page(newpage, page); ksm_migrate_page(newpage, page); - + /* + * Please do not reorder this without considering how mm/ksm.c's + * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). + */ ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); @@ -458,14 +584,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) * Migration functions ***********************************************************/ -/* Always fail migration. Used for mappings that are not movable */ -int fail_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - /* * Common logic to directly migrate a single page suitable for * pages that do not use PagePrivate/PagePrivate2. @@ -480,13 +598,13 @@ int migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; migrate_page_copy(newpage, page); - return 0; + return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -507,9 +625,9 @@ int buffer_migrate_page(struct address_space *mapping, head = page_buffers(page); - rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); + rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; /* @@ -545,7 +663,7 @@ int buffer_migrate_page(struct address_space *mapping, } while (bh != head); - return 0; + return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(buffer_migrate_page); #endif @@ -624,7 +742,7 @@ static int fallback_migrate_page(struct address_space *mapping, * * Return value: * < 0 - error code - * == 0 - success + * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_page(struct page *newpage, struct page *page, int remap_swapcache, enum migrate_mode mode) @@ -661,7 +779,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, else rc = fallback_migrate_page(mapping, newpage, page, mode); - if (rc) { + if (rc != MIGRATEPAGE_SUCCESS) { newpage->mapping = NULL; } else { if (remap_swapcache) @@ -675,11 +793,10 @@ static int move_to_new_page(struct page *newpage, struct page *page, } static int __unmap_and_move(struct page *page, struct page *newpage, - int force, bool offlining, enum migrate_mode mode) + int force, enum migrate_mode mode) { int rc = -EAGAIN; int remap_swapcache = 1; - int charge = 0; struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; @@ -706,31 +823,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, lock_page(page); } - /* - * Only memory hotplug's offline_pages() caller has locked out KSM, - * and can safely migrate a KSM page. The other cases have skipped - * PageKsm along with PageReserved - but it is only now when we have - * the page lock that we can be certain it will not go KSM beneath us - * (KSM will not upgrade a page from PageAnon to PageKsm when it sees - * its pagecount raised, but only here do we take the page lock which - * serializes that). 
- */
-	if (PageKsm(page) && !offlining) {
-		rc = -EBUSY;
-		goto unlock;
-	}
-
 	/* charge against new page */
-	charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
-	if (charge == -ENOMEM) {
-		rc = -ENOMEM;
-		goto unlock;
-	}
-	BUG_ON(charge);
+	mem_cgroup_prepare_migration(page, newpage, &mem);

 	if (PageWriteback(page)) {
 		/*
-		 * Only in the case of a full syncronous migration is it
+		 * Only in the case of a full synchronous migration is it
 		 * necessary to wait for PageWriteback. In the async case,
 		 * the retry loop is too short and in the sync-light case,
 		 * the overhead of stalling is too much
@@ -751,9 +849,9 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	 * File Caches may use write_page() or lock_page() in migration, then,
 	 * just care Anon page here.
 	 */
-	if (PageAnon(page)) {
+	if (PageAnon(page) && !PageKsm(page)) {
 		/*
-		 * Only page_lock_anon_vma() understands the subtleties of
+		 * Only page_lock_anon_vma_read() understands the subtleties of
 		 * getting a hold on an anon_vma from outside one of its mms.
 		 */
 		anon_vma = page_get_anon_vma(page);
@@ -780,6 +878,18 @@
 		}
 	}

+	if (unlikely(balloon_page_movable(page))) {
+		/*
+		 * A ballooned page does not need any special attention from
+		 * physical to virtual reverse mapping procedures.
+		 * Skip any attempt to unmap PTEs or to remap swap cache,
+		 * in order to avoid burning cycles at rmap level, and perform
+		 * the page migration right away (protected by page lock).
+		 */
+		rc = balloon_page_migrate(newpage, page, mode);
+		goto uncharge;
+	}
+
 	/*
 	 * Corner case handling:
 	 * 1. When a new swap-cache page is read into, it is added to the LRU
@@ -793,7 +903,7 @@
 	 * free the metadata, so the page can be freed.
 	 */
 	if (!page->mapping) {
-		VM_BUG_ON(PageAnon(page));
+		VM_BUG_ON_PAGE(PageAnon(page), page);
 		if (page_has_private(page)) {
 			try_to_free_buffers(page);
 			goto uncharge;
@@ -816,9 +926,9 @@ skip_unmap:
 		put_anon_vma(anon_vma);

 uncharge:
-	if (!charge)
-		mem_cgroup_end_migration(mem, page, newpage, rc == 0);
-unlock:
+	mem_cgroup_end_migration(mem, page, newpage,
+				 (rc == MIGRATEPAGE_SUCCESS ||
+				  rc == MIGRATEPAGE_BALLOON_SUCCESS));
 	unlock_page(page);
 out:
 	return rc;
@@ -829,8 +939,7 @@ out:
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-			struct page *page, int force, bool offlining,
-			enum migrate_mode mode)
+			struct page *page, int force, enum migrate_mode mode)
{
	int rc = 0;
	int *result = NULL;
@@ -848,7 +957,19 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	if (unlikely(split_huge_page(page)))
 		goto out;

-	rc = __unmap_and_move(page, newpage, force, offlining, mode);
+	rc = __unmap_and_move(page, newpage, force, mode);
+
+	if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
+		/*
+		 * A ballooned page has been migrated already.
+		 * Now is the time to wrap up counters,
+		 * hand the page back to Buddy and return.
+	 */
+		dec_zone_page_state(page, NR_ISOLATED_ANON +
+				    page_is_file_cache(page));
+		balloon_page_free(page);
+		return MIGRATEPAGE_SUCCESS;
+	}
 out:
 	if (rc != -EAGAIN) {
 		/*
@@ -896,14 +1017,26 @@ out:
 */
static int unmap_and_move_huge_page(new_page_t get_new_page,
				unsigned long private, struct page *hpage,
-				int force, bool offlining,
-				enum migrate_mode mode)
+				int force, enum migrate_mode mode)
{
	int rc = 0;
	int *result = NULL;
-	struct page *new_hpage = get_new_page(hpage, private, &result);
+	struct page *new_hpage;
	struct anon_vma *anon_vma = NULL;

+	/*
+	 * Movability of hugepages depends on the architecture and hugepage
+	 * size. This check is necessary because some callers of hugepage
+	 * migration like soft offline and memory hotremove don't walk
+	 * through page tables or check whether the hugepage is pmd-based
+	 * or not before kicking migration.
+	 */
+	if (!hugepage_migration_support(page_hstate(hpage))) {
+		putback_active_hugepage(hpage);
+		return -ENOSYS;
+	}
+
+	new_hpage = get_new_page(hpage, private, &result);
 	if (!new_hpage)
 		return -ENOMEM;

@@ -928,16 +1061,15 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 	if (anon_vma)
 		put_anon_vma(anon_vma);

-	unlock_page(hpage);
-out:
-	if (rc != -EAGAIN) {
-		list_del(&hpage->lru);
-		put_page(hpage);
-	}
+	if (!rc)
+		hugetlb_cgroup_migrate(hpage, new_hpage);

+	unlock_page(hpage);
+out:
+	if (rc != -EAGAIN)
+		putback_active_hugepage(hpage);
 	put_page(new_hpage);
-
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -948,26 +1080,30 @@ out:
}

/*
- * migrate_pages
+ * migrate_pages - migrate the pages specified in a list, to the free pages
+ *		   supplied as the target for the page migration
 *
- * The function takes one list of pages to migrate and a function
- * that determines from the page to be migrated and the private data
- * the target of the move and allocates the page.
+ * @from:		The list of pages to be migrated.
+ * @get_new_page:	The function used to allocate free pages to be used
+ *			as the target of the page migration.
+ * @private:		Private data to be passed on to get_new_page()
+ * @mode:		The migration mode that specifies the constraints for
+ *			page migration, if any.
+ * @reason:		The reason for page migration.
 *
- * The function returns after 10 attempts or if no pages
- * are movable anymore because to has become empty
- * or no retryable pages exist anymore.
- * Caller should call putback_lru_pages to return pages to the LRU
+ * The function returns after 10 attempts or if no pages are movable any more
+ * because the list has become empty or no retryable pages exist any more.
+ * The caller should call putback_movable_pages() to return pages to the LRU
 * or free list only if ret != 0.
 *
- * Return: Number of pages not migrated or error code.
+ * Returns the number of pages that were not migrated, or an error code.
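+ *
+ * A minimal sketch of the expected calling pattern (the allocator
+ * callback and the MR_MEMORY_HOTPLUG reason are illustrative only):
+ *
+ *	LIST_HEAD(pagelist);
+ *	... isolate the pages to be moved onto &pagelist ...
+ *	ret = migrate_pages(&pagelist, alloc_target_page, 0,
+ *			    MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ *	if (ret)
+ *		putback_movable_pages(&pagelist);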
*/ -int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, bool offlining, - enum migrate_mode mode) +int migrate_pages(struct list_head *from, new_page_t get_new_page, + unsigned long private, enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; + int nr_succeeded = 0; int pass = 0; struct page *page; struct page *page2; @@ -983,9 +1119,12 @@ int migrate_pages(struct list_head *from, list_for_each_entry_safe(page, page2, from, lru) { cond_resched(); - rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining, - mode); + if (PageHuge(page)) + rc = unmap_and_move_huge_page(get_new_page, + private, page, pass > 2, mode); + else + rc = unmap_and_move(get_new_page, private, + page, pass > 2, mode); switch(rc) { case -ENOMEM: @@ -993,68 +1132,33 @@ int migrate_pages(struct list_head *from, case -EAGAIN: retry++; break; - case 0: + case MIGRATEPAGE_SUCCESS: + nr_succeeded++; break; default: - /* Permanent failure */ + /* + * Permanent failure (-EBUSY, -ENOSYS, etc.): + * unlike -EAGAIN case, the failed page is + * removed from migration page list and not + * retried in the next outer loop. + */ nr_failed++; break; } } } - rc = 0; + rc = nr_failed + retry; out: + if (nr_succeeded) + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); + if (nr_failed) + count_vm_events(PGMIGRATE_FAIL, nr_failed); + trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); + if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - if (rc) - return rc; - - return nr_failed + retry; -} - -int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, bool offlining, - enum migrate_mode mode) -{ - int retry = 1; - int nr_failed = 0; - int pass = 0; - struct page *page; - struct page *page2; - int rc; - - for (pass = 0; pass < 10 && retry; pass++) { - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - cond_resched(); - - rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, offlining, - mode); - - switch(rc) { - case -ENOMEM: - goto out; - case -EAGAIN: - retry++; - break; - case 0: - break; - default: - /* Permanent failure */ - nr_failed++; - break; - } - } - } - rc = 0; -out: - if (rc) - return rc; - - return nr_failed + retry; + return rc; } #ifdef CONFIG_NUMA @@ -1081,8 +1185,12 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_exact_node(pm->node, - GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + pm->node); + else + return alloc_pages_exact_node(pm->node, + GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } /* @@ -1124,7 +1232,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto set_status; /* Use PageReserved to check for zero page */ - if (PageReserved(page) || PageKsm(page)) + if (PageReserved(page)) goto put_and_set; pp->page = page; @@ -1141,6 +1249,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, !migrate_all) goto put_and_set; + if (PageHuge(page)) { + isolate_huge_page(page, &pagelist); + goto put_and_set; + } + err = isolate_lru_page(page); if (!err) { list_add_tail(&page->lru, &pagelist); @@ -1161,9 +1274,9 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, MIGRATE_SYNC); + (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } up_read(&mm->mmap_sem); @@ -1223,7 
+1336,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 		if (node < 0 || node >= MAX_NUMNODES)
 			goto out_pm;

-		if (!node_state(node, N_HIGH_MEMORY))
+		if (!node_state(node, N_MEMORY))
 			goto out_pm;

 		err = -EACCES;
@@ -1285,7 +1398,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,

 		err = -ENOENT;
 		/* Use PageReserved to check for zero page */
-		if (!page || PageReserved(page) || PageKsm(page))
+		if (!page || PageReserved(page))
 			goto set_status;

 		err = page_to_nid(page);
@@ -1371,8 +1484,8 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
 	 * userid as the target process.
 	 */
 	tcred = __task_cred(task);
-	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
-	    cred->uid != tcred->suid && cred->uid != tcred->uid &&
+	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+	    !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
 	    !capable(CAP_SYS_NICE)) {
 		rcu_read_unlock();
 		err = -EPERM;
@@ -1425,4 +1538,374 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
 	}
 	return err;
}
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages. Currently it only checks the watermarks, which is crude.
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+				   unsigned long nr_migrate_pages)
+{
+	int z;
+	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+		struct zone *zone = pgdat->node_zones + z;
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (!zone_reclaimable(zone))
+			continue;
+
+		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
+		if (!zone_watermark_ok(zone, 0,
+				       high_wmark_pages(zone) +
+				       nr_migrate_pages,
+				       0, 0))
+			continue;
+		return true;
+	}
+	return false;
+}
+
+static struct page *alloc_misplaced_dst_page(struct page *page,
+					   unsigned long data,
+					   int **result)
+{
+	int nid = (int) data;
+	struct page *newpage;
+
+	newpage = alloc_pages_exact_node(nid,
+					 (GFP_HIGHUSER_MOVABLE |
+					  __GFP_THISNODE | __GFP_NOMEMALLOC |
+					  __GFP_NORETRY | __GFP_NOWARN) &
+					 ~GFP_IOFS, 0);
+
+	return newpage;
+}
+
+/*
+ * page migration rate limiting control.
+ * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
+ * window of time. Default here says do not migrate more than 1280M per second.
+ * If a node is rate-limited then PTE NUMA updates are also rate-limited.
+ * However, as it is faults that reset the window, pte updates will happen
+ * unconditionally if there has not been a fault since
+ * @pteupdate_interval_millisecs after the throttle window closed.
+ */
+static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
+static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
+
+/* Returns true if NUMA migration is currently rate limited */
+bool migrate_ratelimited(int node)
+{
+	pg_data_t *pgdat = NODE_DATA(node);
+
+	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
+				msecs_to_jiffies(pteupdate_interval_millisecs)))
+		return false;
+
+	if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
+		return false;
+
+	return true;
+}
+
+/* Returns true if the node is migrate rate-limited after the update */
+static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
+					unsigned long nr_pages)
+{
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
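+	 * (With the defaults above that is 128 << (20 - PAGE_SHIFT) pages
+	 * per 100ms window; with 4K pages that is 32768 pages, i.e. 128MB,
+	 * which is where the advertised 1280MB per second comes from.)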
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+		spin_lock(&pgdat->numabalancing_migrate_lock);
+		pgdat->numabalancing_migrate_nr_pages = 0;
+		pgdat->numabalancing_migrate_next_window = jiffies +
+			msecs_to_jiffies(migrate_interval_millisecs);
+		spin_unlock(&pgdat->numabalancing_migrate_lock);
+	}
+	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
+		trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
+								nr_pages);
+		return true;
+	}
+
+	/*
+	 * This is an unlocked non-atomic update so errors are possible.
+	 * The consequence is failing to migrate when we potentially should
+	 * have, which is not severe enough to warrant locking. If it is ever
+	 * a problem, it can be converted to a per-cpu counter.
+	 */
+	pgdat->numabalancing_migrate_nr_pages += nr_pages;
+	return false;
+}
+
+static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+	int page_lru;
+
+	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+
+	/* Avoid migrating to a node that is nearly full */
+	if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+		return 0;
+
+	if (isolate_lru_page(page))
+		return 0;
+
+	/*
+	 * migrate_misplaced_transhuge_page() skips page migration's usual
+	 * check on page_count(), so we must do it here, now that the page
+	 * has been isolated: a GUP pin, or any other pin, prevents migration.
+	 * The expected page count is 3: 1 for the page's mapcount, 1 for the
+	 * caller's pin and 1 for the reference taken by isolate_lru_page().
+	 */
+	if (PageTransHuge(page) && page_count(page) != 3) {
+		putback_lru_page(page);
+		return 0;
+	}
+
+	page_lru = page_is_file_cache(page);
+	mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
+				hpage_nr_pages(page));
+
+	/*
+	 * Isolating the page has taken another reference, so the
+	 * caller's reference can be safely dropped without the page
+	 * disappearing underneath us during migration.
+	 */
+	put_page(page);
+	return 1;
+}
+
+bool pmd_trans_migrating(pmd_t pmd)
+{
+	struct page *page = pmd_page(pmd);
+	return PageLocked(page);
+}
+
+void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
+{
+	struct page *page = pmd_page(*pmd);
+	wait_on_page_locked(page);
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+			   int node)
+{
+	pg_data_t *pgdat = NODE_DATA(node);
+	int isolated;
+	int nr_remaining;
+	LIST_HEAD(migratepages);
+
+	/*
+	 * Don't migrate file pages that are mapped in multiple processes
+	 * with execute permissions as they are probably shared libraries.
+	 */
+	if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+	    (vma->vm_flags & VM_EXEC))
+		goto out;
+
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+ */ + if (numamigrate_update_ratelimit(pgdat, 1)) + goto out; + + isolated = numamigrate_isolate_page(pgdat, page); + if (!isolated) + goto out; + + list_add(&page->lru, &migratepages); + nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, + node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); + if (nr_remaining) { + if (!list_empty(&migratepages)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + isolated = 0; + } else + count_vm_numa_event(NUMA_PAGE_MIGRATE); + BUG_ON(!list_empty(&migratepages)); + return isolated; + +out: + put_page(page); + return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +/* + * Migrates a THP to a given target node. page must be locked and is unlocked + * before returning. + */ +int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node) +{ + spinlock_t *ptl; + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + struct page *new_page = NULL; + struct mem_cgroup *memcg = NULL; + int page_lru = page_is_file_cache(page); + unsigned long mmun_start = address & HPAGE_PMD_MASK; + unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; + pmd_t orig_entry; + + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! + */ + if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) + goto out_dropref; + + new_page = alloc_pages_node(node, + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + HPAGE_PMD_ORDER); + if (!new_page) + goto out_fail; + + isolated = numamigrate_isolate_page(pgdat, page); + if (!isolated) { + put_page(new_page); + goto out_fail; + } + + if (mm_tlb_flush_pending(mm)) + flush_tlb_range(vma, mmun_start, mmun_end); + + /* Prepare a page as a migration target */ + __set_page_locked(new_page); + SetPageSwapBacked(new_page); + + /* anon mapping, we can simply copy page->mapping to the new page: */ + new_page->mapping = page->mapping; + new_page->index = page->index; + migrate_page_copy(new_page, page); + WARN_ON(PageLRU(new_page)); + + /* Recheck the target PMD */ + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { +fail_putback: + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + /* Reverse changes made by migrate_page_copy() */ + if (TestClearPageActive(new_page)) + SetPageActive(page); + if (TestClearPageUnevictable(new_page)) + SetPageUnevictable(page); + mlock_migrate_page(page, new_page); + + unlock_page(new_page); + put_page(new_page); /* Free it */ + + /* Retake the callers reference and putback on LRU */ + get_page(page); + putback_lru_page(page); + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); + + goto out_unlock; + } + + /* + * Traditional migration needs to prepare the memcg charge + * transaction early to prevent the old page from being + * uncharged when installing migration entries. Here we can + * save the potential rollback and start the charge transfer + * only when migration is already known to end successfully. 
+ */ + mem_cgroup_prepare_migration(page, new_page, &memcg); + + orig_entry = *pmd; + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = pmd_mkhuge(entry); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + + /* + * Clear the old entry under pagetable lock and establish the new PTE. + * Any parallel GUP will either observe the old page blocking on the + * page lock, block on the page table lock or observe the new page. + * The SetPageUptodate on the new page and page_add_new_anon_rmap + * guarantee the copy is visible before the pagetable update. + */ + flush_cache_range(vma, mmun_start, mmun_end); + page_add_new_anon_rmap(new_page, vma, mmun_start); + pmdp_clear_flush(vma, mmun_start, pmd); + set_pmd_at(mm, mmun_start, pmd, entry); + flush_tlb_range(vma, mmun_start, mmun_end); + update_mmu_cache_pmd(vma, address, &entry); + + if (page_count(page) != 2) { + set_pmd_at(mm, mmun_start, pmd, orig_entry); + flush_tlb_range(vma, mmun_start, mmun_end); + update_mmu_cache_pmd(vma, address, &entry); + page_remove_rmap(new_page); + goto fail_putback; + } + + page_remove_rmap(page); + + /* + * Finish the charge transaction under the page table lock to + * prevent split_huge_page() from dividing up the charge + * before it's fully transferred to the new page. + */ + mem_cgroup_end_migration(memcg, page, new_page, true); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + unlock_page(new_page); + unlock_page(page); + put_page(page); /* Drop the rmap reference */ + put_page(page); /* Drop the LRU isolation reference */ + + count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); + count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); + + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, + -HPAGE_PMD_NR); + return isolated; + +out_fail: + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); +out_dropref: + ptl = pmd_lock(mm, pmd); + if (pmd_same(*pmd, entry)) { + entry = pmd_mknonnuma(entry); + set_pmd_at(mm, mmun_start, pmd, entry); + update_mmu_cache_pmd(vma, address, &entry); + } + spin_unlock(ptl); + +out_unlock: + unlock_page(page); + put_page(page); + return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* CONFIG_NUMA */ diff --git a/mm/mincore.c b/mm/mincore.c index 936b4cee8cb1..725c80961048 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - page = find_get_page(mapping, pgoff); #ifdef CONFIG_SWAP - /* shmem/tmpfs may return swap: account for swapcache page too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(&swapper_space, swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + /* + * shmem/tmpfs may return swap: account for swapcache + * page too. 
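+		 * (find_get_entry() may return a radix-tree exceptional
+		 * entry encoding a swp_entry_t instead of a page pointer;
+		 * that is what the check below catches.)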
+		 */
+		if (radix_tree_exceptional_entry(page)) {
+			swp_entry_t swp = radix_to_swp_entry(page);
+			page = find_get_page(swap_address_space(swp), swp.val);
+		}
+	} else
+		page = find_get_page(mapping, pgoff);
+#else
+	page = find_get_page(mapping, pgoff);
 #endif
 	if (page) {
 		present = PageUptodate(page);
@@ -135,7 +143,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		} else {
 #ifdef CONFIG_SWAP
 			pgoff = entry.val;
-			*vec = mincore_page(&swapper_space, pgoff);
+			*vec = mincore_page(swap_address_space(entry),
+					    pgoff);
 #else
 			WARN_ON(1);
 			*vec = 1;
@@ -224,13 +233,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v

 	end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));

-	if (is_vm_hugetlb_page(vma)) {
-		mincore_hugetlb_page_range(vma, addr, end, vec);
-		return (end - addr) >> PAGE_SHIFT;
-	}
-
-	end = pmd_addr_end(addr, end);
-
 	if (is_vm_hugetlb_page(vma))
 		mincore_hugetlb_page_range(vma, addr, end, vec);
 	else
diff --git a/mm/mlock.c b/mm/mlock.c
index ef726e8aa8e9..b1eb53634005 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -11,6 +11,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pagemap.h>
+#include <linux/pagevec.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
@@ -18,6 +19,8 @@
 #include <linux/rmap.h>
 #include <linux/mmzone.h>
 #include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>

 #include "internal.h"

@@ -51,15 +54,13 @@ EXPORT_SYMBOL(can_do_mlock);
 /*
 * LRU accounting for clear_page_mlock()
 */
-void __clear_page_mlock(struct page *page)
+void clear_page_mlock(struct page *page)
{
-	VM_BUG_ON(!PageLocked(page));
-
-	if (!page->mapping) {	/* truncated ? */
+	if (!TestClearPageMlocked(page))
 		return;
-	}

-	dec_zone_page_state(page, NR_MLOCK);
+	mod_zone_page_state(page_zone(page), NR_MLOCK,
+			    -hpage_nr_pages(page));
 	count_vm_event(UNEVICTABLE_PGCLEARED);
 	if (!isolate_lru_page(page)) {
 		putback_lru_page(page);
@@ -78,19 +79,85 @@
 */
void mlock_vma_page(struct page *page)
{
+	/* Serialize with page migration */
 	BUG_ON(!PageLocked(page));

 	if (!TestSetPageMlocked(page)) {
-		inc_zone_page_state(page, NR_MLOCK);
+		mod_zone_page_state(page_zone(page), NR_MLOCK,
+				    hpage_nr_pages(page));
 		count_vm_event(UNEVICTABLE_PGMLOCKED);
 		if (!isolate_lru_page(page))
 			putback_lru_page(page);
 	}
}

+/*
+ * Isolate a page from LRU with optional get_page() pin.
+ * Assumes lru_lock already held and page already pinned.
+ */
+static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+{
+	if (PageLRU(page)) {
+		struct lruvec *lruvec;
+
+		lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+		if (getpage)
+			get_page(page);
+		ClearPageLRU(page);
+		del_page_from_lru_list(page, lruvec, page_lru(page));
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Finish munlock after successful page isolation
+ *
+ * Page must be locked. This is a wrapper for try_to_munlock()
+ * and putback_lru_page() with munlock accounting.
+ */
+static void __munlock_isolated_page(struct page *page)
+{
+	int ret = SWAP_AGAIN;
+
+	/*
+	 * Optimization: if the page was mapped just once, that's our mapping
+	 * and we don't need to check all the other vmas.
+	 */
+	if (page_mapcount(page) > 1)
+		ret = try_to_munlock(page);
+
+	/* Did try_to_munlock() succeed or punt?
*/ + if (ret != SWAP_MLOCK) + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + + putback_lru_page(page); +} + +/* + * Accounting for page isolation fail during munlock + * + * Performs accounting when page isolation fails in munlock. There is nothing + * else to do because it means some other task has already removed the page + * from the LRU. putback_lru_page() will take care of removing the page from + * the unevictable list, if necessary. vmscan [page_referenced()] will move + * the page back to the unevictable list if some other vma has it mlocked. + */ +static void __munlock_isolation_failed(struct page *page) +{ + if (PageUnevictable(page)) + __count_vm_event(UNEVICTABLE_PGSTRANDED); + else + __count_vm_event(UNEVICTABLE_PGMUNLOCKED); +} + /** * munlock_vma_page - munlock a vma page - * @page - page to be unlocked + * @page - page to be unlocked, either a normal page or THP page head + * + * returns the size of the page as a page mask (0 for normal page, + * HPAGE_PMD_NR - 1 for THP head page) * * called from munlock()/munmap() path with page supposedly on the LRU. * When we munlock a page, because the vma where we found the page is being @@ -103,44 +170,39 @@ void mlock_vma_page(struct page *page) * can't isolate the page, we leave it for putback_lru_page() and vmscan * [page_referenced()/try_to_unmap()] to deal with. */ -void munlock_vma_page(struct page *page) +unsigned int munlock_vma_page(struct page *page) { + unsigned int nr_pages; + struct zone *zone = page_zone(page); + + /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); - if (TestClearPageMlocked(page)) { - dec_zone_page_state(page, NR_MLOCK); - if (!isolate_lru_page(page)) { - int ret = SWAP_AGAIN; + /* + * Serialize with any parallel __split_huge_page_refcount() which + * might otherwise copy PageMlocked to part of the tail pages before + * we clear it in the head page. It also stabilizes hpage_nr_pages(). + */ + spin_lock_irq(&zone->lru_lock); - /* - * Optimization: if the page was mapped just once, - * that's our mapping and we don't need to check all the - * other vmas. - */ - if (page_mapcount(page) > 1) - ret = try_to_munlock(page); - /* - * did try_to_unlock() succeed or punt? - */ - if (ret != SWAP_MLOCK) - count_vm_event(UNEVICTABLE_PGMUNLOCKED); + nr_pages = hpage_nr_pages(page); + if (!TestClearPageMlocked(page)) + goto unlock_out; - putback_lru_page(page); - } else { - /* - * Some other task has removed the page from the LRU. - * putback_lru_page() will take care of removing the - * page from the unevictable list, if necessary. - * vmscan [page_referenced()] will move the page back - * to the unevictable list if some other vma has it - * mlocked. - */ - if (PageUnevictable(page)) - count_vm_event(UNEVICTABLE_PGSTRANDED); - else - count_vm_event(UNEVICTABLE_PGMUNLOCKED); - } + __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); + + if (__munlock_isolate_lru_page(page, true)) { + spin_unlock_irq(&zone->lru_lock); + __munlock_isolated_page(page); + goto out; } + __munlock_isolation_failed(page); + +unlock_out: + spin_unlock_irq(&zone->lru_lock); + +out: + return nr_pages - 1; } /** @@ -155,13 +217,11 @@ void munlock_vma_page(struct page *page) * * vma->vm_mm->mmap_sem must be held for at least read. 
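 *
 * A minimal calling sketch, mirroring what __mm_populate() does (the
 * range and "locked" variables are hypothetical here):
 *
 *	ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
 *	if (ret < 0)
 *		ret = __mlock_posix_error_return(ret);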
*/ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int *nonblocking) +long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) { struct mm_struct *mm = vma->vm_mm; - unsigned long addr = start; - int nr_pages = (end - start) / PAGE_SIZE; + unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); @@ -186,7 +246,11 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) gup_flags |= FOLL_FORCE; - return __get_user_pages(current, mm, addr, nr_pages, gup_flags, + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. + */ + return __get_user_pages(current, mm, start, nr_pages, gup_flags, NULL, NULL, nonblocking); } @@ -202,54 +266,188 @@ static int __mlock_posix_error_return(long retval) return retval; } -/** - * mlock_vma_pages_range() - mlock pages in specified vma range. - * @vma - the vma containing the specfied address range - * @start - starting address in @vma to mlock - * @end - end address [+1] in @vma to mlock - * - * For mmap()/mremap()/expansion of mlocked vma. +/* + * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() * - * return 0 on success for "normal" vmas. + * The fast path is available only for evictable pages with single mapping. + * Then we can bypass the per-cpu pvec and get better performance. + * when mapcount > 1 we need try_to_munlock() which can fail. + * when !page_evictable(), we need the full redo logic of putback_lru_page to + * avoid leaving evictable page in unevictable list. * - * return number of pages [> 0] to be removed from locked_vm on success - * of "special" vmas. + * In case of success, @page is added to @pvec and @pgrescued is incremented + * in case that the page was previously unevictable. @page is also unlocked. */ -long mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, + int *pgrescued) { - int nr_pages = (end - start) / PAGE_SIZE; - BUG_ON(!(vma->vm_flags & VM_LOCKED)); + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (page_mapcount(page) <= 1 && page_evictable(page)) { + pagevec_add(pvec, page); + if (TestClearPageUnevictable(page)) + (*pgrescued)++; + unlock_page(page); + return true; + } + + return false; +} +/* + * Putback multiple evictable pages to the LRU + * + * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of + * the pages might have meanwhile become unevictable but that is OK. + */ +static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) +{ + count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); /* - * filter unlockable vmas + *__pagevec_lru_add() calls release_pages() so we don't call + * put_page() explicitly */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - goto no_mlock; + __pagevec_lru_add(pvec); + count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); +} + +/* + * Munlock a batch of pages from the same zone + * + * The work is split to two main phases. First phase clears the Mlocked flag + * and attempts to isolate the pages, all under a single zone lru lock. + * The second phase finishes the munlock only for pages where isolation + * succeeded. + * + * Note that the pagevec may be modified during the process. 
+ */
+static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+{
+	int i;
+	int nr = pagevec_count(pvec);
+	int delta_munlocked;
+	struct pagevec pvec_putback;
+	int pgrescued = 0;

-	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
-			is_vm_hugetlb_page(vma) ||
-			vma == get_gate_vma(current->mm))) {
+	pagevec_init(&pvec_putback, 0);

-		__mlock_vma_pages_range(vma, start, end, NULL);
+	/* Phase 1: page isolation */
+	spin_lock_irq(&zone->lru_lock);
+	for (i = 0; i < nr; i++) {
+		struct page *page = pvec->pages[i];

-		/* Hide errors from mmap() and other callers */
-		return 0;
+		if (TestClearPageMlocked(page)) {
+			/*
+			 * We already have pin from follow_page_mask()
+			 * so we can spare the get_page() here.
+			 */
+			if (__munlock_isolate_lru_page(page, false))
+				continue;
+			else
+				__munlock_isolation_failed(page);
+		}
+
+		/*
+		 * We won't be munlocking this page in the next phase
+		 * but we still need to release the follow_page_mask()
+		 * pin. We cannot do it under lru_lock however. If it's
+		 * the last pin, __page_cache_release() would deadlock.
+		 */
+		pagevec_add(&pvec_putback, pvec->pages[i]);
+		pvec->pages[i] = NULL;
+	}
+	delta_munlocked = -nr + pagevec_count(&pvec_putback);
+	__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+	spin_unlock_irq(&zone->lru_lock);
+
+	/* Now we can release pins of pages that we are not munlocking */
+	pagevec_release(&pvec_putback);
+
+	/* Phase 2: page munlock */
+	for (i = 0; i < nr; i++) {
+		struct page *page = pvec->pages[i];
+
+		if (page) {
+			lock_page(page);
+			if (!__putback_lru_fast_prepare(page, &pvec_putback,
+					&pgrescued)) {
+				/*
+				 * Slow path. We don't want to lose the last
+				 * pin before unlock_page()
+				 */
+				get_page(page); /* for putback_lru_page() */
+				__munlock_isolated_page(page);
+				unlock_page(page);
+				put_page(page); /* from follow_page_mask() */
+			}
+		}
 	}

 	/*
-	 * User mapped kernel pages or huge pages:
-	 * make these pages present to populate the ptes, but
-	 * fall thru' to reset VM_LOCKED--no need to unlock, and
-	 * return nr_pages so these don't get counted against task's
-	 * locked limit. huge pages are already counted against
-	 * locked vm limit.
+	 * Phase 3: page putback for pages that qualified for the fast path
+	 * This will also call put_page() to return pin from follow_page_mask()
 	 */
-	make_pages_present(start, end);
+	if (pagevec_count(&pvec_putback))
+		__putback_lru_fast(&pvec_putback, pgrescued);
+}

-no_mlock:
-	vma->vm_flags &= ~VM_LOCKED;	/* and don't come back! */
-	return nr_pages;		/* error or pages NOT mlocked */
+/*
+ * Fill up pagevec for __munlock_pagevec using pte walk
+ *
+ * The function expects that the struct page corresponding to @start address is
+ * a non-THP page already pinned and in the @pvec, and that it belongs to @zone.
+ *
+ * The rest of @pvec is filled by subsequent pages within the same pmd and same
+ * zone, as long as the ptes are present and vm_normal_page() succeeds. These
+ * pages also get pinned.
+ *
+ * Returns the address of the next page that should be scanned. This equals
+ * @start + PAGE_SIZE when no page could be added by the pte walk.
+ */
+static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
+		struct vm_area_struct *vma, int zoneid, unsigned long start,
+		unsigned long end)
+{
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	/*
+	 * Initialize pte walk starting at the already pinned page where we
+	 * are sure that there is a pte, as it was pinned under the same
+	 * mmap_sem write op.
+ */ + pte = get_locked_pte(vma->vm_mm, start, &ptl); + /* Make sure we do not cross the page table boundary */ + end = pgd_addr_end(start, end); + end = pud_addr_end(start, end); + end = pmd_addr_end(start, end); + + /* The page next to the pinned page is the first we will try to get */ + start += PAGE_SIZE; + while (start < end) { + struct page *page = NULL; + pte++; + if (pte_present(*pte)) + page = vm_normal_page(vma, start, *pte); + /* + * Break if page could not be obtained or the page's node+zone does not + * match + */ + if (!page || page_zone_id(page) != zoneid) + break; + + get_page(page); + /* + * Increase the address that will be returned *before* the + * eventual break due to pvec becoming full by adding the page + */ + start += PAGE_SIZE; + if (pagevec_add(pvec, page) == 0) + break; + } + pte_unmap_unlock(pte, ptl); + return start; } /* @@ -273,13 +471,17 @@ no_mlock: void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - unsigned long addr; - - lru_add_drain(); vma->vm_flags &= ~VM_LOCKED; - for (addr = start; addr < end; addr += PAGE_SIZE) { - struct page *page; + while (start < end) { + struct page *page = NULL; + unsigned int page_mask; + unsigned long page_increm; + struct pagevec pvec; + struct zone *zone; + int zoneid; + + pagevec_init(&pvec, 0); /* * Although FOLL_DUMP is intended for get_dump_page(), * it just so happens that its special treatment of the @@ -287,20 +489,48 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * suits munlock very well (and if somehow an abnormal page * has sneaked into the range, we won't oops here: great). */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, + &page_mask); + if (page && !IS_ERR(page)) { - lock_page(page); - /* - * Like in __mlock_vma_pages_range(), - * because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - munlock_vma_page(page); - unlock_page(page); - put_page(page); + if (PageTransHuge(page)) { + lock_page(page); + /* + * Any THP page found by follow_page_mask() may + * have gotten split before reaching + * munlock_vma_page(), so we need to recompute + * the page_mask here. + */ + page_mask = munlock_vma_page(page); + unlock_page(page); + put_page(page); /* follow_page_mask() */ + } else { + /* + * Non-huge pages are handled in batches via + * pagevec. The pin from follow_page_mask() + * prevents them from collapsing by THP. + */ + pagevec_add(&pvec, page); + zone = page_zone(page); + zoneid = page_zone_id(page); + + /* + * Try to fill the rest of pagevec using fast + * pte walk. This will also update start to + * the next page to process. Then munlock the + * pagevec. + */ + start = __munlock_pagevec_fill(&pvec, vma, + zoneid, start, end); + __munlock_pagevec(&pvec, zone); + goto next; + } } + /* It's a bug to munlock in the middle of a THP page */ + VM_BUG_ON((start >> PAGE_SHIFT) & page_mask); + page_increm = 1 + page_mask; + start += page_increm * PAGE_SIZE; +next: cond_resched(); } } @@ -310,7 +540,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * * Filters out "special" vmas -- VM_LOCKED never gets set for these, and * munlock is a no-op. However, for some special vmas, we go ahead and - * populate the ptes via make_pages_present(). + * populate the ptes. * * For vmas that pass the filters, merge/split as appropriate. 
*/ @@ -398,9 +628,9 @@ static int do_mlock(unsigned long start, size_t len, int on) /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vma->vm_flags | VM_LOCKED; - if (!on) - newflags &= ~VM_LOCKED; + newflags = vma->vm_flags & ~VM_LOCKED; + if (on) + newflags |= VM_LOCKED; tmp = vma->vm_end; if (tmp > end) @@ -423,13 +653,20 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; - int ret = 0; + long ret = 0; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -490,22 +727,24 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; - locked = len >> PAGE_SHIFT; - locked += current->mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; + locked = len >> PAGE_SHIFT; + + down_write(¤t->mm->mmap_sem); + + locked += current->mm->locked_vm; /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); + up_write(¤t->mm->mmap_sem); if (!error) - error = do_mlock_pages(start, len, 0); + error = __mm_populate(start, len, 0); return error; } @@ -513,34 +752,37 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; + + down_write(¤t->mm->mmap_sem); ret = do_mlock(start, len, 0); up_write(¤t->mm->mmap_sem); + return ret; } static int do_mlockall(int flags) { struct vm_area_struct * vma, * prev = NULL; - unsigned int def_flags = 0; if (flags & MCL_FUTURE) - def_flags = VM_LOCKED; - current->mm->def_flags = def_flags; + current->mm->def_flags |= VM_LOCKED; + else + current->mm->def_flags &= ~VM_LOCKED; if (flags == MCL_FUTURE) goto out; for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { vm_flags_t newflags; - newflags = vma->vm_flags | VM_LOCKED; - if (!(flags & MCL_CURRENT)) - newflags &= ~VM_LOCKED; + newflags = vma->vm_flags & ~VM_LOCKED; + if (flags & MCL_CURRENT) + newflags |= VM_LOCKED; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + cond_resched(); } out: return 0; @@ -561,20 +803,18 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (flags & MCL_CURRENT) lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; + down_write(¤t->mm->mmap_sem); + if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(¤t->mm->mmap_sem); - if (!ret && (flags & MCL_CURRENT)) { - /* Ignore errors */ - do_mlock_pages(0, TASK_SIZE, 1); - } + if (!ret && (flags & MCL_CURRENT)) + mm_populate(0, TASK_SIZE); out: return ret; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 1ffd97ae26d7..4074caf9936b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -9,6 +9,8 @@ #include 
<linux/init.h> #include <linux/kobject.h> #include <linux/export.h> +#include <linux/memory.h> +#include <linux/notifier.h> #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -69,34 +71,41 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, + LAST_CPUPID_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d\n", + "Section %d Node %d Zone %d Lastcpupid %d\n", SECTIONS_SHIFT, NODES_SHIFT, - ZONES_SHIFT); - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", - "Section %lu Node %lu Zone %lu\n", + ZONES_SHIFT, + LAST_CPUPID_SHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", + "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, - (unsigned long)ZONES_PGSHIFT); - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", - "Zone ID: %lu -> %lu\n", - (unsigned long)ZONEID_PGOFF, - (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); + (unsigned long)ZONES_PGSHIFT, + (unsigned long)LAST_CPUPID_PGSHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", + "Node/Zone ID: %lu -> %lu\n", + (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), + (unsigned long)ZONEID_PGOFF); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", - "location: %d -> %d unused %d -> %d flags %d -> %d\n", + "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n", shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); #ifdef NODE_NOT_IN_PAGE_FLAGS mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", "Node not in page flags"); #endif +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", + "Last cpupid not in page flags"); +#endif if (SECTIONS_WIDTH) { shift -= SECTIONS_WIDTH; @@ -140,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel); struct kobject *mm_kobj; EXPORT_SYMBOL_GPL(mm_kobj); +#ifdef CONFIG_SMP +s32 vm_committed_as_batch = 32; + +static void __meminit mm_compute_batch(void) +{ + u64 memsized_batch; + s32 nr = num_present_cpus(); + s32 batch = max_t(s32, nr*2, 32); + + /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ + memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); + + vm_committed_as_batch = max_t(s32, memsized_batch, batch); +} + +static int __meminit mm_compute_batch_notifier(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_ONLINE: + case MEM_OFFLINE: + mm_compute_batch(); + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block compute_batch_nb __meminitdata = { + .notifier_call = mm_compute_batch_notifier, + .priority = IPC_CALLBACK_PRI, /* use lowest priority */ +}; + +static int __init mm_compute_batch_init(void) +{ + mm_compute_batch(); + register_hotmemory_notifier(&compute_batch_nb); + + return 0; +} + +__initcall(mm_compute_batch_init); + +#endif + static int __init mm_sysfs_init(void) { mm_kobj = kobject_create_and_add("mm", kernel_kobj); @@ -148,5 +202,4 @@ static int __init mm_sysfs_init(void) return 0; } - -__initcall(mm_sysfs_init); +postcore_initcall(mm_sysfs_init); diff --git 
a/mm/mmap.c b/mm/mmap.c index 848ef52d9603..b1202cf81f4b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -6,9 +6,11 @@ * Address space accounting code <alan@lxorguk.ukuu.org.uk> */ +#include <linux/kernel.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> @@ -30,6 +32,11 @@ #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> +#include <linux/uprobes.h> +#include <linux/rbtree_augmented.h> +#include <linux/sched/sysctl.h> +#include <linux/notifier.h> +#include <linux/memory.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -50,12 +57,6 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -/* - * WARNING: the debugging will use recursive algorithms so never enable this - * unless you know what you are doing. - */ -#undef DEBUG_MM_RB - /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: @@ -86,7 +87,10 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. @@ -94,6 +98,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; /* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. + */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} +EXPORT_SYMBOL_GPL(vm_memory_committed); + +/* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. 
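 *
 * As a worked example of the new reserves (assuming 4K pages): the
 * admin reserve defaults to 1UL << 13 = 8192kB, which the code below
 * converts as 8192 >> (PAGE_SHIFT - 10) = 2048 pages (8MB); the user
 * reserve defaults to 1UL << 17 = 131072kB, i.e. 32768 pages (128MB),
 * and is applied as min(mm->total_vm / 32, reserve).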
@@ -111,7 +129,7 @@ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -133,7 +151,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free -= global_page_state(NR_SHMEM); - free += nr_swap_pages; + free += get_nr_swap_pages(); /* * Any slabs which are created with the @@ -152,10 +170,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -163,19 +181,20 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) goto error; } - allowed = (totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100; + allowed = vm_commit_limit(); /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - allowed -= allowed / 32; - allowed += total_swap_pages; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -192,20 +211,20 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { if (vma->vm_flags & VM_DENYWRITE) - atomic_inc(&file->f_path.dentry->d_inode->i_writecount); + atomic_inc(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) mapping->i_mmap_writable--; flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.vm_set.list); + list_del_init(&vma->shared.nonlinear); else - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* - * Unlink a file-based vm structure from its prio_tree, to hide + * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) @@ -230,11 +249,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) { + if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); return next; @@ -248,6 +264,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; unsigned long min_brk; + bool populate; down_write(&mm->mmap_sem); @@ -297,75 +314,217 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Ok, looks good - let it rip. 
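
Under OVERCOMMIT_NEVER, the hunk above computes the allowance as the commit limit, minus a fixed admin reserve for non-root callers, minus a per-process headroom capped at sysctl_user_reserve_kbytes. A hedged user-space restatement of that arithmetic (the function name and the 4K page size are assumptions):

#include <stdio.h>

#define PAGE_SHIFT 12 /* assumption: 4K pages */

/* Mirrors the OVERCOMMIT_NEVER arithmetic in __vm_enough_memory() above:
 * reserves are in kB, everything else in pages. */
static unsigned long commit_allowed(unsigned long commit_limit,
				    unsigned long total_vm,
				    unsigned long admin_reserve_kb,
				    unsigned long user_reserve_kb,
				    int cap_sys_admin)
{
	unsigned long allowed = commit_limit;
	unsigned long reserve = user_reserve_kb >> (PAGE_SHIFT - 10);
	unsigned long headroom = total_vm / 32;

	if (!cap_sys_admin)
		allowed -= admin_reserve_kb >> (PAGE_SHIFT - 10); /* kB -> pages */
	/* leave recovery room for the user, but never more than the reserve */
	allowed -= headroom < reserve ? headroom : reserve;
	return allowed;
}

int main(void)
{
	/* 1M-page limit, 100k-page process, default 8MB/128MB reserves */
	printf("%lu\n", commit_allowed(1UL << 20, 100000, 1UL << 13, 1UL << 17, 0));
	return 0;
}
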
*/ if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) goto out; + set_brk: mm->brk = brk; + populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; + up_write(&mm->mmap_sem); + if (populate) + mm_populate(oldbrk, newbrk - oldbrk); + return brk; + out: retval = mm->brk; up_write(&mm->mmap_sem); return retval; } -#ifdef DEBUG_MM_RB +static long vma_compute_subtree_gap(struct vm_area_struct *vma) +{ + unsigned long max, subtree_gap; + max = vma->vm_start; + if (vma->vm_prev) + max -= vma->vm_prev->vm_end; + if (vma->vm_rb.rb_left) { + subtree_gap = rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb)->rb_subtree_gap; + if (subtree_gap > max) + max = subtree_gap; + } + if (vma->vm_rb.rb_right) { + subtree_gap = rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb)->rb_subtree_gap; + if (subtree_gap > max) + max = subtree_gap; + } + return max; +} + +#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct rb_root *root) { - int i = 0, j; + int i = 0, j, bug = 0; struct rb_node *nd, *pn = NULL; unsigned long prev = 0, pend = 0; for (nd = rb_first(root); nd; nd = rb_next(nd)) { struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); - if (vma->vm_start < prev) - printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; - if (vma->vm_start < pend) + if (vma->vm_start < prev) { + printk("vm_start %lx prev %lx\n", vma->vm_start, prev); + bug = 1; + } + if (vma->vm_start < pend) { printk("vm_start %lx pend %lx\n", vma->vm_start, pend); - if (vma->vm_start > vma->vm_end) - printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); + bug = 1; + } + if (vma->vm_start > vma->vm_end) { + printk("vm_end %lx < vm_start %lx\n", + vma->vm_end, vma->vm_start); + bug = 1; + } + if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { + printk("free gap %lx, correct %lx\n", + vma->rb_subtree_gap, + vma_compute_subtree_gap(vma)); + bug = 1; + } i++; pn = nd; prev = vma->vm_start; pend = vma->vm_end; } j = 0; - for (nd = pn; nd; nd = rb_prev(nd)) { + for (nd = pn; nd; nd = rb_prev(nd)) j++; + if (i != j) { + printk("backwards %d, forwards %d\n", j, i); + bug = 1; + } + return bug ? 
-1 : i; +} + +static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) +{ + struct rb_node *nd; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct vm_area_struct *vma; + vma = rb_entry(nd, struct vm_area_struct, vm_rb); + BUG_ON(vma != ignore && + vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); } - if (i != j) - printk("backwards %d, forwards %d\n", j, i), i = 0; - return i; } -void validate_mm(struct mm_struct *mm) +static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - struct vm_area_struct *tmp = mm->mmap; - while (tmp) { - tmp = tmp->vm_next; + unsigned long highest_address = 0; + struct vm_area_struct *vma = mm->mmap; + while (vma) { + struct anon_vma_chain *avc; + vma_lock_anon_vma(vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + vma_unlock_anon_vma(vma); + highest_address = vma->vm_end; + vma = vma->vm_next; i++; } - if (i != mm->map_count) - printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; + if (i != mm->map_count) { + printk("map_count %d vm_next %d\n", mm->map_count, i); + bug = 1; + } + if (highest_address != mm->highest_vm_end) { + printk("mm->highest_vm_end %lx, found %lx\n", + mm->highest_vm_end, highest_address); + bug = 1; + } i = browse_rb(&mm->mm_rb); - if (i != mm->map_count) - printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; + if (i != mm->map_count) { + printk("map_count %d rb %d\n", mm->map_count, i); + bug = 1; + } BUG_ON(bug); } #else +#define validate_mm_rb(root, ignore) do { } while (0) #define validate_mm(mm) do { } while (0) #endif -static struct vm_area_struct * -find_vma_prepare(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev, struct rb_node ***rb_link, - struct rb_node ** rb_parent) +RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, + unsigned long, rb_subtree_gap, vma_compute_subtree_gap) + +/* + * Update augmented rbtree rb_subtree_gap values after vma->vm_start or + * vma->vm_prev->vm_end values changed, without modifying the vma's position + * in the rbtree. + */ +static void vma_gap_update(struct vm_area_struct *vma) { - struct vm_area_struct * vma; - struct rb_node ** __rb_link, * __rb_parent, * rb_prev; + /* + * As it turns out, RB_DECLARE_CALLBACKS() already created a callback + * function that does exactly what we want. + */ + vma_gap_callbacks_propagate(&vma->vm_rb, NULL); +} + +static inline void vma_rb_insert(struct vm_area_struct *vma, + struct rb_root *root) +{ + /* All rb_subtree_gap values must be consistent prior to insertion */ + validate_mm_rb(root, NULL); + + rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + +static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +{ + /* + * All rb_subtree_gap values must be consistent prior to erase, + * with the possible exception of the vma being erased. + */ + validate_mm_rb(root, vma); + + /* + * Note rb_erase_augmented is a fairly large inline function, + * so make sure we instantiate it only once with our desired + * augmented rbtree callbacks. + */ + rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma().
+ * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_sem and by + * the root anon_vma's mutex. + */ +static inline void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static inline void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + +static int find_vma_links(struct mm_struct *mm, unsigned long addr, + unsigned long end, struct vm_area_struct **pprev, + struct rb_node ***rb_link, struct rb_node **rb_parent) +{ + struct rb_node **__rb_link, *__rb_parent, *rb_prev; __rb_link = &mm->mm_rb.rb_node; rb_prev = __rb_parent = NULL; - vma = NULL; while (*__rb_link) { struct vm_area_struct *vma_tmp; @@ -374,9 +533,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; + /* Fail if an existing vma overlaps the area */ + if (vma_tmp->vm_start < end) + return -ENOMEM; __rb_link = &__rb_parent->rb_left; } else { rb_prev = __rb_parent; @@ -389,14 +548,59 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); *rb_link = __rb_link; *rb_parent = __rb_parent; - return vma; + return 0; +} + +static unsigned long count_vma_pages_range(struct mm_struct *mm, + unsigned long addr, unsigned long end) +{ + unsigned long nr_pages = 0; + struct vm_area_struct *vma; + + /* Find first overlapping mapping */ + vma = find_vma_intersection(mm, addr, end); + if (!vma) + return 0; + + nr_pages = (min(end, vma->vm_end) - + max(addr, vma->vm_start)) >> PAGE_SHIFT; + + /* Iterate over the rest of the overlaps */ + for (vma = vma->vm_next; vma; vma = vma->vm_next) { + unsigned long overlap_len; + + if (vma->vm_start > end) + break; + + overlap_len = min(end, vma->vm_end) - vma->vm_start; + nr_pages += overlap_len >> PAGE_SHIFT; + } + + return nr_pages; } void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, struct rb_node **rb_link, struct rb_node *rb_parent) { + /* Update tracking information for the gap following the new vma. */ + if (vma->vm_next) + vma_gap_update(vma->vm_next); + else + mm->highest_vm_end = vma->vm_end; + + /* + * vma->vm_prev wasn't known when we followed the rbtree to find the + * correct insertion point for that vma. As a result, we could not + * update the vma vm_rb parents rb_subtree_gap values on the way down. + * So, we first insert the vma with a zero rb_subtree_gap value + * (to be consistent with what we did on the way down), and then + * immediately update the gap to the correct value. Finally we + * rebalance the rbtree after all augmented values have been set.
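
The insert-then-update ordering described above is the generic pattern for augmented rbtrees: a new node enters with a neutral augmented value, then the summary is recomputed upward until it stops changing. A schematic propagate loop, assuming a toy node type with parent links rather than the kernel's RB_DECLARE_CALLBACKS() machinery:

/* Toy augmented node: subtree_gap summarizes the largest gap anywhere in
 * the subtree, so a search can skip subtrees whose summary is too small. */
struct gap_node {
	struct gap_node *parent, *left, *right;
	unsigned long gap;		/* gap to this node's predecessor */
	unsigned long subtree_gap;	/* max gap within this subtree */
};

static unsigned long recompute_gap(const struct gap_node *n)
{
	unsigned long max = n->gap;

	if (n->left && n->left->subtree_gap > max)
		max = n->left->subtree_gap;
	if (n->right && n->right->subtree_gap > max)
		max = n->right->subtree_gap;
	return max;
}

/* Walk toward the root until a stored summary is already correct;
 * everything above it is then correct too. */
static void propagate_gap(struct gap_node *n)
{
	while (n) {
		unsigned long g = recompute_gap(n);

		if (n->subtree_gap == g)
			break;
		n->subtree_gap = g;
		n = n->parent;
	}
}
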
+ */ rb_link_node(&vma->vm_rb, rb_parent, rb_link); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); + vma->rb_subtree_gap = 0; + vma_gap_update(vma); + vma_rb_insert(vma, &mm->mm_rb); } static void __vma_link_file(struct vm_area_struct *vma) @@ -408,7 +612,7 @@ static void __vma_link_file(struct vm_area_struct *vma) struct address_space *mapping = file->f_mapping; if (vma->vm_flags & VM_DENYWRITE) - atomic_dec(&file->f_path.dentry->d_inode->i_writecount); + atomic_dec(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) mapping->i_mmap_writable++; @@ -416,7 +620,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (unlikely(vma->vm_flags & VM_NONLINEAR)) vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); else - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -454,15 +658,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the prio_tree. + * mm's list and rbtree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *__vma, *prev; + struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; - __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); - BUG_ON(__vma && __vma->vm_start < vma->vm_end); + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) + BUG(); __vma_link(mm, vma, prev, rb_link, rb_parent); mm->map_count++; } @@ -471,14 +676,15 @@ static inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next; - prev->vm_next = next; + vma_rb_erase(vma, &mm->mm_rb); + prev->vm_next = next = vma->vm_next; if (next) next->vm_prev = prev; - rb_erase(&vma->vm_rb, &mm->mm_rb); - if (mm->mmap_cache == vma) - mm->mmap_cache = prev; + + /* Kill the cache */ + vmacache_invalidate(mm); } /* @@ -495,9 +701,10 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *next = vma->vm_next; struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; - struct prio_tree_root *root = NULL; + struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; + bool start_changed = false, end_changed = false; long adjust_next = 0; int remove_next = 0; @@ -546,12 +753,19 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) + if (!(vma->vm_flags & VM_NONLINEAR)) { root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); + + if (adjust_next) + uprobe_munmap(next, next->vm_start, + next->vm_end); + } + mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* - * Put into prio_tree now, so instantiated pages + * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. @@ -562,26 +776,33 @@ again: remove_next = 1 + (end > next->vm_end); vma_adjust_trans_huge(vma, start, end, adjust_next); - /* - * When changing only vma->vm_end, we don't really need anon_vma - * lock. 
This is a fairly rare case by itself, but the anon_vma - * lock may be shared between many sibling processes. Skipping - * the lock for brk adjustments makes a difference sometimes. - */ - if (vma->anon_vma && (importer || start != vma->vm_start)) { - anon_vma = vma->anon_vma; - anon_vma_lock(anon_vma); + anon_vma = vma->anon_vma; + if (!anon_vma && adjust_next) + anon_vma = next->anon_vma; + if (anon_vma) { + VM_BUG_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); + anon_vma_lock_write(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_pre_update_vma(next); } if (root) { flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, root); + vma_interval_tree_remove(vma, root); if (adjust_next) - vma_prio_tree_remove(next, root); + vma_interval_tree_remove(next, root); } - vma->vm_start = start; - vma->vm_end = end; + if (start != vma->vm_start) { + vma->vm_start = start; + start_changed = true; + } + if (end != vma->vm_end) { + vma->vm_end = end; + end_changed = true; + } vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next << PAGE_SHIFT; @@ -590,8 +811,8 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { if (adjust_next) - vma_prio_tree_insert(next, root); - vma_prio_tree_insert(vma, root); + vma_interval_tree_insert(next, root); + vma_interval_tree_insert(vma, root); flush_dcache_mmap_unlock(mapping); } @@ -610,18 +831,37 @@ again: remove_next = 1 + (end > next->vm_end); * (it may either follow vma or precede it). */ __insert_vm_struct(mm, insert); + } else { + if (start_changed) + vma_gap_update(vma); + if (end_changed) { + if (!next) + mm->highest_vm_end = end; + else if (!adjust_next) + vma_gap_update(next); + } } - if (anon_vma) - anon_vma_unlock(anon_vma); + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_post_update_vma(next); + anon_vma_unlock_write(anon_vma); + } if (mapping) mutex_unlock(&mapping->i_mmap_mutex); + if (root) { + uprobe_mmap(vma); + + if (adjust_next) + uprobe_mmap(next); + } + if (remove_next) { if (file) { + uprobe_munmap(next, next->vm_start, next->vm_end); fput(file); - if (next->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); } if (next->anon_vma) anon_vma_merge(vma, next); @@ -633,11 +873,16 @@ again: remove_next = 1 + (end > next->vm_end); * we must remove another next too. It would clutter * up the code too much to do both in one go. */ - if (remove_next == 2) { - next = vma->vm_next; + next = vma->vm_next; + if (remove_next == 2) goto again; - } + else if (next) + vma_gap_update(next); + else + mm->highest_vm_end = end; } + if (insert && file) + uprobe_mmap(insert); validate_mm(mm); @@ -651,8 +896,15 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ - if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) + /* + * VM_SOFTDIRTY should not prevent VMA merging if we + * match the flags but the dirty bit -- the caller should mark + * the merged VMA as dirty. If the dirty bit won't be excluded from + * comparison, we increase pressure on the memory system, forcing + * the kernel to generate new VMAs when the old one could be + * extended instead.
+ */ + if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) return 0; if (vma->vm_file != file) return 0; @@ -712,7 +964,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, if (is_mergeable_vma(vma, file, vm_flags) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; - vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + vm_pglen = vma_pages(vma); if (vma->vm_pgoff + vm_pglen == vm_pgoff) return 1; } @@ -841,7 +1093,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && + !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } @@ -925,14 +1177,14 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, const unsigned long stack_flags = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); + mm->total_vm += pages; + if (file) { mm->shared_vm += pages; if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) mm->exec_vm += pages; } else if (flags & stack_flags) mm->stack_vm += pages; - if (flags & (VM_RESERVED|VM_IO)) - mm->reserved_vm += pages; } #endif /* CONFIG_PROC_FS */ @@ -949,19 +1201,37 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } +static inline int mlock_future_check(struct mm_struct *mm, + unsigned long flags, + unsigned long len) +{ + unsigned long locked, lock_limit; + + /* mlock MCL_FUTURE? */ + if (flags & VM_LOCKED) { + locked = len >> PAGE_SHIFT; + locked += mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } + return 0; +} + /* * The caller must hold down_write(&current->mm->mmap_sem). */ -static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long pgoff) + unsigned long flags, unsigned long pgoff, + unsigned long *populate) { struct mm_struct * mm = current->mm; - struct inode *inode; vm_flags_t vm_flags; - int error; - unsigned long reqprot = prot; + + *populate = 0; /* * Does the application expect PROT_READ to imply PROT_EXEC? @@ -1010,20 +1280,12 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!can_do_mlock()) return -EPERM; - /* mlock MCL_FUTURE? */ - if (vm_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } - - inode = file ? file->f_path.dentry->d_inode : NULL; + if (mlock_future_check(mm, vm_flags, len)) + return -EAGAIN; if (file) { + struct inode *inode = file_inode(file); + switch (flags & MAP_TYPE) { case MAP_SHARED: if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) @@ -1039,7 +1301,7 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, /* * Make sure there are no mandatory locks on the file.
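
The is_mergeable_vma() rule above boils down to one bit test: two mappings may merge when their flags differ in no bit outside an ignore mask, which now contains just VM_SOFTDIRTY. A self-contained illustration (the flag values are made up; the kernel's differ):

#include <assert.h>

#define VM_READ      0x1UL
#define VM_WRITE     0x2UL
#define VM_SOFTDIRTY 0x8UL /* illustrative values, not the kernel's */

/* Merge candidates must match on every flag except the ignored ones. */
static int flags_mergeable(unsigned long a, unsigned long b,
			   unsigned long ignore)
{
	return ((a ^ b) & ~ignore) == 0;
}

int main(void)
{
	/* differing only in the soft-dirty bit still merges */
	assert(flags_mergeable(VM_READ | VM_SOFTDIRTY, VM_READ, VM_SOFTDIRTY));
	/* differing in a real protection bit does not */
	assert(!flags_mergeable(VM_READ | VM_WRITE, VM_READ, VM_SOFTDIRTY));
	return 0;
}
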
*/ - if (locks_verify_locked(inode)) + if (locks_verify_locked(file)) return -EAGAIN; vm_flags |= VM_SHARED | VM_MAYSHARE; @@ -1056,8 +1318,10 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, vm_flags &= ~VM_MAYEXEC; } - if (!file->f_op || !file->f_op->mmap) + if (!file->f_op->mmap) return -ENODEV; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + return -EINVAL; break; default: @@ -1066,6 +1330,8 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } else { switch (flags & MAP_TYPE) { case MAP_SHARED: + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + return -EINVAL; /* * Ignore pgoff. */ @@ -1083,38 +1349,27 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } } - error = security_file_mmap(file, reqprot, prot, flags, addr, 0); - if (error) - return error; - - return mmap_region(file, addr, len, flags, vm_flags, pgoff); -} - -unsigned long do_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - if (unlikely(offset + PAGE_ALIGN(len) < offset)) - return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) - return -EINVAL; - return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -} -EXPORT_SYMBOL(do_mmap); + /* + * Set 'VM_NORESERVE' if we should not account for the + * memory use of this mapping. + */ + if (flags & MAP_NORESERVE) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; -unsigned long vm_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - unsigned long ret; - struct mm_struct *mm = current->mm; + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; + } - down_write(&mm->mmap_sem); - ret = do_mmap(file, addr, len, prot, flag, offset); - up_write(&mm->mmap_sem); - return ret; + addr = mmap_region(file, addr, len, vm_flags, pgoff); + if (!IS_ERR_VALUE(addr) && + ((vm_flags & VM_LOCKED) || + (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) + *populate = len; + return addr; } -EXPORT_SYMBOL(vm_mmap); SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, @@ -1125,32 +1380,41 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); - if (unlikely(flags & MAP_HUGETLB)) - return -EINVAL; file = fget(fd); if (!file) goto out; + if (is_file_hugepages(file)) + len = ALIGN(len, huge_page_size(hstate_file(file))); + retval = -EINVAL; + if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) + goto out_fput; } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; + struct hstate *hs; + + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); + if (!hs) + return -EINVAL; + + len = ALIGN(len, huge_page_size(hs)); /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called * A dummy user value is used because we are not locking * memory so no accounting is necessary */ - file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, - VM_NORESERVE, &user, - HUGETLB_ANONHUGE_INODE); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, + VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE, + (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); } flags &= ~(MAP_EXECUTABLE | 
MAP_DENYWRITE); - down_write(&current->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&current->mm->mmap_sem); - + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); +out_fput: if (file) fput(file); out: @@ -1205,7 +1469,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) return 0; /* Specialty mapping? */ - if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) + if (vm_flags & VM_PFNMAP) return 0; /* Can the mapping track the dirty pages? */ @@ -1230,45 +1494,40 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) } unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff) + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - int correct_wcount = 0; int error; struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; - struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; + + /* Check against address space limit. */ + if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { + unsigned long nr_pages; + + /* + * MAP_FIXED may remove pages of mappings that intersects with + * requested mapping. Account for the pages it would unmap. + */ + if (!(vm_flags & MAP_FIXED)) + return -ENOMEM; + + nr_pages = count_vma_pages_range(mm, addr, addr + len); + + if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) + return -ENOMEM; + } /* Clear old maps */ error = -ENOMEM; munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; } - /* Check against address space limit. */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) - return -ENOMEM; - - /* - * Set 'VM_NORESERVE' if we should not account for the - * memory use of this mapping. - */ - if ((flags & MAP_NORESERVE)) { - /* We honor MAP_NORESERVE if allowed to overcommit */ - if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) - vm_flags |= VM_NORESERVE; - - /* hugetlb applies strict overcommit unless MAP_NORESERVE */ - if (file && is_file_hugepages(file)) - vm_flags |= VM_NORESERVE; - } - /* * Private writable mapping: check memory availability */ @@ -1305,36 +1564,29 @@ munmap_back: vma->vm_pgoff = pgoff; INIT_LIST_HEAD(&vma->anon_vma_chain); - error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ - if (file) { - if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) - goto free_vma; if (vm_flags & VM_DENYWRITE) { error = deny_write_access(file); if (error) goto free_vma; - correct_wcount = 1; } - vma->vm_file = file; - get_file(file); + vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; - if (vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); /* Can addr have changed?? * * Answer: Yes, several device drivers can do it in their * f_op->mmap method.
-DaveM + * Bug: If addr is changed, prev, rb_link, rb_parent should + * be updated for vma_link() */ + WARN_ON_ONCE(addr != vma->vm_start); + addr = vma->vm_start; - pgoff = vma->vm_pgoff; vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { - if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) - goto free_vma; error = shmem_zero_setup(vma); if (error) goto free_vma; @@ -1356,26 +1608,39 @@ munmap_back: } vma_link(mm, vma, prev, rb_link, rb_parent); - file = vma->vm_file; - /* Once vma denies write, undo our temporary denial count */ - if (correct_wcount) - atomic_inc(&inode->i_writecount); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + file = vma->vm_file; out: perf_event_mmap(vma); - mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - if (!mlock_vma_pages_range(vma, addr, addr + len)) + if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm))) mm->locked_vm += (len >> PAGE_SHIFT); - } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) - make_pages_present(addr, addr + len); + else + vma->vm_flags &= ~VM_LOCKED; + } + + if (file) + uprobe_mmap(vma); + + /* + * New (or expanded) vma always get soft dirty status. + * Otherwise user-space soft-dirty page tracker won't + * be able to distinguish situation when vma area unmapped, + * then new mapped in-place (which must be aimed as + * a completely new data area). + */ + vma->vm_flags |= VM_SOFTDIRTY; + return addr; unmap_and_free_vma: - if (correct_wcount) - atomic_inc(&inode->i_writecount); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); vma->vm_file = NULL; fput(file); @@ -1390,6 +1655,206 @@ unacct_error: return error; } +unsigned long unmapped_area(struct vm_unmapped_area_info *info) +{ + /* + * We implement the search by looking for an rbtree node that + * immediately follows a suitable gap. That is, + * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; + * - gap_end = vma->vm_start >= info->low_limit + length; + * - gap_end - gap_start >= length + */ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long length, low_limit, high_limit, gap_start, gap_end; + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask; + if (length < info->length) + return -ENOMEM; + + /* Adjust search limits by the desired length */ + if (info->high_limit < length) + return -ENOMEM; + high_limit = info->high_limit - length; + + if (info->low_limit > high_limit) + return -ENOMEM; + low_limit = info->low_limit + length; + + /* Check if rbtree root looks promising */ + if (RB_EMPTY_ROOT(&mm->mm_rb)) + goto check_highest; + vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); + if (vma->rb_subtree_gap < length) + goto check_highest; + + while (true) { + /* Visit left subtree if it looks promising */ + gap_end = vma->vm_start; + if (gap_end >= low_limit && vma->vm_rb.rb_left) { + struct vm_area_struct *left = + rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb); + if (left->rb_subtree_gap >= length) { + vma = left; + continue; + } + } + + gap_start = vma->vm_prev ? 
vma->vm_prev->vm_end : 0; +check_current: + /* Check if current node has a suitable gap */ + if (gap_start > high_limit) + return -ENOMEM; + if (gap_end >= low_limit && gap_end - gap_start >= length) + goto found; + + /* Visit right subtree if it looks promising */ + if (vma->vm_rb.rb_right) { + struct vm_area_struct *right = + rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb); + if (right->rb_subtree_gap >= length) { + vma = right; + continue; + } + } + + /* Go back up the rbtree to find next candidate node */ + while (true) { + struct rb_node *prev = &vma->vm_rb; + if (!rb_parent(prev)) + goto check_highest; + vma = rb_entry(rb_parent(prev), + struct vm_area_struct, vm_rb); + if (prev == vma->vm_rb.rb_left) { + gap_start = vma->vm_prev->vm_end; + gap_end = vma->vm_start; + goto check_current; + } + } + } + +check_highest: + /* Check highest gap, which does not precede any rbtree node */ + gap_start = mm->highest_vm_end; + gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ + if (gap_start > high_limit) + return -ENOMEM; + +found: + /* We found a suitable gap. Clip it with the original low_limit. */ + if (gap_start < info->low_limit) + gap_start = info->low_limit; + + /* Adjust gap address to the desired alignment */ + gap_start += (info->align_offset - gap_start) & info->align_mask; + + VM_BUG_ON(gap_start + info->length > info->high_limit); + VM_BUG_ON(gap_start + info->length > gap_end); + return gap_start; +} + +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long length, low_limit, high_limit, gap_start, gap_end; + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask; + if (length < info->length) + return -ENOMEM; + + /* + * Adjust search limits by the desired length. + * See implementation comment at top of unmapped_area(). + */ + gap_end = info->high_limit; + if (gap_end < length) + return -ENOMEM; + high_limit = gap_end - length; + + if (info->low_limit > high_limit) + return -ENOMEM; + low_limit = info->low_limit + length; + + /* Check highest gap, which does not precede any rbtree node */ + gap_start = mm->highest_vm_end; + if (gap_start <= high_limit) + goto found_highest; + + /* Check if rbtree root looks promising */ + if (RB_EMPTY_ROOT(&mm->mm_rb)) + return -ENOMEM; + vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); + if (vma->rb_subtree_gap < length) + return -ENOMEM; + + while (true) { + /* Visit right subtree if it looks promising */ + gap_start = vma->vm_prev ? 
vma->vm_prev->vm_end : 0; + if (gap_start <= high_limit && vma->vm_rb.rb_right) { + struct vm_area_struct *right = + rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb); + if (right->rb_subtree_gap >= length) { + vma = right; + continue; + } + } + +check_current: + /* Check if current node has a suitable gap */ + gap_end = vma->vm_start; + if (gap_end < low_limit) + return -ENOMEM; + if (gap_start <= high_limit && gap_end - gap_start >= length) + goto found; + + /* Visit left subtree if it looks promising */ + if (vma->vm_rb.rb_left) { + struct vm_area_struct *left = + rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb); + if (left->rb_subtree_gap >= length) { + vma = left; + continue; + } + } + + /* Go back up the rbtree to find next candidate node */ + while (true) { + struct rb_node *prev = &vma->vm_rb; + if (!rb_parent(prev)) + return -ENOMEM; + vma = rb_entry(rb_parent(prev), + struct vm_area_struct, vm_rb); + if (prev == vma->vm_rb.rb_right) { + gap_start = vma->vm_prev ? + vma->vm_prev->vm_end : 0; + goto check_current; + } + } + } + +found: + /* We found a suitable gap. Clip it with the original high_limit. */ + if (gap_end > info->high_limit) + gap_end = info->high_limit; + +found_highest: + /* Compute highest gap address at the desired alignment */ + gap_end -= info->length; + gap_end -= (gap_end - info->align_offset) & info->align_mask; + + VM_BUG_ON(gap_end < info->low_limit); + VM_BUG_ON(gap_end < gap_start); + return gap_end; +} + /* Get an address range which is currently unmapped. * For shmat() with addr=0. * @@ -1408,9 +1873,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long start_addr; + struct vm_unmapped_area_info info; - if (len > TASK_SIZE) + if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -1419,56 +1884,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) return addr; } - if (len > mm->cached_hole_size) { - start_addr = addr = mm->free_area_cache; - } else { - start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } -full_search: - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != TASK_UNMAPPED_BASE) { - addr = TASK_UNMAPPED_BASE; - start_addr = addr; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = vma->vm_end; - } + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = 0; + return vm_unmapped_area(&info); } #endif -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the lowest possible address? 
- */ - if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) - mm->free_area_cache = addr; -} - /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): @@ -1481,10 +1910,11 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0, start_addr; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; /* requested length too big for entire address space */ - if (len > TASK_SIZE) + if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -1494,58 +1924,17 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) return addr; } - /* check if free_area_cache is useful for us */ - if (len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = mm->mmap_base; - } - -try_again: - /* either no address requested or can't fit in requested address hole */ - start_addr = addr = mm->free_area_cache; - - if (addr < len) - goto fail; - - addr -= len; - do { - /* - * Lookup failure means no vma is above this address, - * else if new region fits below vma->vm_start, - * return with success: - */ - vma = find_vma(mm, addr); - if (!vma || addr+len <= vma->vm_start) - /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr); - - /* remember the largest hole we saw so far */ - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = vma->vm_start-len; - } while (len < vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - * - * Note: this is different with the case of bottomup - * which does the fully line-search, but we use find_vma - * here that causes some holes skipped. - */ - if (start_addr != mm->mmap_base) { - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = 0; - goto try_again; - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = mm->mmap_base; + info.align_mask = 0; + addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, @@ -1553,32 +1942,18 @@ fail: * can happen with large stack limits and large mmap() * allocations. */ - mm->cached_hole_size = ~0UL; - mm->free_area_cache = TASK_UNMAPPED_BASE; - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); - /* - * Restore the topdown base: - */ - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } return addr; } #endif -void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the highest possible address? 
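
Both allocators now funnel into vm_unmapped_area(), whose contract is: return the start of a gap of at least info->length lying inside [low_limit, high_limit], searching bottom-up or top-down. A brute-force reference over a sorted mapping list makes that contract concrete; the kernel reaches the same answer in O(log n) by pruning on rb_subtree_gap (sketch only, ignoring alignment and overflow):

/* Brute-force bottom-up gap search over a sorted, non-overlapping list
 * of [start, end) mappings; returns the gap start, or 0 on failure.
 * Reference semantics only -- the kernel walks the augmented rbtree. */
struct span { unsigned long start, end; };

static unsigned long find_gap(const struct span *v, int n,
			      unsigned long length,
			      unsigned long low, unsigned long high)
{
	unsigned long gap_start = low;
	int i;

	for (i = 0; i <= n; i++) {
		/* gap ends at the next mapping, or at the high limit */
		unsigned long gap_end = (i < n) ? v[i].start : high;

		if (gap_end >= gap_start + length)
			return gap_start;
		if (i < n && v[i].end > gap_start)
			gap_start = v[i].end; /* skip past this mapping */
	}
	return 0;
}

For mappings [0x1000,0x2000) and [0x8000,0x9000) with length 0x3000 and a low limit of 0x1000, the first gap that fits starts at 0x2000.
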
- */ - if (addr > mm->free_area_cache) - mm->free_area_cache = addr; - - /* dont allow allocations above current base */ - if (mm->free_area_cache > mm->mmap_base) - mm->free_area_cache = mm->mmap_base; -} - unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -1595,7 +1970,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return -ENOMEM; get_area = current->mm->get_unmapped_area; - if (file && file->f_op && file->f_op->get_unmapped_area) + if (file && file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; addr = get_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) @@ -1606,7 +1981,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (addr & ~PAGE_MASK) return -EINVAL; - return arch_rebalance_pgtables(addr, len); + addr = arch_rebalance_pgtables(addr, len); + error = security_mmap_addr(addr); + return error ? error : addr; } EXPORT_SYMBOL(get_unmapped_area); @@ -1614,36 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = NULL; + struct rb_node *rb_node; + struct vm_area_struct *vma; - if (mm) { - /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) */ - vma = mm->mmap_cache; - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node * rb_node; - - rb_node = mm->mm_rb.rb_node; - vma = NULL; - - while (rb_node) { - struct vm_area_struct * vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; - } + /* Check the cache first. */ + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct *tmp; + + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (tmp->vm_end > addr) { + vma = tmp; + if (tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; } + + if (vma) + vmacache_update(addr, vma); return vma; } @@ -1716,7 +2090,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns return -ENOMEM; /* Ok, everything looks good - let it rip */ - mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); @@ -1768,13 +2141,34 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + /* + * vma_gap_update() doesn't support concurrent + * updates, but we only hold a shared mmap_sem + * lock here, so we need to protect against + * concurrent vma expansions. + * vma_lock_anon_vma() doesn't help here, as + * we don't guarantee that all growable vmas + * in a mm share the same root anon vma. + * So, we reuse mm->page_table_lock to guard + * against concurrent vma expansions. 
+ */ + spin_lock(&vma->vm_mm->page_table_lock); + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; + anon_vma_interval_tree_post_update_vma(vma); + if (vma->vm_next) + vma_gap_update(vma->vm_next); + else + vma->vm_mm->highest_vm_end = address; + spin_unlock(&vma->vm_mm->page_table_lock); + perf_event_mmap(vma); } } } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1795,7 +2189,7 @@ int expand_downwards(struct vm_area_struct *vma, return -ENOMEM; address &= PAGE_MASK; - error = security_file_mmap(NULL, 0, 0, 0, address, 1); + error = security_mmap_addr(address); if (error) return error; @@ -1818,20 +2212,57 @@ int expand_downwards(struct vm_area_struct *vma, if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { + /* + * vma_gap_update() doesn't support concurrent + * updates, but we only hold a shared mmap_sem + * lock here, so we need to protect against + * concurrent vma expansions. + * vma_lock_anon_vma() doesn't help here, as + * we don't guarantee that all growable vmas + * in a mm share the same root anon vma. + * So, we reuse mm->page_table_lock to guard + * against concurrent vma expansions. + */ + spin_lock(&vma->vm_mm->page_table_lock); + anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + anon_vma_interval_tree_post_update_vma(vma); + vma_gap_update(vma); + spin_unlock(&vma->vm_mm->page_table_lock); + perf_event_mmap(vma); } } } vma_unlock_anon_vma(vma); khugepaged_enter_vma_merge(vma); + validate_mm(vma->vm_mm); return error; } +/* + * Note how expand_stack() refuses to expand the stack all the way to + * abut the next virtual mapping, *unless* that mapping itself is also + * a stack mapping. We want to leave room for a guard page, after all + * (the guard page itself is not added here, that is done by the + * actual page faulting logic) + * + * This matches the behavior of the guard page logic (see mm/memory.c: + * check_stack_guard_page()), which only allows the guard page to be + * removed under these circumstances. 
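
Distilled, the expand_stack() checks below refuse to grow a stack flush against a neighbouring mapping unless that neighbour grows the same way, preserving room for the guard page. A sketch of the upward-growth test (PAGE_SIZE and the flag value are stand-ins):

#define PAGE_SIZE  4096UL /* assumption: 4K pages */
#define VM_GROWSUP 0x1UL  /* illustrative flag value */

struct area { unsigned long start, end, flags; };

/* Growing a stack upward to 'addr' must leave room for a guard page
 * before the next mapping, unless that mapping is itself an
 * upward-growing stack. */
static int may_grow_up(const struct area *next, unsigned long addr)
{
	addr &= ~(PAGE_SIZE - 1);
	if (next && next->start == addr + PAGE_SIZE &&
	    !(next->flags & VM_GROWSUP))
		return 0; /* would abut the neighbour: refuse */
	return 1;
}
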
+ */ #ifdef CONFIG_STACK_GROWSUP int expand_stack(struct vm_area_struct *vma, unsigned long address) { + struct vm_area_struct *next; + + address &= PAGE_MASK; + next = vma->vm_next; + if (next && next->vm_start == address + PAGE_SIZE) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + } return expand_upwards(vma, address); } @@ -1846,14 +2277,21 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!prev || expand_stack(prev, addr)) return NULL; - if (prev->vm_flags & VM_LOCKED) { - mlock_vma_pages_range(prev, addr, prev->vm_end); - } + if (prev->vm_flags & VM_LOCKED) + __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); return prev; } #else int expand_stack(struct vm_area_struct *vma, unsigned long address) { + struct vm_area_struct *prev; + + address &= PAGE_MASK; + prev = vma->vm_prev; + if (prev && prev->vm_end == address) { + if (!(prev->vm_flags & VM_GROWSDOWN)) + return -ENOMEM; + } return expand_downwards(vma, address); } @@ -1874,9 +2312,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_pages_range(vma, addr, start); - } + if (vma->vm_flags & VM_LOCKED) + __mlock_vma_pages_range(vma, addr, start, NULL); return vma; } #endif @@ -1889,15 +2326,19 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) */ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { + unsigned long nr_accounted = 0; + /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); do { long nrpages = vma_pages(vma); - mm->total_vm -= nrpages; + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); } while (vma); + vm_unacct_memory(nr_accounted); validate_mm(mm); } @@ -1912,15 +2353,13 @@ static void unmap_region(struct mm_struct *mm, { struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; struct mmu_gather tlb; - unsigned long nr_accounted = 0; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); - vm_unacct_memory(nr_accounted); + unmap_vmas(&tlb, vma, start, end); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, - next ? next->vm_start : 0); + next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, start, end); } @@ -1934,26 +2373,25 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; - unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; do { - rb_erase(&vma->vm_rb, &mm->mm_rb); + vma_rb_erase(vma, &mm->mm_rb); mm->map_count--; tail_vma = vma; vma = vma->vm_next; } while (vma && vma->vm_start < end); *insertion_point = vma; - if (vma) + if (vma) { vma->vm_prev = prev; + vma_gap_update(vma); + } else + mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - if (mm->unmap_area == arch_unmap_area) - addr = prev ? prev->vm_end : mm->mmap_base; - else - addr = vma ? vma->vm_start : mm->mmap_base; - mm->unmap_area(mm, addr); - mm->mmap_cache = NULL; /* Kill the cache. 
*/ + + /* Kill the cache */ + vmacache_invalidate(mm); } /* @@ -1963,7 +2401,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) { - struct mempolicy *pol; struct vm_area_struct *new; int err = -ENOMEM; @@ -1987,21 +2424,15 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) { - err = PTR_ERR(pol); + err = vma_dup_policy(vma, new); + if (err) goto out_free_vma; - } - vma_set_policy(new, pol); if (anon_vma_clone(new, vma)) goto out_free_mpol; - if (new->vm_file) { + if (new->vm_file) get_file(new->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); @@ -2019,14 +2450,11 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, /* Clean everything up if vma_adjust failed. */ if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); - if (new->vm_file) { - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); + if (new->vm_file) fput(new->vm_file); - } unlink_anon_vmas(new); out_free_mpol: - mpol_put(pol); + mpol_put(vma_policy(new)); out_free_vma: kmem_cache_free(vm_area_cachep, new); out_err: @@ -2132,7 +2560,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return 0; } -EXPORT_SYMBOL(do_munmap); int vm_munmap(unsigned long start, size_t len) { @@ -2180,28 +2607,15 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) if (!len) return addr; - error = security_file_mmap(NULL, 0, 0, 0, addr, 1); - if (error) - return error; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (error & ~PAGE_MASK) return error; - /* - * mlock MCL_FUTURE? - */ - if (mm->def_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = len >> PAGE_SHIFT; - locked += mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } + error = mlock_future_check(mm, mm->def_flags, len); + if (error) + return error; /* * mm->mmap_sem is required to protect against another thread @@ -2213,8 +2627,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) * Clear old maps. 
this also does some error checking for us */ munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) return -ENOMEM; goto munmap_back; @@ -2256,10 +2669,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) { - if (!mlock_vma_pages_range(vma, addr, addr + len)) - mm->locked_vm += (len >> PAGE_SHIFT); - } + if (flags & VM_LOCKED) + mm->locked_vm += (len >> PAGE_SHIFT); + vma->vm_flags |= VM_SOFTDIRTY; return addr; } @@ -2267,10 +2679,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; unsigned long ret; + bool populate; down_write(&mm->mmap_sem); ret = do_brk(addr, len); + populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); + if (populate) + mm_populate(addr, len); return ret; } EXPORT_SYMBOL(vm_brk); @@ -2302,33 +2718,37 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb_gather_mmu(&tlb, mm, 1); + tlb_gather_mmu(&tlb, mm, 0, -1); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); - vm_unacct_memory(nr_accounted); + unmap_vmas(&tlb, vma, 0, -1); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); /* * Walk the list again, actually closing and freeing it, * with preemption enabled, without holding any MM locks. */ - while (vma) + while (vma) { + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += vma_pages(vma); vma = remove_vma(vma); + } + vm_unacct_memory(nr_accounted); - BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + WARN_ON(atomic_long_read(&mm->nr_ptes) > + (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_mutex is taken here. */ -int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; + struct vm_area_struct *prev; + struct rb_node **rb_link, *rb_parent; /* * The vm_pgoff of a purely anonymous vma should be irrelevant @@ -2346,12 +2766,13 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); - if (__vma && __vma->vm_start < vma->vm_end) + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) return -ENOMEM; + vma_link(mm, vma, prev, rb_link, rb_parent); return 0; } @@ -2361,14 +2782,14 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) * prior to moving page table entries, to effect an mremap move. 
*/ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - unsigned long addr, unsigned long len, pgoff_t pgoff) + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; - struct mempolicy *pol; bool faulted_in_anon_vma = true; /* @@ -2380,7 +2801,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) + return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); if (new_vma) { @@ -2402,37 +2824,33 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, * linear if there are no pages mapped yet. */ VM_BUG_ON(faulted_in_anon_vma); - *vmap = new_vma; - } else - anon_vma_moveto_tail(new_vma); + *vmap = vma = new_vma; + } + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { *new_vma = *vma; - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; + if (vma_dup_policy(vma, new_vma)) goto out_free_vma; INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; - vma_set_policy(new_vma, pol); - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; - if (new_vma->vm_file) { + if (new_vma->vm_file) get_file(new_vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - added_exe_file_vma(mm); - } if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; } } return new_vma; out_free_mempol: - mpol_put(pol); + mpol_put(vma_policy(new_vma)); out_free_vma: kmem_cache_free(vm_area_cachep, new_vma); return NULL; @@ -2503,7 +2921,7 @@ static const struct vm_operations_struct special_mapping_vmops = { * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. 
*/ -int install_special_mapping(struct mm_struct *mm, +struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { @@ -2512,23 +2930,19 @@ int install_special_mapping(struct mm_struct *mm, vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (unlikely(vma == NULL)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; + vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = &special_mapping_vmops; vma->vm_private_data = pages; - ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); - if (ret) - goto out; - ret = insert_vm_struct(mm, vma); if (ret) goto out; @@ -2537,34 +2951,46 @@ int install_special_mapping(struct mm_struct *mm, perf_event_mmap(vma); - return 0; + return vma; out: kmem_cache_free(vm_area_cachep, vma); - return ret; + return ERR_PTR(ret); +} + +int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, struct page **pages) +{ + struct vm_area_struct *vma = _install_special_mapping(mm, + addr, len, vm_flags, pages); + + if (IS_ERR(vma)) + return PTR_ERR(vma); + return 0; } static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); /* * We can safely modify head.next after taking the - * anon_vma->root->mutex. If some other vma in this mm shares + * anon_vma->root->rwsem. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->root->mutex. + * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); } } @@ -2605,12 +3031,12 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * - * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * * We can take all the locks in random order because the VM code - * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never + * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never * takes more than one of them in a row. Secondly we're protected * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. * @@ -2652,23 +3078,23 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. 
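The install_special_mapping() split above is the usual ERR_PTR convention: the core helper returns either a valid pointer or an errno encoded into the pointer, and a thin wrapper translates back to an int for existing callers. A minimal sketch of the idiom, with hypothetical names:

	#include <linux/err.h>
	#include <linux/slab.h>

	struct foo { int x; };				/* hypothetical object */

	static struct foo *foo_create(void)
	{
		struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

		if (!f)
			return ERR_PTR(-ENOMEM);	/* encode errno as a pointer */
		return f;
	}

	static int foo_create_checked(struct foo **out)
	{
		struct foo *f = foo_create();

		if (IS_ERR(f))
			return PTR_ERR(f);		/* decode for int callers */
		*out = f;
		return 0;
	}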
* * We must however clear the bitflag before unlocking - * the vma so the users using the anon_vma->head will + * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->root->mutex. + * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->head.next)) + &anon_vma->root->rb_root.rb_node)) BUG(); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); } } @@ -2719,3 +3145,115 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0); VM_BUG_ON(ret); } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +subsys_initcall(init_user_reserve); + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +subsys_initcall(init_admin_reserve); + +/* + * Reinititalise user and admin reserves if memory is added or removed. + * + * The default user reserve max is 128MB, and the default max for the + * admin reserve is 8MB. These are usually, but not always, enough to + * enable recovery from a memory hogging process using login/sshd, a shell, + * and tools like top. It may make sense to increase or even disable the + * reserve depending on the existence of swap or variations in the recovery + * tools. So, the admin may have changed them. + * + * If memory is added and the reserves have been eliminated or increased above + * the default max, then we'll trust the admin. + * + * If memory is removed and there isn't enough free memory, then we + * need to reset the reserves. + * + * Otherwise keep the reserve set by the admin. + */ +static int reserve_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + unsigned long tmp, free_kbytes; + + switch (action) { + case MEM_ONLINE: + /* Default max is 128MB. Leave alone if modified by operator. */ + tmp = sysctl_user_reserve_kbytes; + if (0 < tmp && tmp < (1UL << 17)) + init_user_reserve(); + + /* Default max is 8MB. Leave alone if modified by operator. 
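The reserve formulas above compute free_kbytes / 32, i.e. 3.125% of free memory (the comments round this to 3%), then cap the result at 1UL << 17 KB (128MB) for users and 1UL << 13 KB (8MB) for admins. A worked example as a standalone userspace program, assuming a hypothetical machine with 4GB free at boot:

	#include <stdio.h>

	int main(void)
	{
		unsigned long free_kbytes = 4UL << 20;	/* 4GB free = 4194304 KB */
		unsigned long user = free_kbytes / 32;	/* 3.125% = 131072 KB */
		unsigned long admin = free_kbytes / 32;

		if (user > (1UL << 17))
			user = 1UL << 17;		/* cap at 128MB */
		if (admin > (1UL << 13))
			admin = 1UL << 13;		/* cap at 8MB */

		/* prints: user=131072 KB admin=8192 KB */
		printf("user=%lu KB admin=%lu KB\n", user, admin);
		return 0;
	}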
*/ + tmp = sysctl_admin_reserve_kbytes; + if (0 < tmp && tmp < (1UL << 13)) + init_admin_reserve(); + + break; + case MEM_OFFLINE: + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + if (sysctl_user_reserve_kbytes > free_kbytes) { + init_user_reserve(); + pr_info("vm.user_reserve_kbytes reset to %lu\n", + sysctl_user_reserve_kbytes); + } + + if (sysctl_admin_reserve_kbytes > free_kbytes) { + init_admin_reserve(); + pr_info("vm.admin_reserve_kbytes reset to %lu\n", + sysctl_admin_reserve_kbytes); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block reserve_mem_nb = { + .notifier_call = reserve_mem_notifier, +}; + +static int __meminit init_reserve_notifier(void) +{ + if (register_hotmemory_notifier(&reserve_mem_nb)) + printk("Failed registering memory add/remove notifier for admin reserve"); + + return 0; +} +subsys_initcall(init_reserve_notifier); diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 3dcfaf4ed355..f802c2d216a7 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -14,9 +14,6 @@ * use_mm * Makes the calling kernel thread take on the specified * mm context. - * Called by the retry thread execute retries within the - * iocb issuer's mm context, so that copy_from/to_user - * operations work seamlessly for aio. * (Note: this routine is intended to be called only * from a kernel thread context) */ @@ -34,6 +31,9 @@ void use_mm(struct mm_struct *mm) tsk->mm = mm; switch_mm(active_mm, mm, tsk); task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif if (active_mm != mm) mmdrop(active_mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9a611d3a1848..41cefdf0aadd 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -14,10 +14,14 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/srcu.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/slab.h> +/* global SRCU for all MMs */ +static struct srcu_struct srcu; + /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -25,14 +29,31 @@ * in parallel despite there being no task using this mm any more, * through the vmas outside of the exit_mmap context, such as with * vmtruncate. This serializes against mmu_notifier_unregister with - * the mmu_notifier_mm->lock in addition to RCU and it serializes - * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * the mmu_notifier_mm->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_mm * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; + int id; + + /* + * SRCU here will block mmu_notifier_unregister until + * ->release returns. + */ + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) + /* + * If ->release runs before mmu_notifier_unregister it must be + * handled, as it's the only way for the driver to flush all + * existing sptes and stop the driver from establishing any more + * sptes before all the pages in the mm are freed. 
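The conversion from RCU to SRCU in this file exists because ->release() callbacks may sleep, which a plain rcu_read_lock() section must not do; SRCU read sections may block, and synchronize_srcu() waits them out. A minimal sketch of the pairing, with a hypothetical srcu_struct:

	#include <linux/srcu.h>
	#include <linux/rculist.h>

	static struct srcu_struct demo_srcu;	/* init_srcu_struct() at boot */

	static void demo_reader(void)
	{
		int idx = srcu_read_lock(&demo_srcu);
		/* unlike an RCU read section, callbacks invoked here may sleep */
		srcu_read_unlock(&demo_srcu, idx);
	}

	static void demo_remove(struct hlist_node *node)
	{
		hlist_del_init_rcu(node);	/* unpublish the element */
		synchronize_srcu(&demo_srcu);	/* wait for all readers to leave */
	}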
+ */ + if (mn->ops->release) + mn->ops->release(mn, mm); + srcu_read_unlock(&srcu, id); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -41,41 +62,24 @@ void __mmu_notifier_release(struct mm_struct *mm) hlist); /* * We arrived before mmu_notifier_unregister so - * mmu_notifier_unregister will do nothing other than - * to wait ->release to finish and - * mmu_notifier_unregister to return. + * mmu_notifier_unregister will do nothing other than to wait + * for ->release to finish and for mmu_notifier_unregister to + * return. */ hlist_del_init_rcu(&mn->hlist); - /* - * RCU here will block mmu_notifier_unregister until - * ->release returns. - */ - rcu_read_lock(); - spin_unlock(&mm->mmu_notifier_mm->lock); - /* - * if ->release runs before mmu_notifier_unregister it - * must be handled as it's the only way for the driver - * to flush all existing sptes and stop the driver - * from establishing any more sptes before all the - * pages in the mm are freed. - */ - if (mn->ops->release) - mn->ops->release(mn, mm); - rcu_read_unlock(); - spin_lock(&mm->mmu_notifier_mm->lock); } spin_unlock(&mm->mmu_notifier_mm->lock); /* - * synchronize_rcu here prevents mmu_notifier_release to - * return to exit_mmap (which would proceed freeing all pages - * in the mm) until the ->release method returns, if it was - * invoked by mmu_notifier_unregister. + * synchronize_srcu here prevents mmu_notifier_release from returning to + * exit_mmap (which would proceed with freeing all pages in the mm) + * until the ->release method returns, if it was invoked by + * mmu_notifier_unregister. * - * The mmu_notifier_mm can't go away from under us because one - * mm_count is hold by exit_mmap. + * The mmu_notifier_mm can't go away from under us because one mm_count + * is held by exit_mmap. */ - synchronize_rcu(); + synchronize_srcu(&srcu); } /* @@ -87,15 +91,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address) { struct mmu_notifier *mn; - struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) young |= mn->ops->clear_flush_young(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -104,18 +107,17 @@ int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { struct mmu_notifier *mn; - struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->test_young) { young = mn->ops->test_young(mn, mm, address); if (young) break; } } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -124,63 +126,59 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { struct mmu_notifier *mn; - struct hlist_node *n; + int id; - rcu_read_lock(); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->change_pte) mn->ops->change_pte(mn, mm, address, pte); - /* - * Some drivers don't have change_pte, - * so we must call invalidate_page in that case. 
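Independently of the SRCU change, the loops here lose the separate struct hlist_node cursor: around this kernel release, hlist_for_each_entry_rcu() was reduced from four arguments to three, with the iterator derived from the entry itself. A sketch with a hypothetical element type:

	#include <linux/rculist.h>

	struct demo_item {
		struct hlist_node hlist;
		int val;
	};

	static int demo_sum(struct hlist_head *head)
	{
		struct demo_item *it;
		int sum = 0;

		/* older form: struct hlist_node *n;
		 *             hlist_for_each_entry_rcu(it, n, head, hlist) */
		hlist_for_each_entry_rcu(it, head, hlist)
			sum += it->val;
		return sum;
	}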
- */ - else if (mn->ops->invalidate_page) - mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address) { struct mmu_notifier *mn; - struct hlist_node *n; + int id; - rcu_read_lock(); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { struct mmu_notifier *mn; - struct hlist_node *n; + int id; - rcu_read_lock(); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) mn->ops->invalidate_range_start(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } +EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { struct mmu_notifier *mn; - struct hlist_node *n; + int id; - rcu_read_lock(); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } +EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end); static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, @@ -191,6 +189,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); + /* + * Verify that mmu_notifier_init() already run and the global srcu is + * initialized. + */ + BUG_ON(!srcu.per_cpu_ref); + ret = -ENOMEM; mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (unlikely(!mmu_notifier_mm)) @@ -200,11 +204,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); if (unlikely(ret)) - goto out_cleanup; + goto out_clean; if (!mm_has_notifiers(mm)) { INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); + mm->mmu_notifier_mm = mmu_notifier_mm; mmu_notifier_mm = NULL; } @@ -223,10 +228,9 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, spin_unlock(&mm->mmu_notifier_mm->lock); mm_drop_all_locks(mm); -out_cleanup: +out_clean: if (take_mmap_sem) up_write(&mm->mmap_sem); - /* kfree() does nothing if mmu_notifier_mm is NULL */ kfree(mmu_notifier_mm); out: BUG_ON(atomic_read(&mm->mm_users) <= 0); @@ -273,8 +277,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) /* * This releases the mm_count pin automatically and frees the mm * structure if it was the last user of it. It serializes against - * running mmu notifiers with RCU and against mmu_notifier_unregister - * with the unregister lock + RCU. All sptes must be dropped before + * running mmu notifiers with SRCU and against mmu_notifier_unregister + * with the unregister lock + SRCU. All sptes must be dropped before * calling mmu_notifier_unregister. 
->release or any other notifier * method may be invoked concurrently with mmu_notifier_unregister, * and only after mmu_notifier_unregister returned we're guaranteed @@ -284,35 +288,45 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) { BUG_ON(atomic_read(&mm->mm_count) <= 0); - spin_lock(&mm->mmu_notifier_mm->lock); if (!hlist_unhashed(&mn->hlist)) { - hlist_del_rcu(&mn->hlist); - /* - * RCU here will force exit_mmap to wait ->release to finish - * before freeing the pages. + * SRCU here will force exit_mmap to wait for ->release to + * finish before freeing the pages. */ - rcu_read_lock(); - spin_unlock(&mm->mmu_notifier_mm->lock); + int id; + + id = srcu_read_lock(&srcu); /* - * exit_mmap will block in mmu_notifier_release to - * guarantee ->release is called before freeing the - * pages. + * exit_mmap will block in mmu_notifier_release to guarantee + * that ->release is called before freeing the pages. */ if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); - } else + srcu_read_unlock(&srcu, id); + + spin_lock(&mm->mmu_notifier_mm->lock); + /* + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. + */ + hlist_del_init_rcu(&mn->hlist); spin_unlock(&mm->mmu_notifier_mm->lock); + } /* - * Wait any running method to finish, of course including - * ->release if it was run by mmu_notifier_relase instead of us. + * Wait for any running method to finish, of course including + * ->release if it was run by mmu_notifier_release instead of us. */ - synchronize_rcu(); + synchronize_srcu(&srcu); BUG_ON(atomic_read(&mm->mm_count) <= 0); mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); + +static int __init mmu_notifier_init(void) +{ + return init_srcu_struct(&srcu); +} +subsys_initcall(mmu_notifier_init); diff --git a/mm/mmzone.c b/mm/mmzone.c index 7cf7b7ddc7c5..bf34fb8556db 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -1,7 +1,7 @@ /* * linux/mm/mmzone.c * - * management codes for pgdats and zones. + * management codes for pgdats, zones and page flags */ @@ -86,3 +86,31 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ + +void lruvec_init(struct lruvec *lruvec) +{ + enum lru_list lru; + + memset(lruvec, 0, sizeof(struct lruvec)); + + for_each_lru(lru) + INIT_LIST_HEAD(&lruvec->lists[lru]); +} + +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) +int page_cpupid_xchg_last(struct page *page, int cpupid) +{ + unsigned long old_flags, flags; + int last_cpupid; + + do { + old_flags = flags = page->flags; + last_cpupid = page_cpupid_last(page); + + flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); + + return last_cpupid; +} +#endif diff --git a/mm/mprotect.c b/mm/mprotect.c index a40992610ab6..c43d557941f8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,6 +23,7 @@ #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/perf_event.h> +#include <linux/ksm.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -35,110 +36,224 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, +/* + * For a prot_numa update we only hold mmap_sem for read so there is a + * potential race with faulting where a pmd was temporarily none. 
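The page_cpupid_xchg_last() helper added to mmzone.c above is the standard lock-free read-modify-write loop: snapshot the word, edit a bitfield in the copy, and retry if cmpxchg() observes that someone else changed the word in the meantime. A generalized sketch of the same shape, with hypothetical parameters:

	#include <linux/atomic.h>

	/* Replace the (mask << shift) field of *word; returns the old field. */
	static unsigned long set_field(unsigned long *word, unsigned long mask,
				       unsigned int shift, unsigned long new)
	{
		unsigned long old, val;

		do {
			old = val = *word;		/* snapshot */
			val &= ~(mask << shift);	/* clear the field */
			val |= (new & mask) << shift;	/* install new value */
		} while (cmpxchg(word, old, val) != old);	/* raced: retry */

		return (old >> shift) & mask;
	}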
This + * function checks for a transhuge pmd under the appropriate lock. It + * returns a pte if it was successfully locked or NULL if it raced with + * a transhuge insertion. + */ +static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, int prot_numa, spinlock_t **ptl) +{ + pte_t *pte; + spinlock_t *pmdl; + + /* !prot_numa is protected by mmap_sem held for write */ + if (!prot_numa) + return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + + pmdl = pmd_lock(vma->vm_mm, pmd); + if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { + spin_unlock(pmdl); + return NULL; + } + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + spin_unlock(pmdl); + return pte; +} + +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { + struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; + unsigned long pages = 0; + + pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); + if (!pte) + return 0; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); do { oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; + bool updated = false; - ptent = ptep_modify_prot_start(mm, addr, pte); - ptent = pte_modify(ptent, newprot); - - /* - * Avoid taking write faults for pages we know to be - * dirty. - */ - if (dirty_accountable && pte_dirty(ptent)) - ptent = pte_mkwrite(ptent); - - ptep_modify_prot_commit(mm, addr, pte, ptent); + if (!prot_numa) { + ptent = ptep_modify_prot_start(mm, addr, pte); + if (pte_numa(ptent)) + ptent = pte_mknonnuma(ptent); + ptent = pte_modify(ptent, newprot); + /* + * Avoid taking write faults for pages we + * know to be dirty. 
+ */ + if (dirty_accountable && pte_dirty(ptent)) + ptent = pte_mkwrite(ptent); + ptep_modify_prot_commit(mm, addr, pte, ptent); + updated = true; + } else { + struct page *page; + + page = vm_normal_page(vma, addr, oldpte); + if (page && !PageKsm(page)) { + if (!pte_numa(oldpte)) { + ptep_set_numa(mm, addr, pte); + updated = true; + } + } + } + if (updated) + pages++; } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { + pte_t newpte; /* * A protection check is difficult so * just be safe and disable write */ make_migration_entry_read(&entry); - set_pte_at(mm, addr, pte, - swp_entry_to_pte(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + set_pte_at(mm, addr, pte, newpte); + + pages++; } } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); + + return pages; } -static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; + struct mm_struct *mm = vma->vm_mm; unsigned long next; + unsigned long pages = 0; + unsigned long nr_huge_updates = 0; + unsigned long mni_start = 0; pmd = pmd_offset(pud, addr); do { + unsigned long this_pages; + next = pmd_addr_end(addr, end); + if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) + continue; + + /* invoke the mmu notifier if the pmd is populated */ + if (!mni_start) { + mni_start = addr; + mmu_notifier_invalidate_range_start(mm, mni_start, end); + } + if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - split_huge_page_pmd(vma->vm_mm, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot)) - continue; - /* fall through */ + split_huge_page_pmd(vma, addr, pmd); + else { + int nr_ptes = change_huge_pmd(vma, pmd, addr, + newprot, prot_numa); + + if (nr_ptes) { + if (nr_ptes == HPAGE_PMD_NR) { + pages += HPAGE_PMD_NR; + nr_huge_updates++; + } + + /* huge pmd was handled */ + continue; + } + } + /* fall through, the trans huge pmd just split */ } - if (pmd_none_or_clear_bad(pmd)) - continue; - change_pte_range(vma->vm_mm, pmd, addr, next, newprot, - dirty_accountable); + this_pages = change_pte_range(vma, pmd, addr, next, newprot, + dirty_accountable, prot_numa); + pages += this_pages; } while (pmd++, addr = next, addr != end); + + if (mni_start) + mmu_notifier_invalidate_range_end(mm, mni_start, end); + + if (nr_huge_updates) + count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); + return pages; } -static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) +static inline unsigned long change_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; + unsigned long pages = 0; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable); + pages += change_pmd_range(vma, pud, addr, next, newprot, + dirty_accountable, prot_numa); } while (pud++, addr = next, addr != end); + + 
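change_pmd_range() above now opens the mmu-notifier invalidation window lazily, on the first populated pmd, so walks over entirely empty ranges never pay the notifier cost at all. Condensed, the shape is:

	unsigned long mni_start = 0;		/* 0 == window not opened yet */

	do {
		next = pmd_addr_end(addr, end);
		if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
			continue;		/* nothing mapped here */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}
		/* ... update the entries ... */
	} while (pmd++, addr = next, addr != end);

	if (mni_start)				/* close only if opened */
		mmu_notifier_invalidate_range_end(mm, mni_start, end);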
return pages; } -static void change_protection(struct vm_area_struct *vma, +static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; unsigned long start = addr; + unsigned long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); + set_tlb_flush_pending(mm); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(vma, pgd, addr, next, newprot, - dirty_accountable); + pages += change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable, prot_numa); } while (pgd++, addr = next, addr != end); - flush_tlb_range(vma, start, end); + + /* Only flush the TLB if we actually modified any entries: */ + if (pages) + flush_tlb_range(vma, start, end); + clear_tlb_flush_pending(mm); + + return pages; +} + +unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot, + int dirty_accountable, int prot_numa) +{ + unsigned long pages; + + if (is_vm_hugetlb_page(vma)) + pages = hugetlb_change_protection(vma, start, end, newprot); + else + pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); + + return pages; } int @@ -213,12 +328,9 @@ success: dirty_accountable = 1; } - mmu_notifier_invalidate_range_start(mm, start, end); - if (is_vm_hugetlb_page(vma)) - hugetlb_change_protection(vma, start, end, vma->vm_page_prot); - else - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); - mmu_notifier_invalidate_range_end(mm, start, end); + change_protection(vma, start, end, vma->vm_page_prot, + dirty_accountable, 0); + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); @@ -274,8 +386,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -EINVAL; if (!(vma->vm_flags & VM_GROWSDOWN)) goto out; - } - else { + } else { if (vma->vm_start > start) goto out; if (unlikely(grows & PROT_GROWSUP)) { @@ -291,9 +402,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + newflags = vm_flags; + newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { diff --git a/mm/mremap.c b/mm/mremap.c index db8d983b5a7d..05f1180e9f21 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -15,10 +15,12 @@ #include <linux/swap.h> #include <linux/capability.h> #include <linux/fs.h> +#include <linux/swapops.h> #include <linux/highmem.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/mmu_notifier.h> +#include <linux/sched/sysctl.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -68,25 +70,61 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return pmd; } +static pte_t move_soft_dirty_pte(pte_t pte) +{ + /* + * Set soft dirty bit so we can notice + * in userspace the ptes were moved. 
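Propagating the soft-dirty bit across an mremap() matters because checkpoint tools track it from userspace through /proc/pid/pagemap, where each 64-bit entry carries the pte soft-dirty state in bit 55, and writing "4" to /proc/pid/clear_refs resets it. A userspace sketch of the consumer side (the helper name is ours), assuming the soft-dirty interface is configured in:

	#include <stdio.h>
	#include <stdint.h>
	#include <unistd.h>

	static int page_soft_dirty(const void *addr)
	{
		uint64_t ent;
		long page = sysconf(_SC_PAGESIZE);
		FILE *f = fopen("/proc/self/pagemap", "rb");

		if (!f)
			return -1;
		if (fseek(f, (uintptr_t)addr / page * sizeof(ent), SEEK_SET) ||
		    fread(&ent, sizeof(ent), 1, f) != 1) {
			fclose(f);
			return -1;
		}
		fclose(f);
		return (ent >> 55) & 1;		/* bit 55: pte is soft-dirty */
	}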
+ */ +#ifdef CONFIG_MEM_SOFT_DIRTY + if (pte_present(pte)) + pte = pte_mksoft_dirty(pte); + else if (is_swap_pte(pte)) + pte = pte_swp_mksoft_dirty(pte); + else if (pte_file(pte)) + pte = pte_file_mksoft_dirty(pte); +#endif + return pte; +} + static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr) + unsigned long new_addr, bool need_rmap_locks) { struct address_space *mapping = NULL; + struct anon_vma *anon_vma = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; - if (vma->vm_file) { - /* - * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock truncate_pagecache - * out, since it might clean the dst vma before the src vma, - * and we propagate stale pages into the dst afterward. - */ - mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + /* + * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma + * locks to ensure that rmap will always observe either the old or the + * new ptes. This is the easiest way to avoid races with + * truncate_pagecache(), page migration, etc... + * + * When need_rmap_locks is false, we use other ways to avoid + * such races: + * + * - During exec() shift_arg_pages(), we use a specially tagged vma + * which rmap call sites look for using is_vma_temporary_stack(). + * + * - During mremap(), new_vma is often known to be placed after vma + * in rmap traversal order. This ensures rmap will always observe + * either the old pte, or the new pte, or both (the page table locks + * serialize access to individual ptes, but only rmap traversal + * order guarantees that we won't miss both the old and new ptes). 
+ */ + if (need_rmap_locks) { + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); + } + if (vma->anon_vma) { + anon_vma = vma->anon_vma; + anon_vma_lock_write(anon_vma); + } } /* @@ -106,6 +144,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); + pte = move_soft_dirty_pte(pte); set_pte_at(mm, new_addr, new_pte, pte); } @@ -114,6 +153,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); + if (anon_vma) + anon_vma_unlock_write(anon_vma); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); } @@ -122,16 +163,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len) + unsigned long new_addr, unsigned long len, + bool need_rmap_locks) { unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; bool need_flush = false; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); + mmun_start = old_addr; + mmun_end = old_end; + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); @@ -148,15 +194,22 @@ unsigned long move_page_tables(struct vm_area_struct *vma, break; if (pmd_trans_huge(*old_pmd)) { int err = 0; - if (extent == HPAGE_PMD_SIZE) + if (extent == HPAGE_PMD_SIZE) { + VM_BUG_ON(vma->vm_file || !vma->anon_vma); + /* See comment in move_ptes() */ + if (need_rmap_locks) + anon_vma_lock_write(vma->anon_vma); err = move_huge_pmd(vma, new_vma, old_addr, new_addr, old_end, old_pmd, new_pmd); + if (need_rmap_locks) + anon_vma_unlock_write(vma->anon_vma); + } if (err > 0) { need_flush = true; continue; } else if (!err) { - split_huge_page_pmd(vma->vm_mm, old_pmd); + split_huge_page_pmd(vma, old_addr, old_pmd); } VM_BUG_ON(pmd_trans_huge(*old_pmd)); } @@ -169,20 +222,20 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr); + new_vma, new_pmd, new_addr, need_rmap_locks); need_flush = true; } if (likely(need_flush)) flush_tlb_range(vma, old_end-len, old_addr); - mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); return len + old_addr - old_end; /* how much done */ } static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, - unsigned long new_len, unsigned long new_addr) + unsigned long new_len, unsigned long new_addr, bool *locked) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -193,6 +246,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long hiwater_vm; int split = 0; int err; + bool need_rmap_locks; /* * We'd prefer to avoid failure later on in do_munmap: @@ -214,27 +268,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, return err; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); - new_vma = 
copy_vma(&vma, new_addr, new_len, new_pgoff); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); if (!new_vma) return -ENOMEM; - moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, + need_rmap_locks); if (moved_len < old_len) { /* - * Before moving the page tables from the new vma to - * the old vma, we need to be sure the old vma is - * queued after new vma in the same_anon_vma list to - * prevent SMP races with rmap_walk (that could lead - * rmap_walk to miss some page table). - */ - anon_vma_moveto_tail(vma); - - /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. */ - move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); + move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, + true); vma = new_vma; old_len = new_len; old_addr = new_addr; @@ -260,7 +308,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, * If this were a serious issue, we'd add a flag to do_munmap(). */ hiwater_vm = mm->hiwater_vm; - mm->total_vm += new_len >> PAGE_SHIFT; vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); if (do_munmap(mm, old_addr, old_len) < 0) { @@ -279,9 +326,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (vm_flags & VM_LOCKED) { mm->locked_vm += new_len >> PAGE_SHIFT; - if (new_len > old_len) - mlock_vma_pages_range(new_vma, new_addr + old_len, - new_addr + new_len); + *locked = true; } return new_addr; @@ -346,9 +391,8 @@ Eagain: return ERR_PTR(-EAGAIN); } -static unsigned long mremap_to(unsigned long addr, - unsigned long old_len, unsigned long new_addr, - unsigned long new_len) +static unsigned long mremap_to(unsigned long addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len, bool *locked) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -371,10 +415,6 @@ static unsigned long mremap_to(unsigned long addr, if ((addr <= new_addr) && (addr+old_len) > new_addr) goto out; - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; - ret = do_munmap(mm, new_addr, new_len); if (ret) goto out; @@ -402,7 +442,7 @@ static unsigned long mremap_to(unsigned long addr, if (ret & ~PAGE_MASK) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); if (!(ret & ~PAGE_MASK)) goto out; out1: @@ -432,20 +472,24 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise * This option implies MREMAP_MAYMOVE. */ -unsigned long do_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; unsigned long charged = 0; + bool locked = false; if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) - goto out; + return ret; + + if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) + return ret; if (addr & ~PAGE_MASK) - goto out; + return ret; old_len = PAGE_ALIGN(old_len); new_len = PAGE_ALIGN(new_len); @@ -456,11 +500,13 @@ unsigned long do_mremap(unsigned long addr, * a zero new-len is nonsensical. 
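From userspace, the contract being validated here is that MREMAP_FIXED is only meaningful together with MREMAP_MAYMOVE; the syscall now rejects the lone-FIXED combination up front. A small illustration:

	#define _GNU_SOURCE		/* for the MREMAP_* flags */
	#include <sys/mman.h>
	#include <stdio.h>

	int main(void)
	{
		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return 1;

		/* growing in place can fail; MAYMOVE lets the kernel relocate */
		void *q = mremap(p, 4096, 2 * 4096, MREMAP_MAYMOVE);
		if (q == MAP_FAILED)
			perror("mremap");

		/* mremap(p, 4096, 8192, MREMAP_FIXED, target) -> EINVAL,
		 * because MREMAP_FIXED without MREMAP_MAYMOVE is rejected */
		return 0;
	}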
*/ if (!new_len) - goto out; + return ret; + + down_write(&current->mm->mmap_sem); if (flags & MREMAP_FIXED) { - if (flags & MREMAP_MAYMOVE) - ret = mremap_to(addr, old_len, new_addr, new_len); + ret = mremap_to(addr, old_len, new_addr, new_len, + &locked); goto out; } @@ -499,12 +545,11 @@ unsigned long do_mremap(unsigned long addr, goto out; } - mm->total_vm += pages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; - mlock_vma_pages_range(vma, addr + old_len, - addr + new_len); + locked = true; + new_addr = addr; } ret = addr; goto out; @@ -530,25 +575,13 @@ unsigned long do_mremap(unsigned long addr, goto out; } - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; - ret = move_vma(vma, addr, old_len, new_len, new_addr); + ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); } out: if (ret & ~PAGE_MASK) vm_unacct_memory(charged); - return ret; -} - -SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, - unsigned long, new_len, unsigned long, flags, - unsigned long, new_addr) -{ - unsigned long ret; - - down_write(&current->mm->mmap_sem); - ret = do_mremap(addr, old_len, new_len, flags, new_addr); up_write(&current->mm->mmap_sem); + if (locked && new_len > old_len) + mm_populate(new_addr + old_len, new_len - old_len); return ret; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 1983fb1c7026..04a9d94333a5 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -41,13 +41,15 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = memblock_find_in_range_node(goal, limit, size, align, nid); + addr = memblock_find_in_range_node(size, align, goal, limit, nid); if (!addr) return NULL; + if (memblock_reserve(addr, size)) + return NULL; + ptr = phys_to_virt(addr); memset(ptr, 0, size); - memblock_reserve(addr, size); /* * The min_count is set to 0 so that bootmem allocated blocks * are never reported as leaks.
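The __alloc_memory_core_early() hunk above also fixes an ordering hazard: previously the region was zeroed before memblock_reserve() ran, and a failed reservation went unnoticed. The new order is find, reserve (checking the result), and only then touch the memory:

	phys_addr_t addr;
	void *ptr;

	addr = memblock_find_in_range_node(size, align, goal, limit, nid);
	if (!addr)
		return NULL;			/* no free range fits */
	if (memblock_reserve(addr, size))
		return NULL;			/* reservation itself can fail */

	ptr = phys_to_virt(addr);
	memset(ptr, 0, size);			/* safe: the range is now ours */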
@@ -82,65 +84,82 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) static void __init __free_pages_memory(unsigned long start, unsigned long end) { - unsigned long i, start_aligned, end_aligned; - int order = ilog2(BITS_PER_LONG); + int order; - start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); - end_aligned = end & ~(BITS_PER_LONG - 1); + while (start < end) { + order = min(MAX_ORDER - 1UL, __ffs(start)); - if (end_aligned <= start_aligned) { - for (i = start; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); + while (start + (1UL << order) > end) + order--; - return; + __free_pages_bootmem(pfn_to_page(start), order); + + start += (1UL << order); } +} + +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); - for (i = start; i < start_aligned; i++) - __free_pages_bootmem(pfn_to_page(i), 0); + if (start_pfn > end_pfn) + return 0; - for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) - __free_pages_bootmem(pfn_to_page(i), order); + __free_pages_memory(start_pfn, end_pfn); - for (i = end_aligned; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); + return end_pfn - start_pfn; } -unsigned long __init free_low_memory_core_early(int nodeid) +static unsigned long __init free_low_memory_core_early(void) { unsigned long count = 0; phys_addr_t start, end; u64 i; - /* free reserved array temporarily so that it's treated as free area */ - memblock_free_reserved_regions(); - - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); - if (start_pfn < end_pfn) { - __free_pages_memory(start_pfn, end_pfn); - count += end_pfn - start_pfn; - } + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) + count += __free_memory_core(start, end); + +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + { + phys_addr_t size; + + /* Free memblock.reserved array if it was allocated */ + size = get_allocated_memblock_reserved_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); + + /* Free memblock.memory array if it was allocated */ + size = get_allocated_memblock_memory_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); } +#endif - /* put region array back? */ - memblock_reserve_reserved_regions(); return count; } -/** - * free_all_bootmem_node - release a node's free pages to the buddy allocator - * @pgdat: node to be released - * - * Returns the number of pages actually released. 
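The rewritten __free_pages_memory() below releases a pfn range to the buddy allocator in the largest naturally aligned power-of-two chunks instead of page by page. For example, freeing pfns [6, 16) takes one order-1 block at pfn 6 (since __ffs(6) = 1) and then one order-3 block at pfn 8: two calls instead of ten. The loop, with comments added:

	while (start < end) {
		/* largest order 'start' is aligned to, capped at MAX_ORDER-1 */
		order = min(MAX_ORDER - 1UL, __ffs(start));

		/* shrink the chunk so it does not run past 'end' */
		while (start + (1UL << order) > end)
			order--;

		__free_pages_bootmem(pfn_to_page(start), order);
		start += 1UL << order;
	}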
- */ -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +static int reset_managed_pages_done __initdata; + +static inline void __init reset_node_managed_pages(pg_data_t *pgdat) { - register_page_bootmem_info_node(pgdat); + struct zone *z; - /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ - return 0; + if (reset_managed_pages_done) + return; + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } /** @@ -150,14 +169,19 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { + unsigned long pages; + + reset_all_zones_managed_pages(); + /* - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 - * Use MAX_NUMNODES will make sure all ranges in early_node_map[] - * will be used instead of only Node0 related */ - return free_low_memory_core_early(MAX_NUMNODES); + pages = free_low_memory_core_early(); + totalram_pages += pages; + + return pages; } /** @@ -204,7 +228,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, restart: - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); if (ptr) return ptr; @@ -274,86 +298,85 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -/** - * __alloc_bootmem_node - allocate boot memory from a specific node - * @pgdat: node to allocate from - * @size: size of the request in bytes - * @align: alignment of the region - * @goal: preferred starting address of the region - * - * The goal is dropped if it can not be satisfied and the allocation will - * fall back to memory below @goal. - * - * Allocation may fall back to any node in the system if the specified node - * can not hold the requested memory. - * - * The function panics if the request can not be satisfied. 
- */ -void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) { void *ptr; - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - again: ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); + goal, limit); + if (ptr) + return ptr; + + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, + goal, limit); if (ptr) return ptr; - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, -1ULL); - if (!ptr && goal) { + if (goal) { goal = 0; goto again; } - return ptr; + + return NULL; } -void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node(pgdat, size, align, goal); + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } -#ifdef CONFIG_SPARSEMEM -/** - * alloc_bootmem_section - allocate boot memory from a specific section - * @size: size of the request in bytes - * @section_nr: sparse map section to allocate from - * - * Return NULL on failure. - */ -void * __init alloc_bootmem_section(unsigned long size, - unsigned long section_nr) +static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal, + unsigned long limit) { - unsigned long pfn, goal, limit; + void *ptr; - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit); + if (ptr) + return ptr; - return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, - SMP_CACHE_BYTES, goal, limit); + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; } -#endif -void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. 
+ */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); - if (ptr) - return ptr; + return ___alloc_bootmem_node(pgdat, size, align, goal, 0); +} - return __alloc_bootmem_nopanic(size, align, goal); +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + return __alloc_bootmem_node(pgdat, size, align, goal); } #ifndef ARCH_LOW_ADDRESS_LIMIT @@ -379,6 +402,14 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } +void * __init __alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} + /** * __alloc_bootmem_low_node - allocate low boot memory from a specific node * @pgdat: node to allocate from @@ -397,16 +428,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - - return __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); + return ___alloc_bootmem_node(pgdat, size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); } diff --git a/mm/nommu.c b/mm/nommu.c index bb8f4f004a82..85f8d6698d48 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -15,6 +15,7 @@ #include <linux/export.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/file.h> @@ -24,11 +25,13 @@ #include <linux/vmalloc.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/compiler.h> #include <linux/mount.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/audit.h> +#include <linux/sched/sysctl.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -55,19 +58,35 @@ void *high_memory; struct page *mem_map; unsigned long max_mapnr; -unsigned long num_physpages; unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. 
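vm_memory_committed() reads the existing vm_committed_as percpu counter, which trades accuracy for cheap updates: increments go to per-cpu batches, and the read sums them, clamped to zero for reporting. A minimal sketch of the counter API, with a hypothetical counter:

	#include <linux/percpu_counter.h>
	#include <linux/printk.h>

	static struct percpu_counter demo_committed;

	/* percpu_counter_init(&demo_committed, 0) during setup */

	static void demo(void)
	{
		percpu_counter_add(&demo_committed, 16);	/* batched per cpu */

		/* approximate sum, never negative */
		pr_info("committed=%lld\n",
			(long long)percpu_counter_read_positive(&demo_committed));
	}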
+ */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} + +EXPORT_SYMBOL_GPL(vm_memory_committed); + EXPORT_SYMBOL(mem_map); -EXPORT_SYMBOL(num_physpages); /* list of mapped, potentially shareable regions */ static struct kmem_cache *vm_region_jar; @@ -124,10 +143,10 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas, - int *retry) +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int foll_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -174,9 +193,10 @@ finish_or_fault: * slab page or a secondary page from a compound page * - don't permit access to VMAs that don't support it, such as I/O mappings */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int nr_pages, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) { int flags = 0; @@ -211,8 +231,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL(follow_pfn); -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; +LIST_HEAD(vmap_area_list); void vfree(const void *addr) { @@ -264,6 +283,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn); long vread(char *buf, char *addr, unsigned long count) { + /* Don't allow overflow */ + if ((unsigned long) buf + count < count) + count = -(unsigned long) buf; + memcpy(buf, addr, count); return count; } @@ -275,7 +298,7 @@ long vwrite(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; memcpy(addr, buf, count); - return(count); + return count; } /* @@ -438,7 +461,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. 
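The new guard in vread() above is an unsigned-overflow clamp: for unsigned arithmetic, buf + count wraps past the top of the address space exactly when buf + count < count, and -(unsigned long)buf equals the number of bytes from buf up to that top. Annotated:

	/* clamp count so that buf + count cannot wrap around */
	if ((unsigned long) buf + count < count)	/* wrapped? */
		count = -(unsigned long) buf;		/* bytes left until the top */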
*/ -void __attribute__((weak)) vmalloc_sync_all(void) +void __weak vmalloc_sync_all(void) { } @@ -698,7 +721,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -747,16 +770,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { + int i; struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; + struct task_struct *curr = current; kenter("%p", vma); protect_vma(vma, 0); mm->map_count--; - if (mm->mmap_cache == vma) - mm->mmap_cache = NULL; + for (i = 0; i < VMACACHE_SIZE; i++) { + /* if the vma is cached, invalidate the entire cache */ + if (curr->vmacache[i] == vma) { + vmacache_invalidate(curr->mm); + break; + } + } /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -764,7 +794,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -789,11 +819,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) kenter("%p", vma); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (vma->vm_file) { + if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(mm); - } put_nommu_region(vma->vm_region); kmem_cache_free(vm_area_cachep, vma); } @@ -807,8 +834,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma; /* check the cache first */ - vma = mm->mmap_cache; - if (vma && vma->vm_start <= addr && vma->vm_end > addr) + vma = vmacache_find(mm, addr); + if (likely(vma)) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -817,7 +844,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) if (vma->vm_start > addr) return NULL; if (vma->vm_end > addr) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } @@ -856,8 +883,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long end = addr + len; /* check the cache first */ - vma = mm->mmap_cache; - if (vma && vma->vm_start == addr && vma->vm_end == end) + vma = vmacache_find_exact(mm, addr, end); + if (vma) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -868,7 +895,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if (vma->vm_start > addr) return NULL; if (vma->vm_end == end) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } @@ -889,7 +916,6 @@ static int validate_mmap_request(struct file *file, unsigned long *_capabilities) { unsigned long capabilities, rlen; - unsigned long reqprot = prot; int ret; /* do the simple checks first */ @@ -921,7 +947,7 @@ static int validate_mmap_request(struct file *file, struct address_space *mapping; /* files must support mmap */ - if (!file->f_op || !file->f_op->mmap) + if (!file->f_op->mmap) return -ENODEV; /* work out if what we've got could possibly be shared @@ -930,7 +956,7 @@ static int validate_mmap_request(struct file *file, */ mapping = file->f_mapping; if (!mapping) - mapping = 
file->f_path.dentry->d_inode->i_mapping; + mapping = file_inode(file)->i_mapping; capabilities = 0; if (mapping && mapping->backing_dev_info) @@ -939,7 +965,7 @@ static int validate_mmap_request(struct file *file, if (!capabilities) { /* no explicit capabilities set, so assume some * defaults */ - switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { + switch (file_inode(file)->i_mode & S_IFMT) { case S_IFREG: case S_IFBLK: capabilities = BDI_CAP_MAP_COPY; @@ -974,11 +1000,11 @@ static int validate_mmap_request(struct file *file, !(file->f_mode & FMODE_WRITE)) return -EACCES; - if (IS_APPEND(file->f_path.dentry->d_inode) && + if (IS_APPEND(file_inode(file)) && (file->f_mode & FMODE_WRITE)) return -EACCES; - if (locks_verify_locked(file->f_path.dentry->d_inode)) + if (locks_verify_locked(file)) return -EAGAIN; if (!(capabilities & BDI_CAP_MAP_DIRECT)) @@ -986,8 +1012,7 @@ static int validate_mmap_request(struct file *file, /* we mustn't privatise shared mappings */ capabilities &= ~BDI_CAP_MAP_COPY; - } - else { + } else { /* we're going to read the file into private memory we * allocate */ if (!(capabilities & BDI_CAP_MAP_COPY)) @@ -1018,23 +1043,20 @@ static int validate_mmap_request(struct file *file, if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { if (prot & PROT_EXEC) return -EPERM; - } - else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { + } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { /* handle implication of PROT_EXEC by PROT_READ */ if (current->personality & READ_IMPLIES_EXEC) { if (capabilities & BDI_CAP_EXEC_MAP) prot |= PROT_EXEC; } - } - else if ((prot & PROT_READ) && + } else if ((prot & PROT_READ) && (prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP) ) { /* backing file is not executable, try to copy */ capabilities &= ~BDI_CAP_MAP_DIRECT; } - } - else { + } else { /* anonymous mappings are always memory backed and can be * privately mapped */ @@ -1047,7 +1069,7 @@ static int validate_mmap_request(struct file *file, } /* allow the security API to have its say */ - ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); + ret = security_mmap_addr(addr); if (ret < 0) return ret; @@ -1233,12 +1255,13 @@ enomem: /* * handle mapping creation for uClinux */ -static unsigned long do_mmap_pgoff(struct file *file, +unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff) + unsigned long pgoff, + unsigned long *populate) { struct vm_area_struct *vma; struct vm_region *region; @@ -1248,6 +1271,8 @@ static unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); + *populate = 0; + /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, @@ -1283,14 +1308,8 @@ static unsigned long do_mmap_pgoff(struct file *file, vma->vm_pgoff = pgoff; if (file) { - region->vm_file = file; - get_file(file); - vma->vm_file = file; - get_file(file); - if (vm_flags & VM_EXECUTABLE) { - added_exe_file_vma(current->mm); - vma->vm_mm = current->mm; - } + region->vm_file = get_file(file); + vma->vm_file = get_file(file); } down_write(&nommu_region_sem); @@ -1317,8 +1336,8 @@ static unsigned long do_mmap_pgoff(struct file *file, continue; /* search for overlapping mappings on the same file */ - if (pregion->vm_file->f_path.dentry->d_inode != - file->f_path.dentry->d_inode) + if (file_inode(pregion->vm_file) != + file_inode(file)) continue; if 
(pregion->vm_pgoff >= pgend) @@ -1443,8 +1462,6 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); kmem_cache_free(vm_area_cachep, vma); kleave(" = %d", ret); return ret; @@ -1471,32 +1488,6 @@ error_getting_region: return -ENOMEM; } -unsigned long do_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - if (unlikely(offset + PAGE_ALIGN(len) < offset)) - return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) - return -EINVAL; - return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -} -EXPORT_SYMBOL(do_mmap); - -unsigned long vm_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - unsigned long ret; - struct mm_struct *mm = current->mm; - - down_write(&mm->mmap_sem); - ret = do_mmap(file, addr, len, prot, flag, offset); - up_write(&mm->mmap_sem); - return ret; -} -EXPORT_SYMBOL(vm_mmap); - SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) @@ -1513,9 +1504,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); @@ -1675,7 +1664,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - static int limit = 0; + static int limit; if (limit < 5) { printk(KERN_WARNING "munmap of memory not mmapped by process %d" @@ -1790,7 +1779,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) * * MREMAP_FIXED is not supported under NOMMU conditions */ -unsigned long do_mremap(unsigned long addr, +static unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { @@ -1825,7 +1814,6 @@ unsigned long do_mremap(unsigned long addr, vma->vm_end = vma->vm_start + new_len; return vma->vm_start; } -EXPORT_SYMBOL(do_mremap); SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long, new_len, unsigned long, flags, @@ -1839,9 +1827,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, return ret; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags) +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) { + *page_mask = 0; return NULL; } @@ -1851,11 +1841,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; } EXPORT_SYMBOL(remap_pfn_range); +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long pfn = start >> PAGE_SHIFT; + unsigned long vm_len = vma->vm_end - vma->vm_start; + + pfn += vma->vm_pgoff; + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + int remap_vmalloc_range(struct vm_area_struct 
*vma, void *addr, unsigned long pgoff) { @@ -1877,10 +1877,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) @@ -1906,7 +1902,7 @@ EXPORT_SYMBOL(unmap_mapping_range); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -1928,7 +1924,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) */ free -= global_page_state(NR_SHMEM); - free += nr_swap_pages; + free += get_nr_swap_pages(); /* * Any slabs which are created with the @@ -1947,10 +1943,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -1958,18 +1954,20 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) goto error; } - allowed = totalram_pages * sysctl_overcommit_ratio / 100; + allowed = vm_commit_limit(); /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - allowed -= allowed / 32; - allowed += total_swap_pages; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -1992,6 +1990,20 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + BUG(); +} +EXPORT_SYMBOL(filemap_map_pages); + +int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, + unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} +EXPORT_SYMBOL(generic_file_remap_pages); + static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { @@ -2076,7 +2088,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, size_t newsize) { struct vm_area_struct *vma; - struct prio_tree_iter iter; struct vm_region *region; pgoff_t low, high; size_t r_size, r_top; @@ -2088,8 +2099,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - low, high) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { @@ -2105,8 +2115,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, * we don't check for any regions that start beyond the EOF as there * shouldn't be any */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - 0, ULONG_MAX) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { if
(!(vma->vm_flags & VM_SHARED)) continue; @@ -2125,3 +2135,45 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, up_write(&nommu_region_sem); return 0; } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 46bf2ed5594c..3291e82d4352 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,64 +44,24 @@ int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); -/* - * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj - * @old_val: old oom_score_adj for compare - * @new_val: new oom_score_adj for swap - * - * Sets the oom_score_adj value for current to @new_val iff its present value is - * @old_val. Usually used to reinstate a previous value to prevent racing with - * userspacing tuning the value in the interim. - */ -void compare_swap_oom_score_adj(int old_val, int new_val) -{ - struct sighand_struct *sighand = current->sighand; - - spin_lock_irq(&sighand->siglock); - if (current->signal->oom_score_adj == old_val) - current->signal->oom_score_adj = new_val; - trace_oom_score_adj_update(current); - spin_unlock_irq(&sighand->siglock); -} - -/** - * test_set_oom_score_adj() - set current's oom_score_adj and return old value - * @new_val: new oom_score_adj value - * - * Sets the oom_score_adj value for current to @new_val with proper - * synchronization and returns the old value. Usually used to temporarily - * set a value, save the old value in the caller, and then reinstate it later. - */ -int test_set_oom_score_adj(int new_val) -{ - struct sighand_struct *sighand = current->sighand; - int old_val; - - spin_lock_irq(&sighand->siglock); - old_val = current->signal->oom_score_adj; - current->signal->oom_score_adj = new_val; - trace_oom_score_adj_update(current); - spin_unlock_irq(&sighand->siglock); - - return old_val; -} - #ifdef CONFIG_NUMA /** * has_intersects_mems_allowed() - check task eligibility for kill - * @tsk: task struct of which task to consider + * @start: task struct of which task to consider * @mask: nodemask passed to page allocator for mempolicy ooms * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy * and whether or not it has the same set of allowed cpuset nodes. 
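The two initcalls above size each reserve as min(~3% of currently free memory, a fixed cap): free_kbytes / 32 approximates 3%, 1UL << 17 KB is the 128MB user cap, and 1UL << 13 KB is the 8MB admin cap. A small sketch of the sizing, assuming 2GB free at init time purely for the sake of concrete numbers:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long free_kbytes = 2UL << 20;	/* assume 2GB free, in KB */

	/* user reserve: min(~3% of free, 128MB) -> 65536 KB on this box */
	printf("user reserve:  %lu KB\n", min_ul(free_kbytes / 32, 1UL << 17));

	/* admin reserve: min(~3% of free, 8MB) -> capped at 8192 KB */
	printf("admin reserve: %lu KB\n", min_ul(free_kbytes / 32, 1UL << 13));
	return 0;
}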
*/ -static bool has_intersects_mems_allowed(struct task_struct *tsk, +static bool has_intersects_mems_allowed(struct task_struct *start, const nodemask_t *mask) { - struct task_struct *start = tsk; + struct task_struct *tsk; + bool ret = false; - do { + rcu_read_lock(); + for_each_thread(start, tsk) { if (mask) { /* * If this is a mempolicy constrained oom, tsk's @@ -109,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, * mempolicy intersects current, otherwise it may be * needlessly killed. */ - if (mempolicy_nodemask_intersects(tsk, mask)) - return true; + ret = mempolicy_nodemask_intersects(tsk, mask); } else { /* * This is not a mempolicy constrained oom, so only * check the mems of tsk's cpuset. */ - if (cpuset_mems_allowed_intersects(current, tsk)) - return true; + ret = cpuset_mems_allowed_intersects(current, tsk); } - } while_each_thread(start, tsk); + if (ret) + break; + } + rcu_read_unlock(); - return false; + return ret; } #else static bool has_intersects_mems_allowed(struct task_struct *tsk, @@ -139,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, */ struct task_struct *find_lock_task_mm(struct task_struct *p) { - struct task_struct *t = p; + struct task_struct *t; + + rcu_read_lock(); - do { + for_each_thread(p, t) { task_lock(t); if (likely(t->mm)) - return t; + goto found; task_unlock(t); - } while_each_thread(p, t); + } + t = NULL; +found: + rcu_read_unlock(); - return NULL; + return t; } /* return true if the task is not adequate as candidate victim task. */ @@ -180,10 +146,11 @@ static bool oom_unkillable_task(struct task_struct *p, * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. */ -unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, - const nodemask_t *nodemask, unsigned long totalpages) +unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, + const nodemask_t *nodemask, unsigned long totalpages) { long points; + long adj; if (oom_unkillable_task(p, memcg, nodemask)) return 0; @@ -192,27 +159,18 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + adj = (long)p->signal->oom_score_adj; + if (adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; } /* - * The memory controller may have a limit of 0 bytes, so avoid a divide - * by zero, if necessary. - */ - if (!totalpages) - totalpages = 1; - - /* * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + p->mm->nr_ptes; - points += get_mm_counter(p->mm, MM_SWAPENTS); - - points *= 1000; - points /= totalpages; + points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); /* @@ -220,23 +178,17 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * implementation used by LSMs. */ if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - points -= 30; + points -= (points * 3) / 100; - /* - * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may - * either completely disable oom killing or always prefer a certain - * task. 
- */ - points += p->signal->oom_score_adj; + /* Normalize to oom_score_adj units */ + adj *= totalpages / 1000; + points += adj; /* - * Never return 0 for an eligible task that may be killed since it's - * possible that no single user task uses more than 0.1% of memory and - * no single admin tasks uses more than 3.0%. + * Never return 0 for an eligible task regardless of the root bonus and + * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). */ - if (points <= 0) - return 1; - return (points < 1000) ? points : 1000; + return points > 0 ? points : 1; } /* @@ -271,7 +223,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * the page allocator means a mempolicy is in effect. Cpuset policy * is enforced in get_page_from_freelist(). */ - if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { + if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { *totalpages = total_swap_pages; for_each_node_mask(nid, *nodemask) *totalpages += node_spanned_pages(nid); @@ -302,99 +254,114 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, } #endif +enum oom_scan_t oom_scan_process_thread(struct task_struct *task, + unsigned long totalpages, const nodemask_t *nodemask, + bool force_kill) +{ + if (task->exit_state) + return OOM_SCAN_CONTINUE; + if (oom_unkillable_task(task, NULL, nodemask)) + return OOM_SCAN_CONTINUE; + + /* + * This task already has access to memory reserves and is being killed. + * Don't allow any other task to have access to the reserves. + */ + if (test_tsk_thread_flag(task, TIF_MEMDIE)) { + if (unlikely(frozen(task))) + __thaw_task(task); + if (!force_kill) + return OOM_SCAN_ABORT; + } + if (!task->mm) + return OOM_SCAN_CONTINUE; + + /* + * If task is allocating a lot of memory and has been marked to be + * killed first if it triggers an oom, then select it. + */ + if (oom_task_origin(task)) + return OOM_SCAN_SELECT; + + if (task->flags & PF_EXITING && !force_kill) { + /* + * If this task is not being ptraced on exit, then wait for it + * to finish before killing some other task unnecessarily. + */ + if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) + return OOM_SCAN_ABORT; + } + return OOM_SCAN_OK; +} + /* * Simple selection loop. We chose the process with the highest - * number of 'points'. We expect the caller will lock the tasklist. + * number of 'points'. Returns -1 on scan abort. * * (not docbooked, we don't want this one cluttering up the manual) */ static struct task_struct *select_bad_process(unsigned int *ppoints, - unsigned long totalpages, struct mem_cgroup *memcg, - const nodemask_t *nodemask, bool force_kill) + unsigned long totalpages, const nodemask_t *nodemask, + bool force_kill) { struct task_struct *g, *p; struct task_struct *chosen = NULL; - *ppoints = 0; + unsigned long chosen_points = 0; - do_each_thread(g, p) { + rcu_read_lock(); + for_each_process_thread(g, p) { unsigned int points; - if (p->exit_state) + switch (oom_scan_process_thread(p, totalpages, nodemask, + force_kill)) { + case OOM_SCAN_SELECT: + chosen = p; + chosen_points = ULONG_MAX; + /* fall through */ + case OOM_SCAN_CONTINUE: continue; - if (oom_unkillable_task(p, memcg, nodemask)) + case OOM_SCAN_ABORT: + rcu_read_unlock(); + return (struct task_struct *)(-1UL); + case OOM_SCAN_OK: + break; + }; + points = oom_badness(p, NULL, nodemask, totalpages); + if (!points || points < chosen_points) continue; - - /* - * This task already has access to memory reserves and is - * being killed. 
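With the rescaled oom_badness() a few hunks above, the score now stays in raw pages (rss + page-table pages + swap entries) instead of being normalized to 0..1000, and oom_score_adj is converted into the same units: one adj point is worth totalpages / 1000 pages, i.e. 0.1% of allowed memory. A userspace sketch of the heuristic (the machine size and task figures are made up for illustration):

#include <stdio.h>

static long badness(long rss, long nr_ptes, long swapents,
		    long oom_score_adj, long totalpages, int cap_sys_admin)
{
	long points = rss + nr_ptes + swapents;	/* kept in pages now */

	if (cap_sys_admin)
		points -= (points * 3) / 100;	/* 3%-of-usage root bonus */

	/* one oom_score_adj point == 0.1% of totalpages */
	points += oom_score_adj * (totalpages / 1000);

	return points > 0 ? points : 1;	/* eligible tasks never score 0 */
}

int main(void)
{
	long totalpages = 4L << 20;	/* 16GB of 4KB pages */

	/* 1GB-RSS task, default adj: scored on usage alone */
	printf("%ld\n", badness(262144, 512, 0, 0, totalpages, 0));

	/* same task, adj = -500: the bias dwarfs its usage, floor of 1 */
	printf("%ld\n", badness(262144, 512, 0, -500, totalpages, 0));
	return 0;
}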
Don't allow any other task access to the - * memory reserve. - * - * Note: this may have a chance of deadlock if it gets - * blocked waiting for another task which itself is waiting - * for memory. Is there a better alternative? - */ - if (test_tsk_thread_flag(p, TIF_MEMDIE)) { - if (unlikely(frozen(p))) - __thaw_task(p); - if (!force_kill) - return ERR_PTR(-1UL); - } - if (!p->mm) + /* Prefer thread group leaders for display purposes */ + if (points == chosen_points && thread_group_leader(chosen)) continue; - if (p->flags & PF_EXITING) { - /* - * If p is the current task and is in the process of - * releasing memory, we allow the "kill" to set - * TIF_MEMDIE, which will allow it to gain access to - * memory reserves. Otherwise, it may stall forever. - * - * The loop isn't broken here, however, in case other - * threads are found to have already been oom killed. - */ - if (p == current) { - chosen = p; - *ppoints = 1000; - } else if (!force_kill) { - /* - * If this task is not being ptraced on exit, - * then wait for it to finish before killing - * some other task unnecessarily. - */ - if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) - return ERR_PTR(-1UL); - } - } - - points = oom_badness(p, memcg, nodemask, totalpages); - if (points > *ppoints) { - chosen = p; - *ppoints = points; - } - } while_each_thread(g, p); + chosen = p; + chosen_points = points; + } + if (chosen) + get_task_struct(chosen); + rcu_read_unlock(); + *ppoints = chosen_points * 1000 / totalpages; return chosen; } /** * dump_tasks - dump current memory state of all system tasks - * @mem: current's memory controller, if constrained + * @memcg: current's memory controller, if constrained * @nodemask: nodemask passed to page allocator for mempolicy ooms * * Dumps the current memory state of all eligible tasks. Tasks not in the same * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes * are not shown. - * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj - * value, oom_score_adj value, and name. - * - * Call with tasklist_lock read-locked. + * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, + * swapents, oom_score_adj value, and name. 
*/ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); + rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) continue; @@ -409,13 +376,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", - task->pid, task_uid(task), task->tgid, - task->mm->total_vm, get_mm_rss(task->mm), - task_cpu(task), task->signal->oom_adj, + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", + task->pid, from_kuid(&init_user_ns, task_uid(task)), + task->tgid, task->mm->total_vm, get_mm_rss(task->mm), + atomic_long_read(&task->mm->nr_ptes), + get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); } + rcu_read_unlock(); } static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, @@ -423,27 +392,33 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_adj=%d, oom_score_adj=%d\n", - current->comm, gfp_mask, order, current->signal->oom_adj, + "oom_score_adj=%hd\n", + current->comm, gfp_mask, order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); - mem_cgroup_print_oom_info(memcg, p); - show_mem(SHOW_MEM_FILTER_NODES); + if (memcg) + mem_cgroup_print_oom_info(memcg, p); + else + show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) dump_tasks(memcg, nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) -static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, nodemask_t *nodemask, - const char *message) +/* + * Must be called while holding a reference to p, which will be released upon + * returning. + */ +void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + unsigned int points, unsigned long totalpages, + struct mem_cgroup *memcg, nodemask_t *nodemask, + const char *message) { struct task_struct *victim = p; struct task_struct *child; - struct task_struct *t = p; + struct task_struct *t; struct mm_struct *mm; unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -455,6 +430,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ if (p->flags & PF_EXITING) { set_tsk_thread_flag(p, TIF_MEMDIE); + put_task_struct(p); return; } @@ -472,7 +448,8 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * parent. This attempts to lose the minimal amount of work done while * still freeing memory. 
*/ - do { + read_lock(&tasklist_lock); + for_each_thread(p, t) { list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; @@ -484,15 +461,24 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, child_points = oom_badness(child, memcg, nodemask, totalpages); if (child_points > victim_points) { + put_task_struct(victim); victim = child; victim_points = child_points; + get_task_struct(victim); } } - } while_each_thread(p, t); + } + read_unlock(&tasklist_lock); - victim = find_lock_task_mm(victim); - if (!victim) + p = find_lock_task_mm(victim); + if (!p) { + put_task_struct(victim); return; + } else if (victim != p) { + get_task_struct(p); + put_task_struct(victim); + victim = p; + } /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; @@ -511,6 +497,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * That thread will now get access to memory reserves since it has a * pending fatal signal. */ + rcu_read_lock(); for_each_process(p) if (p->mm == mm && !same_thread_group(p, victim) && !(p->flags & PF_KTHREAD)) { @@ -523,17 +510,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(p); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } + rcu_read_unlock(); set_tsk_thread_flag(victim, TIF_MEMDIE); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); + put_task_struct(victim); } #undef K /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ -static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask) +void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, + int order, const nodemask_t *nodemask) { if (likely(!sysctl_panic_on_oom)) return; @@ -546,42 +535,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, if (constraint != CONSTRAINT_NONE) return; } - read_lock(&tasklist_lock); dump_header(NULL, gfp_mask, order, NULL, nodemask); - read_unlock(&tasklist_lock); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, - int order) -{ - unsigned long limit; - unsigned int points = 0; - struct task_struct *p; - - /* - * If current has a pending SIGKILL, then automatically select it. The - * goal is to allow it to allocate so that it may quickly exit and free - * its memory. - */ - if (fatal_signal_pending(current)) { - set_thread_flag(TIF_MEMDIE); - return; - } - - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); - limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; - read_lock(&tasklist_lock); - p = select_bad_process(&points, limit, memcg, NULL, false); - if (p && PTR_ERR(p) != -1UL) - oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, - "Memory cgroup out of memory"); - read_unlock(&tasklist_lock); -} -#endif - static BLOCKING_NOTIFIER_HEAD(oom_notify_list); int register_oom_notifier(struct notifier_block *nb) @@ -646,43 +604,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) spin_unlock(&zone_scan_lock); } -/* - * Try to acquire the oom killer lock for all system zones. Returns zero if a - * parallel oom killing is taking place, otherwise locks all zones and returns - * non-zero. 
- */ -static int try_set_system_oom(void) -{ - struct zone *zone; - int ret = 1; - - spin_lock(&zone_scan_lock); - for_each_populated_zone(zone) - if (zone_is_oom_locked(zone)) { - ret = 0; - goto out; - } - for_each_populated_zone(zone) - zone_set_flag(zone, ZONE_OOM_LOCKED); -out: - spin_unlock(&zone_scan_lock); - return ret; -} - -/* - * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation - * attempts or page faults may now recall the oom killer, if necessary. - */ -static void clear_system_oom(void) -{ - struct zone *zone; - - spin_lock(&zone_scan_lock); - for_each_populated_zone(zone) - zone_clear_flag(zone, ZONE_OOM_LOCKED); - spin_unlock(&zone_scan_lock); -} - /** * out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer @@ -703,7 +624,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, struct task_struct *p; unsigned long totalpages; unsigned long freed = 0; - unsigned int points; + unsigned int uninitialized_var(points); enum oom_constraint constraint = CONSTRAINT_NONE; int killed = 0; @@ -713,11 +634,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, return; /* - * If current has a pending SIGKILL, then automatically select it. The - * goal is to allow it to allocate so that it may quickly exit and free - * its memory. + * If current has a pending SIGKILL or is exiting, then automatically + * select it. The goal is to allow it to allocate so that it may + * quickly exit and free its memory. */ - if (fatal_signal_pending(current)) { + if (fatal_signal_pending(current) || current->flags & PF_EXITING) { set_thread_flag(TIF_MEMDIE); return; } @@ -731,52 +652,51 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); - read_lock(&tasklist_lock); - if (sysctl_oom_kill_allocating_task && + if (sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, nodemask) && - current->mm) { + current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + get_task_struct(current); oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, nodemask, "Out of memory (oom_kill_allocating_task)"); goto out; } - p = select_bad_process(&points, totalpages, NULL, mpol_mask, - force_kill); + p = select_bad_process(&points, totalpages, mpol_mask, force_kill); /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { dump_header(NULL, gfp_mask, order, NULL, mpol_mask); - read_unlock(&tasklist_lock); panic("Out of memory and no killable processes...\n"); } - if (PTR_ERR(p) != -1UL) { + if (p != (void *)-1UL) { oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, nodemask, "Out of memory"); killed = 1; } out: - read_unlock(&tasklist_lock); - /* - * Give "p" a good chance of killing itself before we - * retry to allocate memory unless "p" is current + * Give the killed threads a good chance of exiting before trying to + * allocate memory again. */ - if (killed && !test_thread_flag(TIF_MEMDIE)) - schedule_timeout_uninterruptible(1); + if (killed) + schedule_timeout_killable(1); } /* * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel - * oom killing is already in progress so do nothing. If a task is found with - * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. + * memory-hogging task. 
If any populated zone has ZONE_OOM_LOCKED set, a + * parallel oom killing is already in progress so do nothing. */ void pagefault_out_of_memory(void) { - if (try_set_system_oom()) { + struct zonelist *zonelist; + + if (mem_cgroup_oom_synchronize(true)) + return; + + zonelist = node_zonelist(first_online_node, GFP_KERNEL); + if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); - clear_system_oom(); + clear_zonelist_oom(zonelist, GFP_KERNEL); } - if (!test_thread_flag(TIF_MEMDIE)) - schedule_timeout_uninterruptible(1); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 26adea8ca2e7..a4317da60532 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -34,8 +34,13 @@ #include <linux/syscalls.h> #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ #include <linux/pagevec.h> +#include <linux/timer.h> +#include <linux/sched/rt.h> +#include <linux/mm_inline.h> #include <trace/events/writeback.h> +#include "internal.h" + /* * Sleep at most 200ms at a time in balance_dirty_pages(). */ @@ -135,7 +140,20 @@ unsigned long global_dirty_limit; * measured in page writeback completions. * */ -static struct prop_descriptor vm_completions; +static struct fprop_global writeout_completions; + +static void writeout_period(unsigned long t); +/* Timer for aging of writeout_completions */ +static struct timer_list writeout_period_timer = + TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); +static unsigned long writeout_period_time = 0; + +/* + * Length of period for aging writeout fractions of bdis. This is an + * arbitrarily chosen number. The longer the period, the slower fractions will + * reflect changes in current writeout rate. + */ +#define VM_COMPLETIONS_PERIOD_LEN (3*HZ) /* * Work out the current dirty-memory clamping and background writeout @@ -173,6 +191,26 @@ static struct prop_descriptor vm_completions; * global dirtyable memory first. */ +/** + * zone_dirtyable_memory - number of dirtyable pages in a zone + * @zone: the zone + * + * Returns the zone's number of pages potentially available for dirty + * page cache. This is the base value for the per-zone dirty limits. + */ +static unsigned long zone_dirtyable_memory(struct zone *zone) +{ + unsigned long nr_pages; + + nr_pages = zone_page_state(zone, NR_FREE_PAGES); + nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + + nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); + nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); + + return nr_pages; +} + static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM @@ -180,13 +218,23 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) unsigned long x = 0; for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = - &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z) - z->dirty_balance_reserve; + x += zone_dirtyable_memory(z); } /* + * Unreclaimable memory (kernel memory or anonymous memory + * without swap) can bring down the dirtyable pages below + * the zone's dirty balance reserve and the above calculation + * will underflow. However we still want to add in nodes + * which are below threshold (negative values) to get a more + * accurate calculation but make sure that the total never + * underflows. + */ + if ((long)x < 0) + x = 0; + + /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. 
This can only * occur in very strange VM situations but we want to make sure @@ -204,12 +252,15 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) * Returns the global number of pages potentially available for dirty * page cache. This is the base value for the global dirty limits. */ -unsigned long global_dirtyable_memory(void) +static unsigned long global_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - - dirty_balance_reserve; + x = global_page_state(NR_FREE_PAGES); + x -= min(x, dirty_balance_reserve); + + x += global_page_state(NR_INACTIVE_FILE); + x += global_page_state(NR_ACTIVE_FILE); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -259,29 +310,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) } /** - * zone_dirtyable_memory - number of dirtyable pages in a zone - * @zone: the zone - * - * Returns the zone's number of pages potentially available for dirty - * page cache. This is the base value for the per-zone dirty limits. - */ -static unsigned long zone_dirtyable_memory(struct zone *zone) -{ - /* - * The effective global number of dirtyable pages may exclude - * highmem as a big-picture measure to keep the ratio between - * dirty memory and lowmem reasonable. - * - * But this function is purely about the individual zone and a - * highmem zone can hold its share of dirty pages, so we don't - * care about vm_highmem_is_dirtyable here. - */ - return zone_page_state(zone, NR_FREE_PAGES) + - zone_reclaimable_pages(zone) - - zone->dirty_balance_reserve; -} - -/** * zone_dirty_limit - maximum number of dirty pages allowed in a zone * @zone: the zone * @@ -322,34 +350,6 @@ bool zone_dirty_ok(struct zone *zone) zone_page_state(zone, NR_WRITEBACK) <= limit; } -/* - * couple the period to the dirty_ratio: - * - * period/2 ~ roundup_pow_of_two(dirty limit) - */ -static int calc_period_shift(void) -{ - unsigned long dirty_total; - - if (vm_dirty_bytes) - dirty_total = vm_dirty_bytes / PAGE_SIZE; - else - dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / - 100; - return 2 + ilog2(dirty_total - 1); -} - -/* - * update the period when the dirty threshold changes. - */ -static void update_completion_period(void) -{ - int shift = calc_period_shift(); - prop_change_shift(&vm_completions, shift); - - writeback_set_ratelimit(); -} - int dirty_background_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -383,7 +383,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - update_completion_period(); + writeback_set_ratelimit(); vm_dirty_bytes = 0; } return ret; @@ -398,12 +398,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write, ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { - update_completion_period(); + writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } +static unsigned long wp_next_time(unsigned long cur_time) +{ + cur_time += VM_COMPLETIONS_PERIOD_LEN; + /* 0 has a special meaning... */ + if (!cur_time) + return 1; + return cur_time; +} + /* * Increment the BDI's writeout completion count and the global writeout * completion count. Called from test_clear_page_writeback(). 
@@ -411,8 +420,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write, static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { __inc_bdi_stat(bdi, BDI_WRITTEN); - __prop_inc_percpu_max(&vm_completions, &bdi->completions, - bdi->max_prop_frac); + __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, + bdi->max_prop_frac); + /* First event after period switching was turned off? */ + if (!unlikely(writeout_period_time)) { + /* + * We can race with other __bdi_writeout_inc calls here but + * it does not cause any harm since the resulting time when + * the timer will fire and what is in writeout_period_time will + * be roughly the same. + */ + writeout_period_time = wp_next_time(jiffies); + mod_timer(&writeout_period_timer, writeout_period_time); + } } void bdi_writeout_inc(struct backing_dev_info *bdi) @@ -431,11 +451,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc); static void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { - prop_fraction_percpu(&vm_completions, &bdi->completions, + fprop_fraction_percpu(&writeout_completions, &bdi->completions, numerator, denominator); } /* + * On an idle system, we can be called long after we were scheduled because we + * use deferred timers, so account for the missed periods. + */ +static void writeout_period(unsigned long t) +{ + int miss_periods = (jiffies - writeout_period_time) / + VM_COMPLETIONS_PERIOD_LEN; + + if (fprop_new_period(&writeout_completions, miss_periods + 1)) { + writeout_period_time = wp_next_time(writeout_period_time + + miss_periods * VM_COMPLETIONS_PERIOD_LEN); + mod_timer(&writeout_period_timer, writeout_period_time); + } else { + /* + * Aging has zeroed all fractions. Stop wasting CPU on period + * updates. + */ + writeout_period_time = 0; + } +} + +/* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, cannot * exceed 100%. @@ -475,7 +517,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) ret = -EINVAL; } else { bdi->max_ratio = max_ratio; - bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; + bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; } spin_unlock_bh(&bdi_lock); @@ -538,6 +580,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) } /* + * f(dirty) := 1.0 + ((setpoint - dirty) / (limit - setpoint))^3 + * + * it's a 3rd-order polynomial subject to + * + * (1) f(freerun) = 2.0 => ramp up dirty_ratelimit reasonably fast + * (2) f(setpoint) = 1.0 => the balance point + * (3) f(limit) = 0 => the hard limit + * (4) df/dx <= 0 => negative feedback control + * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) + * => fast response on large errors; small oscillation near setpoint + */ +static long long pos_ratio_polynom(unsigned long setpoint, + unsigned long dirty, + unsigned long limit) +{ + long long pos_ratio; + long x; + + x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + limit - setpoint + 1); + pos_ratio = x; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + + return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); +} + +/* + * Dirty position control. 
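pos_ratio_polynom() evaluates the cubic in fixed point. Assuming RATELIMIT_CALC_SHIFT is 10, so that 1.0 is represented as 1024, a standalone version can be checked against boundary conditions (1)-(3) from the comment; the small deviations from exactly 0 and 2.0 come from the integer division by limit - setpoint + 1:

#include <stdio.h>

#define RATELIMIT_CALC_SHIFT 10		/* assumed scale: 1.0 == 1 << 10 */

static long long pos_ratio_polynom(unsigned long setpoint,
				   unsigned long dirty,
				   unsigned long limit)
{
	long long pos_ratio;
	long long x;

	/* x = (setpoint - dirty) / (limit - setpoint), in fixed point */
	x = (((long long)setpoint - (long long)dirty) << RATELIMIT_CALC_SHIFT)
		/ (long long)(limit - setpoint + 1);
	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;		/* 1.0 + x^3 */

	if (pos_ratio < 0)
		pos_ratio = 0;
	if (pos_ratio > 2LL << RATELIMIT_CALC_SHIFT)
		pos_ratio = 2LL << RATELIMIT_CALC_SHIFT;
	return pos_ratio;
}

int main(void)
{
	unsigned long limit = 1000, freerun = 200;
	unsigned long setpoint = (freerun + limit) / 2;	/* 600 */

	printf("f(setpoint) = %.3f\n",	/* 1.000: the balance point */
	       pos_ratio_polynom(setpoint, setpoint, limit) / 1024.0);
	printf("f(limit)    = %.3f\n",	/* ~0: the hard limit */
	       pos_ratio_polynom(setpoint, limit, limit) / 1024.0);
	printf("f(freerun)  = %.3f\n",	/* ~2.0: ramp up quickly */
	       pos_ratio_polynom(setpoint, freerun, limit) / 1024.0);
	return 0;
}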
* * (o) global/bdi setpoints @@ -635,26 +708,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, /* * global setpoint * - * f(dirty) := 1.0 + ((setpoint - dirty) / (limit - setpoint))^3 + * See comment for pos_ratio_polynom(). + */ + setpoint = (freerun + limit) / 2; + pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); + + /* + * The strictlimit feature is a tool preventing mistrusted filesystems + * from growing a large number of dirty pages before throttling. For + * such filesystems balance_dirty_pages always checks bdi counters + * against bdi limits, even if the global "nr_dirty" is under "freerun". + * This is especially important for fuse which sets bdi->max_ratio to + * 1% by default. Without the strictlimit feature, fuse writeback may + * consume an arbitrary amount of RAM because it is accounted in + * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * - * it's a 3rd order polynomial that subjects to + * Here, in bdi_position_ratio(), we calculate pos_ratio based on + * two values: bdi_dirty and bdi_thresh. Let's consider an example: + * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global + * limits are set by default to 10% and 20% (background and throttle). + * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is + * about ~6K pages (as the average of background and throttle bdi + * limits). The 3rd order polynomial will provide positive feedback if + * bdi_dirty is under bdi_setpoint and vice versa. * - * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast - * (2) f(setpoint) = 1.0 => the balance point - * (3) f(limit) = 0 => the hard limit - * (4) df/dx <= 0 => negative feedback control - * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) - * => fast response on large errors; small oscillation near setpoint + * Note that we cannot use global counters in these calculations + * because we want to throttle a process writing to a strictlimit BDI + * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB + * in the example above). */ - setpoint = (freerun + limit) / 2; - x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT, - limit - setpoint + 1); - pos_ratio = x; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long bdi_pos_ratio; + unsigned long bdi_bg_thresh; + + if (bdi_dirty < 8) + return min_t(long long, pos_ratio * 2, + 2 << RATELIMIT_CALC_SHIFT); + + if (bdi_dirty >= bdi_thresh) + return 0; + + bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); + bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, + bdi_bg_thresh); + + if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) + return 0; + + bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, + bdi_thresh); + + /* + * Typically, in the strictlimit case, bdi_setpoint << setpoint + * and pos_ratio >> bdi_pos_ratio. In other words, the global + * state ("dirty") is not the limiting factor and we have to + * make the decision based on bdi counters. But there is an + * important case when global pos_ratio should get precedence: + * global limits are exceeded (e.g. due to activities on other + * BDIs) while the given strictlimit BDI is below its limit. 
+ * + * "pos_ratio * bdi_pos_ratio" would work for the case above, + * but it would look too non-natural for the case of all + * activity in the system coming from a single strictlimit BDI + * with bdi->max_ratio == 100%. + * + * Note that min() below somewhat changes the dynamics of the + * control system. Normally, pos_ratio value can be well over 3 + * (when globally we are at freerun and bdi is well below bdi + * setpoint). Now the maximum pos_ratio in the same situation + * is 2. We might want to tweak this if we observe the control + * system is too slow to adapt. + */ + return min(pos_ratio, bdi_pos_ratio); + } /* * We have computed basic pos_ratio above based on global situation. If @@ -715,7 +842,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, x_intercept = bdi_setpoint + span; if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty), + pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), x_intercept - bdi_setpoint + 1); } else pos_ratio /= 4; @@ -918,7 +1045,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * bdi->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated - * code makes use of task_ratelimit to filter out sigular points and + * code makes use of task_ratelimit to filter out singular points and * limit the step size. * * The below code essentially only uses the relative value of @@ -941,12 +1068,33 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * feel and care are stable dirty rate and small position error. * * |task_ratelimit - dirty_ratelimit| is used to limit the step size - * and filter out the sigular points of balanced_dirty_ratelimit. Which + * and filter out the singular points of balanced_dirty_ratelimit. Which * keeps jumping around randomly and can even leap far away at times * due to the small 200ms estimation period of dirty_rate (we want to * keep that period small to reduce time lags). */ step = 0; + + /* + * For strictlimit case, calculations above were based on bdi counters + * and limits (starting from pos_ratio = bdi_position_ratio() and up to + * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). + * Hence, to calculate "step" properly, we have to use bdi_dirty as + * "dirty" and bdi_setpoint as "setpoint". + * + * We rampup dirty_ratelimit forcibly if bdi_dirty is low because + * it's possible that bdi_thresh is close to zero due to inactivity + * of backing device (see the implementation of bdi_dirty_limit()). + */ + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = bdi_dirty; + if (bdi_dirty < 8) + setpoint = bdi_dirty + 1; + else + setpoint = (bdi_thresh + + bdi_dirty_limit(bdi, bg_thresh)) / 2; + } + if (dirty < setpoint) { x = min(bdi->balanced_dirty_ratelimit, min(balanced_dirty_ratelimit, task_ratelimit)); @@ -1041,7 +1189,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, } /* - * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() + * After a task dirtied this many pages, balance_dirty_pages_ratelimited() * will look to see if it needs to start dirty throttling. 
* * If dirty_poll_interval is too low, big NUMA machines will call the expensive @@ -1057,11 +1205,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty, return 1; } -static long bdi_max_pause(struct backing_dev_info *bdi, - unsigned long bdi_dirty) +static unsigned long bdi_max_pause(struct backing_dev_info *bdi, + unsigned long bdi_dirty) { - long bw = bdi->avg_write_bandwidth; - long t; + unsigned long bw = bdi->avg_write_bandwidth; + unsigned long t; /* * Limit pause time for small memory systems. If sleeping for too long @@ -1073,7 +1221,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi, t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; - return min_t(long, t, MAX_PAUSE); + return min_t(unsigned long, t, MAX_PAUSE); } static long bdi_min_pause(struct backing_dev_info *bdi, @@ -1151,6 +1299,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } +static inline void bdi_dirty_limits(struct backing_dev_info *bdi, + unsigned long dirty_thresh, + unsigned long background_thresh, + unsigned long *bdi_dirty, + unsigned long *bdi_thresh, + unsigned long *bdi_bg_thresh) +{ + unsigned long bdi_reclaimable; + + /* + * bdi_thresh is not treated as a strict limiting factor like + * dirty_thresh, for two reasons: + * - in a JBOD setup, bdi_thresh can fluctuate a lot + * - in a system with HDD and USB key, the USB key may somehow + * go into a state (bdi_dirty >> bdi_thresh) either because + * bdi_dirty starts high, or because bdi_thresh drops low. + * In this case we don't want to hard throttle the USB key + * dirtiers for 100 seconds until bdi_dirty drops under + * bdi_thresh. Instead the auxiliary bdi control line in + * bdi_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down bdi_dirty. + */ + *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + + if (bdi_bg_thresh) + *bdi_bg_thresh = div_u64((u64)*bdi_thresh * + background_thresh, + dirty_thresh); + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. + */ + if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { + bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat_sum(bdi, BDI_WRITEBACK); + } else { + bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat(bdi, BDI_WRITEBACK); + } +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. 
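bdi_max_pause() above bounds a single pause by roughly the time this bdi would need to write back its dirty pages at about eight writeout chunks per second. A userspace sketch of the arithmetic; HZ == 1000, a MAX_PAUSE of HZ / 5 (the 200ms mentioned at the top of this file), and the power-of-two helper are all assumptions of the sketch:

#include <stdio.h>

#define HZ		1000		/* assumed */
#define MAX_PAUSE	(HZ / 5)	/* assumed 200ms cap */

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

/* pause no longer than writing bdi_dirty back at ~8 chunks/s would take */
static unsigned long max_pause(unsigned long bw /* pages/s */,
			       unsigned long bdi_dirty)
{
	unsigned long t;

	t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
	t++;

	return t < MAX_PAUSE ? t : MAX_PAUSE;
}

int main(void)
{
	/* slow USB key, ~2MB/s (512 pages/s), 1000 dirty pages: hits the cap */
	printf("%lu jiffies\n", max_pause(512, 1000));

	/* fast disk, ~400MB/s (102400 pages/s): pause stays tiny */
	printf("%lu jiffies\n", max_pause(102400, 1000));
	return 0;
}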
It looks at the number of dirty pages in the machine and will force @@ -1162,13 +1360,9 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pages_dirtied) { unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ - unsigned long bdi_reclaimable; unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ - unsigned long bdi_dirty; - unsigned long freerun; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; long period; long pause; long max_pause; @@ -1179,10 +1373,16 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long dirty_ratelimit; unsigned long pos_ratio; struct backing_dev_info *bdi = mapping->backing_dev_info; + bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { unsigned long now = jiffies; + unsigned long uninitialized_var(bdi_thresh); + unsigned long thresh; + unsigned long uninitialized_var(bdi_dirty); + unsigned long dirty; + unsigned long bg_thresh; /* * Unstable writes are a feature of certain networked @@ -1196,61 +1396,44 @@ static void balance_dirty_pages(struct address_space *mapping, global_dirty_limits(&background_thresh, &dirty_thresh); + if (unlikely(strictlimit)) { + bdi_dirty_limits(bdi, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, &bg_thresh); + + dirty = bdi_dirty; + thresh = bdi_thresh; + } else { + dirty = nr_dirty; + thresh = dirty_thresh; + bg_thresh = background_thresh; + } + /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up. + * when the bdi limits are ramping up in case of !strictlimit. + * + * In strictlimit case make decision based on the bdi counters + * and limits. Small writeouts when the bdi limits are ramping + * up are the price we consciously pay for strictlimit-ing. */ - freerun = dirty_freerun_ceiling(dirty_thresh, - background_thresh); - if (nr_dirty <= freerun) { + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { current->dirty_paused_when = now; current->nr_dirtied = 0; current->nr_dirtied_pause = - dirty_poll_interval(nr_dirty, dirty_thresh); + dirty_poll_interval(dirty, thresh); break; } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); - /* - * bdi_thresh is not treated as some limiting factor as - * dirty_thresh, due to reasons - * - in JBOD setup, bdi_thresh can fluctuate a lot - * - in a system with HDD and USB key, the USB key may somehow - * go into state (bdi_dirty >> bdi_thresh) either because - * bdi_dirty starts high, or because bdi_thresh drops low. - * In this case we don't want to hard throttle the USB key - * dirtiers for 100 seconds until bdi_dirty drops under - * bdi_thresh. Instead the auxiliary bdi control line in - * bdi_position_ratio() will let the dirtier task progress - * at some rate <= (write_bw / 2) for bringing down bdi_dirty. - */ - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - - /* - * In order to avoid the stacked BDI deadlock we need - * to ensure we accurately count the 'dirty' pages when - * the threshold is low. - * - * Otherwise it would be possible to get thresh+n pages - * reported dirty, even though there are thresh-m pages - * actually dirty; with m+n sitting in the percpu - * deltas. 
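The threshold test above compensates for per-cpu counter drift: a cheap bdi_stat() read can be stale by up to roughly the number of CPUs times the per-cpu batch, so when the threshold itself is within twice that error, bdi_dirty_limits() pays for the exact bdi_stat_sum(). A toy model of the trade-off (the counter struct, NR_CPUS, and PCPU_BATCH are all invented for illustration):

#include <stdio.h>

#define NR_CPUS		4
#define PCPU_BATCH	32	/* per-cpu delta bound in the toy model */

struct pcpu_counter {
	long global;		/* cheap to read, possibly stale */
	long delta[NR_CPUS];	/* per-cpu contributions not yet folded in */
};

static long stat_cheap(const struct pcpu_counter *c)
{
	return c->global;
}

static long stat_sum(const struct pcpu_counter *c)	/* exact, O(NR_CPUS) */
{
	long v = c->global;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		v += c->delta[cpu];
	return v;
}

int main(void)
{
	struct pcpu_counter dirty = { 40, { 20, 30, -10, 25 } };
	long error = NR_CPUS * PCPU_BATCH;	/* worst-case drift: 128 */
	long thresh = 100;

	/*
	 * thresh < 2 * error: the cheap read could hide a threshold's worth
	 * of pages in the per-cpu deltas, so take the exact sum instead.
	 */
	if (thresh < 2 * error)
		printf("dirty = %ld (exact sum)\n", stat_sum(&dirty));
	else
		printf("dirty = %ld (cheap read)\n", stat_cheap(&dirty));
	return 0;
}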
- */ - if (bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat_sum(bdi, BDI_WRITEBACK); - } else { - bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat(bdi, BDI_WRITEBACK); - } + if (!strictlimit) + bdi_dirty_limits(bdi, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, NULL); dirty_exceeded = (bdi_dirty > bdi_thresh) && - (nr_dirty > dirty_thresh); + ((nr_dirty > dirty_thresh) || strictlimit); if (dirty_exceeded && !bdi->dirty_exceeded) bdi->dirty_exceeded = 1; @@ -1379,9 +1562,9 @@ pause: bdi_start_background_writeback(bdi); } -void set_page_dirty_balance(struct page *page, int page_mkwrite) +void set_page_dirty_balance(struct page *page) { - if (set_page_dirty(page) || page_mkwrite) { + if (set_page_dirty(page)) { struct address_space *mapping = page_mapping(page); if (mapping) @@ -1408,9 +1591,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** - * balance_dirty_pages_ratelimited_nr - balance dirty memory state + * balance_dirty_pages_ratelimited - balance dirty memory state * @mapping: address_space which was dirtied - * @nr_pages_dirtied: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's @@ -1421,8 +1603,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ -void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, - unsigned long nr_pages_dirtied) +void balance_dirty_pages_ratelimited(struct address_space *mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; int ratelimit; @@ -1456,6 +1637,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, */ p = &__get_cpu_var(dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { + unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); *p -= nr_pages_dirtied; current->nr_dirtied += nr_pages_dirtied; @@ -1465,7 +1647,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, if (unlikely(current->nr_dirtied >= ratelimit)) balance_dirty_pages(mapping, current->nr_dirtied); } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); +EXPORT_SYMBOL(balance_dirty_pages_ratelimited); void throttle_vm_writeout(gfp_t gfp_mask) { @@ -1504,7 +1686,6 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - bdi_arm_supers_timer(); return 0; } @@ -1568,19 +1749,28 @@ void writeback_set_ratelimit(void) unsigned long background_thresh; unsigned long dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); + global_dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; } -static int __cpuinit -ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) +static int +ratelimit_handler(struct notifier_block *self, unsigned long action, + void *hcpu) { - writeback_set_ratelimit(); - return NOTIFY_DONE; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DEAD: + writeback_set_ratelimit(); + return NOTIFY_OK; + default: + return 
NOTIFY_DONE; + } } -static struct notifier_block __cpuinitdata ratelimit_nb = { +static struct notifier_block ratelimit_nb = { .notifier_call = ratelimit_handler, .next = NULL, }; @@ -1605,13 +1795,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { */ void __init page_writeback_init(void) { - int shift; - writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - shift = calc_period_shift(); - prop_descriptor_init(&vm_completions, shift); + fprop_global_init(&writeout_completions); } /** @@ -1935,6 +2122,8 @@ int __set_page_dirty_no_writeback(struct page *page) */ void account_page_dirtied(struct page *page, struct address_space *mapping) { + trace_writeback_dirty_page(page, mapping); + if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); @@ -1949,11 +2138,17 @@ EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for set_page_writeback family. + * + * The caller must hold mem_cgroup_begin/end_update_page_stat() lock + * while calling this function. + * See test_set_page_writeback for example. + * * NOTE: Unlike account_page_dirtied this does not rely on being atomic * wrt interrupts. */ void account_page_writeback(struct page *page) { + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } EXPORT_SYMBOL(account_page_writeback); @@ -1978,11 +2173,12 @@ int __set_page_dirty_nobuffers(struct page *page) if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; + unsigned long flags; if (!mapping) return 1; - spin_lock_irq(&mapping->tree_lock); + spin_lock_irqsave(&mapping->tree_lock, flags); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? 
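The locking rule added to account_page_writeback() above has the same shape at every call site; a condensed sketch of the expected caller pattern, mirroring test_set_page_writeback() further down:

	bool locked;
	unsigned long memcg_flags;

	mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
	if (!TestSetPageWriteback(page))
		account_page_writeback(page);
	mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);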
*/ BUG_ON(mapping2 != mapping); @@ -1991,7 +2187,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -2170,7 +2366,10 @@ int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); int ret; + bool locked; + unsigned long memcg_flags; + mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2191,9 +2390,11 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } + mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); return ret; } @@ -2201,7 +2402,10 @@ int test_set_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); int ret; + bool locked; + unsigned long memcg_flags; + mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2228,6 +2432,7 @@ int test_set_page_writeback(struct page *page) } if (!ret) account_page_writeback(page); + mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); return ret; } @@ -2242,3 +2447,23 @@ int mapping_tagged(struct address_space *mapping, int tag) return radix_tree_tagged(&mapping->page_tree, tag); } EXPORT_SYMBOL(mapping_tagged); + +/** + * wait_for_stable_page() - wait for writeback to finish, if necessary. + * @page: The page to wait on. + * + * This function determines if the given page is related to a backing device + * that requires page contents to be held stable during writeback. If so, then + * it will wait for any pending writeback to complete. 
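A typical consumer of wait_for_stable_page() is a filesystem's ->page_mkwrite() handler: once userspace may dirty the page through a mapping, any checksum or RAID parity computed over the in-flight copy could be invalidated. A sketch of the call-site shape (a generic mkwrite path is assumed here):

	lock_page(page);
	/* only blocks if the backing device demands stable pages */
	wait_for_stable_page(page);
	/* safe to mark the page writable and let it be redirtied */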
+ */ +void wait_for_stable_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + struct backing_dev_info *bdi = mapping->backing_dev_info; + + if (!bdi_cap_stable_pages_required(bdi)) + return; + + wait_on_page_writeback(page); +} +EXPORT_SYMBOL_GPL(wait_for_stable_page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 918330f71dba..5dba2933c9c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -51,18 +51,25 @@ #include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> -#include <linux/memory.h> #include <linux/compaction.h> #include <trace/events/kmem.h> #include <linux/ftrace_event.h> #include <linux/memcontrol.h> #include <linux/prefetch.h> +#include <linux/mm_inline.h> +#include <linux/migrate.h> #include <linux/page-debug-flags.h> +#include <linux/hugetlb.h> +#include <linux/sched/rt.h> +#include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -90,11 +97,17 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = { { [0] = 1UL } }, #endif +#ifdef CONFIG_MOVABLE_NODE + [N_MEMORY] = { { [0] = 1UL } }, +#endif [N_CPU] = { { [0] = 1UL } }, #endif /* NUMA */ }; EXPORT_SYMBOL(node_states); +/* Protect totalram_pages and zone->managed_pages */ +static DEFINE_SPINLOCK(managed_page_count_lock); + unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; /* @@ -192,6 +205,7 @@ static char * const zone_names[MAX_NR_ZONES] = { }; int min_free_kbytes = 1024; +int user_min_free_kbytes = -1; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -218,10 +232,10 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -static void set_pageblock_migratetype(struct page *page, int migratetype) +void set_pageblock_migratetype(struct page *page, int migratetype) { - - if (unlikely(page_group_by_mobility_disabled)) + if (unlikely(page_group_by_mobility_disabled && + migratetype < MIGRATE_PCPTYPES)) migratetype = MIGRATE_UNMOVABLE; set_pageblock_flags_group(page, (unsigned long)migratetype, @@ -236,15 +250,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) int ret = 0; unsigned seq; unsigned long pfn = page_to_pfn(page); + unsigned long sp, start_pfn; do { seq = zone_span_seqbegin(zone); - if (pfn >= zone->zone_start_pfn + zone->spanned_pages) - ret = 1; - else if (pfn < zone->zone_start_pfn) + start_pfn = zone->zone_start_pfn; + sp = zone->spanned_pages; + if (!zone_spans_pfn(zone, pfn)) ret = 1; } while (zone_span_seqretry(zone, seq)); + if (ret) + pr_err("page %lu outside zone [ %lu - %lu ]\n", + pfn, start_pfn, start_pfn + sp); + return ret; } @@ -276,7 +295,8 @@ static inline int bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page) +static void bad_page(struct page *page, const char *reason, + unsigned long bad_flags) { static unsigned long resume; static unsigned long nr_shown; @@ -284,7 +304,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ return; } @@ -310,14 +330,14 @@ 
static void bad_page(struct page *page) printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - dump_page(page); + dump_page_badflags(page, reason, bad_flags); print_modules(); dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - reset_page_mapcount(page); /* remove PageBuddy */ - add_taint(TAINT_BAD_PAGE); + page_mapcount_reset(page); /* remove PageBuddy */ + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } /* @@ -350,9 +370,11 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); set_page_count(p, 0); p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); } } @@ -363,9 +385,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) int nr_pages = 1 << order; int bad = 0; - if (unlikely(compound_order(page) != order) || - unlikely(!PageHead(page))) { - bad_page(page); + if (unlikely(compound_order(page) != order)) { + bad_page(page, "wrong compound order", 0); bad++; } @@ -374,8 +395,11 @@ static int destroy_compound_page(struct page *page, unsigned long order) for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - if (unlikely(!PageTail(p) || (p->first_page != page))) { - bad_page(page); + if (unlikely(!PageTail(p))) { + bad_page(page, "PageTail not set", 0); + bad++; + } else if (unlikely(p->first_page != page)) { + bad_page(page, "first_page not consistent", 0); bad++; } __ClearPageTail(p); @@ -471,8 +495,10 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we set ->_mapcount -2. - * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. + * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is + * serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -486,12 +512,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; if (page_is_guard(buddy) && page_order(buddy) == order) { - VM_BUG_ON(page_count(buddy) != 0); + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { - VM_BUG_ON(page_count(buddy) != 0); + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return 1; } return 0; @@ -510,15 +536,16 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with _mapcount -2. Page's - * order is recorded in page_private(page) field. + * free pages of length of (1 << order) and marked with _mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) + * field. * So when we are allocating or freeing one, we can derive the state of the - * other. That is, if we allocate a small block, and both were - * free, the remainder of the region must be split into blocks. + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. 
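The buddy test described above leans on __find_buddy_index(), which is nothing more than flipping bit 'order' of the page index; two order-n blocks are buddies exactly when their indices differ in that single bit:

	static inline unsigned long
	__find_buddy_index(unsigned long page_idx, unsigned int order)
	{
		return page_idx ^ (1 << order);
	}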
* If a block is freed, and its buddy is also free, then this - * triggers coalescing into a block of larger size. + * triggers coalescing into a block of larger size. * - * -- wli + * -- nyc */ static inline void __free_one_page(struct page *page, @@ -530,6 +557,8 @@ static inline void __free_one_page(struct page *page, unsigned long uninitialized_var(buddy_idx); struct page *buddy; + VM_BUG_ON(!zone_is_initialized(zone)); + if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) return; @@ -538,8 +567,8 @@ static inline void __free_one_page(struct page *page, page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - VM_BUG_ON(page_idx & ((1 << order) - 1)); - VM_BUG_ON(bad_range(zone, page)); + VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(bad_range(zone, page), page); while (order < MAX_ORDER-1) { buddy_idx = __find_buddy_index(page_idx, order); @@ -553,7 +582,8 @@ static inline void __free_one_page(struct page *page, if (page_is_guard(buddy)) { clear_page_guard_flag(buddy); set_page_private(page, 0); - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + __mod_zone_freepage_state(zone, 1 << order, + migratetype); } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -579,7 +609,7 @@ static inline void __free_one_page(struct page *page, combined_idx = buddy_idx & page_idx; higher_page = page + (combined_idx - page_idx); buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = page + (buddy_idx - combined_idx); + higher_buddy = higher_page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -592,27 +622,28 @@ out: zone->free_area[order].nr_free++; } -/* - * free_page_mlock() -- clean up attempts to free and mlocked() page. - * Page should not be on lru, so no need to fix that up. - * free_pages_check() will verify... 
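The switch from page to higher_page in the tail-merge probe above is a real fix, not a cleanup: the index returned for order + 1 is relative to the combined block, not to the block being freed. Worked example: freeing an order-1 block at index 6 gives buddy_idx = 4 and combined_idx = 4, so higher_page sits at index 4; its order-2 buddy is at 4 ^ 4 = 0, i.e. higher_page + (0 - 4). The old expression page + (0 - 4) would have probed index 2, a page inside the block just merged, and the freed block could land at the wrong end of the free list.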
- */ -static inline void free_page_mlock(struct page *page) -{ - __dec_zone_page_state(page, NR_MLOCK); - __count_vm_event(UNEVICTABLE_MLOCKFREED); -} - static inline int free_pages_check(struct page *page) { - if (unlikely(page_mapcount(page) | - (page->mapping != NULL) | - (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | - (mem_cgroup_bad_page_check(page)))) { - bad_page(page); + const char *bad_reason = NULL; + unsigned long bad_flags = 0; + + if (unlikely(page_mapcount(page))) + bad_reason = "nonzero mapcount"; + if (unlikely(page->mapping != NULL)) + bad_reason = "non-NULL mapping"; + if (unlikely(atomic_read(&page->_count) != 0)) + bad_reason = "nonzero _count"; + if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; + bad_flags = PAGE_FLAGS_CHECK_AT_FREE; + } + if (unlikely(mem_cgroup_bad_page_check(page))) + bad_reason = "cgroup check failed"; + if (unlikely(bad_reason)) { + bad_page(page, bad_reason, bad_flags); return 1; } + page_cpupid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -637,7 +668,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, int to_free = count; spin_lock(&zone->lock); - zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (to_free) { @@ -663,15 +693,22 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = to_free; do { + int mt; /* migratetype of the to-be-freed page */ + page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); + mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, page_private(page)); - trace_mm_page_pcpu_drain(page, 0, page_private(page)); + __free_one_page(page, zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + if (likely(!is_migrate_isolate_page(page))) { + __mod_zone_page_state(zone, NR_FREE_PAGES, 1); + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); + } } while (--to_free && --batch_free && !list_empty(list)); } - __mod_zone_page_state(zone, NR_FREE_PAGES, count); spin_unlock(&zone->lock); } @@ -679,11 +716,11 @@ static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { spin_lock(&zone->lock); - zone->all_unreclaimable = 0; zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + if (unlikely(!is_migrate_isolate(migratetype))) + __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -703,7 +740,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) return false; if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE << order); debug_check_no_obj_freed(page_address(page), PAGE_SIZE << order); } @@ -716,39 +754,57 @@ static bool free_pages_prepare(struct page *page, unsigned int order) static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; - int wasMlocked = __TestClearPageMlocked(page); + int migratetype; if (!free_pages_prepare(page, order)) return; local_irq_save(flags); - if (unlikely(wasMlocked)) - free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, order, - get_pageblock_migratetype(page)); + migratetype = 
get_pageblock_migratetype(page); + set_freepage_migratetype(page, migratetype); + free_one_page(page_zone(page), page, order, migratetype); local_irq_restore(flags); } -void __meminit __free_pages_bootmem(struct page *page, unsigned int order) +void __init __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; + struct page *p = page; unsigned int loop; - prefetchw(page); - for (loop = 0; loop < nr_pages; loop++) { - struct page *p = &page[loop]; - - if (loop + 1 < nr_pages) - prefetchw(p + 1); + prefetchw(p); + for (loop = 0; loop < (nr_pages - 1); loop++, p++) { + prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); } + __ClearPageReserved(p); + set_page_count(p, 0); + page_zone(page)->managed_pages += nr_pages; set_page_refcounted(page); __free_pages(page, order); } +#ifdef CONFIG_CMA +/* Free whole pageblock and set its migration type to MIGRATE_CMA. */ +void __init init_cma_reserved_pageblock(struct page *page) +{ + unsigned i = pageblock_nr_pages; + struct page *p = page; + + do { + __ClearPageReserved(p); + set_page_count(p, 0); + } while (++p, --i); + + set_page_refcounted(page); + set_pageblock_migratetype(page, MIGRATE_CMA); + __free_pages(page, pageblock_order); + adjust_managed_page_count(page, pageblock_nr_pages); +} +#endif /* * The order of subdivision here is critical for the IO subsystem. @@ -762,7 +818,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) * large block of memory acted on by a series of small allocations. * This behavior is a critical factor in sglist merging's success. * - * -- wli + * -- nyc */ static inline void expand(struct zone *zone, struct page *page, int low, int high, struct free_area *area, @@ -774,7 +830,7 @@ static inline void expand(struct zone *zone, struct page *page, area--; high--; size >>= 1; - VM_BUG_ON(bad_range(zone, &page[size])); + VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); #ifdef CONFIG_DEBUG_PAGEALLOC if (high < debug_guardpage_minorder()) { @@ -788,7 +844,8 @@ static inline void expand(struct zone *zone, struct page *page, set_page_guard_flag(&page[size]); set_page_private(&page[size], high); /* Guard pages are not available for any usage */ - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + __mod_zone_freepage_state(zone, -(1 << high), + migratetype); continue; } #endif @@ -803,12 +860,23 @@ static inline void expand(struct zone *zone, struct page *page, */ static inline int check_new_page(struct page *page) { - if (unlikely(page_mapcount(page) | - (page->mapping != NULL) | - (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | - (mem_cgroup_bad_page_check(page)))) { - bad_page(page); + const char *bad_reason = NULL; + unsigned long bad_flags = 0; + + if (unlikely(page_mapcount(page))) + bad_reason = "nonzero mapcount"; + if (unlikely(page->mapping != NULL)) + bad_reason = "non-NULL mapping"; + if (unlikely(atomic_read(&page->_count) != 0)) + bad_reason = "nonzero _count"; + if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; + bad_flags = PAGE_FLAGS_CHECK_AT_PREP; + } + if (unlikely(mem_cgroup_bad_page_check(page))) + bad_reason = "cgroup check failed"; + if (unlikely(bad_reason)) { + bad_page(page, bad_reason, bad_flags); return 1; } return 0; @@ -848,7 +916,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { unsigned int current_order; - struct free_area * area; + struct free_area *area; struct 
page *page; /* Find a page of the appropriate size in the preferred list */ @@ -874,11 +942,19 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted */ -static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ +static int fallbacks[MIGRATE_TYPES][4] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, +#ifdef CONFIG_CMA + [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ +#else + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, +#endif + [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ +#ifdef CONFIG_MEMORY_ISOLATION + [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ +#endif }; /* @@ -886,7 +962,7 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -static int move_freepages(struct zone *zone, +int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, int migratetype) { @@ -907,7 +983,7 @@ static int move_freepages(struct zone *zone, for (page = start_page; page <= end_page;) { /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -922,6 +998,7 @@ static int move_freepages(struct zone *zone, order = page_order(page); list_move(&page->lru, &zone->free_area[order].free_list[migratetype]); + set_freepage_migratetype(page, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -929,7 +1006,7 @@ static int move_freepages(struct zone *zone, return pages_moved; } -static int move_freepages_block(struct zone *zone, struct page *page, +int move_freepages_block(struct zone *zone, struct page *page, int migratetype) { unsigned long start_pfn, end_pfn; @@ -942,9 +1019,9 @@ static int move_freepages_block(struct zone *zone, struct page *page, end_pfn = start_pfn + pageblock_nr_pages - 1; /* Do not cross zone boundaries */ - if (start_pfn < zone->zone_start_pfn) + if (!zone_spans_pfn(zone, start_pfn)) start_page = page; - if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) + if (!zone_spans_pfn(zone, end_pfn)) return 0; return move_freepages(zone, start_page, end_page, migratetype); @@ -961,24 +1038,74 @@ static void change_pageblock_range(struct page *pageblock_page, } } +/* + * If breaking a large block of pages, move all free pages to the preferred + * allocation list. If falling back for a reclaimable kernel allocation, be + * more aggressive about taking ownership of free pages. + * + * On the other hand, never change migration type of MIGRATE_CMA pageblocks + * nor move CMA pages to different free lists. We don't want unmovable pages + * to be allocated from MIGRATE_CMA areas. 
+ * + * Returns the new migratetype of the pageblock (or the same old migratetype + * if it was unchanged). + */ +static int try_to_steal_freepages(struct zone *zone, struct page *page, + int start_type, int fallback_type) +{ + int current_order = page_order(page); + + /* + * When borrowing from MIGRATE_CMA, we need to release the excess + * buddy pages to CMA itself. + */ + if (is_migrate_cma(fallback_type)) + return fallback_type; + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { + change_pageblock_range(page, current_order, start_type); + return start_type; + } + + if (current_order >= pageblock_order / 2 || + start_type == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled) { + int pages; + + pages = move_freepages_block(zone, page, start_type); + + /* Claim the whole block if over half of it is free */ + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) { + + set_pageblock_migratetype(page, start_type); + return start_type; + } + + } + + return fallback_type; +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { - struct free_area * area; + struct free_area *area; int current_order; struct page *page; - int migratetype, i; + int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order; --current_order) { - for (i = 0; i < MIGRATE_TYPES - 1; i++) { + for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; /* MIGRATE_RESERVE handled later if necessary */ if (migratetype == MIGRATE_RESERVE) - continue; + break; area = &(zone->free_area[current_order]); if (list_empty(&area->free_list[migratetype])) @@ -988,41 +1115,19 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) struct page, lru); area->nr_free--; - /* - * If breaking a large block of pages, move all free - * pages to the preferred allocation list. If falling - * back for a reclaimable kernel allocation, be more - * aggressive about taking ownership of free pages - */ - if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE || - page_group_by_mobility_disabled) { - unsigned long pages; - pages = move_freepages_block(zone, page, - start_migratetype); - - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) - set_pageblock_migratetype(page, - start_migratetype); - - migratetype = start_migratetype; - } + new_type = try_to_steal_freepages(zone, page, + start_migratetype, + migratetype); /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); - /* Take ownership for orders >= pageblock_order */ - if (current_order >= pageblock_order) - change_pageblock_range(page, current_order, - start_migratetype); - - expand(zone, page, order, current_order, area, migratetype); + expand(zone, page, order, current_order, area, + new_type); trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype); + start_migratetype, migratetype, new_type); return page; } @@ -1061,17 +1166,17 @@ retry_reserve: return page; } -/* +/* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. * Returns the number of new pages which were placed at *list. 
*/ -static int rmqueue_bulk(struct zone *zone, unsigned int order, +static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, int cold) { - int i; - + int mt = migratetype, i; + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); @@ -1091,8 +1196,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add(&page->lru, list); else list_add_tail(&page->lru, list); - set_page_private(page, migratetype); + if (IS_ENABLED(CONFIG_CMA)) { + mt = get_pageblock_migratetype(page); + if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) + mt = migratetype; + } + set_freepage_migratetype(page, mt); list = &page->lru; + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); } __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); @@ -1112,14 +1225,18 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; int to_drain; + unsigned long batch; local_irq_save(flags); - if (pcp->count >= pcp->batch) - to_drain = pcp->batch; + batch = ACCESS_ONCE(pcp->batch); + if (pcp->count >= batch) + to_drain = batch; else to_drain = pcp->count; - free_pcppages_bulk(zone, to_drain, pcp); - pcp->count -= to_drain; + if (to_drain > 0) { + free_pcppages_bulk(zone, to_drain, pcp); + pcp->count -= to_drain; + } local_irq_restore(flags); } #endif @@ -1213,12 +1330,12 @@ void mark_free_pages(struct zone *zone) int order, t; struct list_head *curr; - if (!zone->spanned_pages) + if (zone_is_empty(zone)) return; spin_lock_irqsave(&zone->lock, flags); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); @@ -1250,16 +1367,13 @@ void free_hot_cold_page(struct page *page, int cold) struct per_cpu_pages *pcp; unsigned long flags; int migratetype; - int wasMlocked = __TestClearPageMlocked(page); if (!free_pages_prepare(page, 0)) return; migratetype = get_pageblock_migratetype(page); - set_page_private(page, migratetype); + set_freepage_migratetype(page, migratetype); local_irq_save(flags); - if (unlikely(wasMlocked)) - free_page_mlock(page); __count_vm_event(PGFREE); /* @@ -1270,7 +1384,7 @@ void free_hot_cold_page(struct page *page, int cold) * excessively into the page allocator */ if (migratetype >= MIGRATE_PCPTYPES) { - if (unlikely(migratetype == MIGRATE_ISOLATE)) { + if (unlikely(is_migrate_isolate(migratetype))) { free_one_page(zone, page, 0, migratetype); goto out; } @@ -1284,8 +1398,9 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); - pcp->count -= pcp->batch; + unsigned long batch = ACCESS_ONCE(pcp->batch); + free_pcppages_bulk(zone, batch, pcp); + pcp->count -= batch; } out: @@ -1317,8 +1432,8 @@ void split_page(struct page *page, unsigned int order) { int i; - VM_BUG_ON(PageCompound(page)); - VM_BUG_ON(!page_count(page)); + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!page_count(page), page); #ifdef CONFIG_KMEMCHECK /* @@ -1332,6 +1447,46 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } +EXPORT_SYMBOL_GPL(split_page); + +static int __isolate_free_page(struct page *page, unsigned int order) 
+{ + unsigned long watermark; + struct zone *zone; + int mt; + + BUG_ON(!PageBuddy(page)); + + zone = page_zone(page); + mt = get_pageblock_migratetype(page); + + if (!is_migrate_isolate(mt)) { + /* Obey watermarks as if the page was being allocated */ + watermark = low_wmark_pages(zone) + (1 << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return 0; + + __mod_zone_freepage_state(zone, -(1UL << order), mt); + } + + /* Remove page from free list */ + list_del(&page->lru); + zone->free_area[order].nr_free--; + rmv_page_order(page); + + /* Set the pageblock if the isolated page is at least a pageblock */ + if (order >= pageblock_order - 1) { + struct page *endpage = page + (1 << order) - 1; + for (; page < endpage; page += pageblock_nr_pages) { + int mt = get_pageblock_migratetype(page); + if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) + set_pageblock_migratetype(page, + MIGRATE_MOVABLE); + } + } + + return 1UL << order; +} /* * Similar to split_page except the page is already free. As this is only @@ -1346,36 +1501,18 @@ void split_page(struct page *page, unsigned int order) int split_free_page(struct page *page) { unsigned int order; - unsigned long watermark; - struct zone *zone; - - BUG_ON(!PageBuddy(page)); + int nr_pages; - zone = page_zone(page); order = page_order(page); - /* Obey watermarks as if the page was being allocated */ - watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + nr_pages = __isolate_free_page(page, order); + if (!nr_pages) return 0; - /* Remove page from free list */ - list_del(&page->lru); - zone->free_area[order].nr_free--; - rmv_page_order(page); - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); - /* Split into individual pages */ set_page_refcounted(page); split_page(page, order); - - if (order >= pageblock_order - 1) { - struct page *endpage = page + (1 << order) - 1; - for (; page < endpage; page += pageblock_nr_pages) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - } - - return 1 << order; + return nr_pages; } /* @@ -1434,14 +1571,17 @@ again: spin_unlock(&zone->lock); if (!page) goto failed; - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); + __mod_zone_freepage_state(zone, -(1 << order), + get_pageblock_migratetype(page)); } + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); - VM_BUG_ON(bad_range(zone, page)); + VM_BUG_ON_PAGE(bad_range(zone, page), page); if (prep_new_page(page, order, gfp_flags)) goto again; return page; @@ -1451,19 +1591,6 @@ failed: return NULL; } -/* The ALLOC_WMARK bits are used as an index to zone->watermark */ -#define ALLOC_WMARK_MIN WMARK_MIN -#define ALLOC_WMARK_LOW WMARK_LOW -#define ALLOC_WMARK_HIGH WMARK_HIGH -#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ - -/* Mask to get the watermark bits */ -#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) - -#define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ - #ifdef CONFIG_FAIL_PAGE_ALLOC static struct { @@ -1485,16 +1612,16 @@ static int __init setup_fail_page_alloc(char *str) } __setup("fail_page_alloc=", setup_fail_page_alloc); -static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { if (order < fail_page_alloc.min_order) - return 0; 
+ return false; if (gfp_mask & __GFP_NOFAIL) - return 0; + return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) - return 0; + return false; if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) - return 0; + return false; return should_fail(&fail_page_alloc.attr, 1 << order); } @@ -1534,9 +1661,9 @@ late_initcall(fail_page_alloc_debugfs); #else /* CONFIG_FAIL_PAGE_ALLOC */ -static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { - return 0; + return false; } #endif /* CONFIG_FAIL_PAGE_ALLOC */ @@ -1550,15 +1677,22 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, { /* free_pages my go negative - that's OK */ long min = mark; + long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; + long free_cma = 0; free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); +#endif - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + if (free_pages - free_cma <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -1600,9 +1734,9 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, * comments in mmzone.h. Reduces cache footprint of zonelist scans * that have to skip over a lot of full or unallowed zones. * - * If the zonelist cache is present in the passed in zonelist, then + * If the zonelist cache is present in the passed zonelist, then * returns a pointer to the allowed node mask (either the current - * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) + * tasks mems_allowed, or node_states[N_MEMORY].) * * If the zonelist cache is not available for this zonelist, does * nothing and returns NULL. @@ -1631,7 +1765,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 
&cpuset_current_mems_allowed : - &node_states[N_HIGH_MEMORY]; + &node_states[N_MEMORY]; return allowednodes; } @@ -1709,6 +1843,27 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); } +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ + return local_zone->node == zone->node; +} + +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); +} + +static void __paginginit init_zone_allows_reclaim(int nid) +{ + int i; + + for_each_node_state(i, N_MEMORY) + if (node_distance(nid, i) <= RECLAIM_DISTANCE) + node_set(i, NODE_DATA(nid)->reclaim_nodes); + else + zone_reclaim_mode = 1; +} + #else /* CONFIG_NUMA */ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) @@ -1729,6 +1884,20 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) static void zlc_clear_zones_full(struct zonelist *zonelist) { } + +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ + return true; +} + +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return true; +} + +static inline void init_zone_allows_reclaim(int nid) +{ +} #endif /* CONFIG_NUMA */ /* @@ -1752,16 +1921,33 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - if (NUMA_BUILD && zlc_active && + unsigned long mark; + + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) + goto try_this_zone; + /* + * Distribute pages in proportion to the individual + * zone size to ensure fair page aging. The zone a + * page was allocated in should have no effect on the + * time the page has in memory before being reclaimed. 
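The fairness pass gated on ALLOC_FAIR below spends a per-zone budget kept in NR_ALLOC_BATCH. The budget is sized to the gap between the watermarks and drops by 1 << order on every allocation, so a zone with low_wmark_pages() = 1000 and high_wmark_pages() = 1200 hands out at most 200 pages per round before the round-robin moves on; only once every local zone has exhausted its batch are the counters replenished (see reset_alloc_batches() further down) and remote zones considered.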
+ */ + if (alloc_flags & ALLOC_FAIR) { + if (!zone_local(preferred_zone, zone)) + continue; + if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + continue; + } /* * When allocating a page cache page for writing, we * want to get it from a zone that is within its dirty @@ -1792,17 +1978,13 @@ zonelist_scan: (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) goto this_zone_full; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { - unsigned long mark; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) { int ret; - mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) - goto try_this_zone; - - if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { + if (IS_ENABLED(CONFIG_NUMA) && + !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup if there are multiple nodes * and before considering the first zone allowed @@ -1813,14 +1995,15 @@ zonelist_scan: did_zlc_setup = 1; } - if (zone_reclaim_mode == 0) + if (zone_reclaim_mode == 0 || + !zone_allows_reclaim(preferred_zone, zone)) goto this_zone_full; /* * As we may have just activated ZLC, check if the first * eligible zone has failed zone_reclaim recently. */ - if (NUMA_BUILD && zlc_active && + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; @@ -1834,9 +2017,24 @@ zonelist_scan: continue; default: /* did we reclaim enough */ - if (!zone_watermark_ok(zone, order, mark, + if (zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) + goto try_this_zone; + + /* + * Failed to reclaim enough to meet watermark. + * Only mark the zone full if checking the min + * watermark or if we failed to reclaim just + * 1<<order pages or else the page allocator + * fastpath will prematurely mark zones full + * when the watermark is between the low and + * min watermarks. + */ + if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || + ret == ZONE_RECLAIM_SOME) goto this_zone_full; + + continue; } } @@ -1846,15 +2044,26 @@ try_this_zone: if (page) break; this_zone_full: - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) zlc_mark_zone_full(zonelist, z); } - if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { + if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { /* Disable zlc cache for second zonelist scan */ zlc_active = 0; goto zonelist_scan; } + + if (page) + /* + * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was + * necessary to allocate the page. The expectation is + * that the caller is taking steps that will free more + * memory. The caller should avoid the page being used + * for !PFMEMALLOC purposes. 
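Enforcement of that rule is left to the consumers of the flag; for instance, the slab allocators of this era tag slabs built from reserve pages and then ration those objects to memalloc contexts, roughly:

	/* in the slab page-allocation path (helper names as in 3.x slab/slub) */
	if (unlikely(page->pfmemalloc))
		SetPageSlabPfmemalloc(page);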
+ */ + page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; } @@ -2018,11 +2227,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int migratetype, bool sync_migration, - bool *deferred_compaction, + bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { - struct page *page; - if (!order) return NULL; @@ -2033,9 +2240,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration); + nodemask, sync_migration, + contended_compaction); current->flags &= ~PF_MEMALLOC; + if (*did_some_progress != COMPACT_SKIPPED) { + struct page *page; /* Page migration frees to the PCP lists but we want merging */ drain_pages(get_cpu()); @@ -2043,13 +2253,11 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, - alloc_flags, preferred_zone, - migratetype); + alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); if (page) { - preferred_zone->compact_considered = 0; - preferred_zone->compact_defer_shift = 0; - if (order >= preferred_zone->compact_order_failed) - preferred_zone->compact_order_failed = order + 1; + preferred_zone->compact_blockskip_flush = false; + compaction_defer_reset(preferred_zone, order, true); count_vm_event(COMPACTSUCCESS); return page; } @@ -2079,23 +2287,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int migratetype, bool sync_migration, - bool *deferred_compaction, + bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; } #endif /* CONFIG_COMPACTION */ -/* The really slow allocator path where we enter direct reclaim */ -static inline struct page * -__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) +/* Perform direct synchronous page reclaim */ +static int +__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, + nodemask_t *nodemask) { - struct page *page = NULL; struct reclaim_state reclaim_state; - bool drained = false; + int progress; cond_resched(); @@ -2106,7 +2311,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; - *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -2114,18 +2319,33 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, cond_resched(); + return progress; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page = NULL; + bool drained = 
false; + + *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, + nodemask); if (unlikely(!(*did_some_progress))) return NULL; /* After successful reclaim, reconsider all zones for allocation */ - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) zlc_clear_zones_full(zonelist); retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, - alloc_flags, preferred_zone, - migratetype); + alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); /* * If an allocation failed after direct reclaim, it could be because @@ -2164,16 +2384,38 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static inline -void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx, - enum zone_type classzone_idx) +static void reset_alloc_batches(struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) +{ + struct zoneref *z; + struct zone *zone; + + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + /* + * Only reset the batches of zones that were actually + * considered in the fairness pass, we don't want to + * trash fairness information for zones that are not + * actually part of this zonelist's round-robin cycle. + */ + if (!zone_local(preferred_zone, zone)) + continue; + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + } +} + +static void wake_all_kswapds(unsigned int order, + struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order, classzone_idx); + wakeup_kswapd(zone, order, zone_idx(preferred_zone)); } static inline int @@ -2209,15 +2451,27 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { - if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) + if (gfp_mask & __GFP_MEMALLOC) + alloc_flags |= ALLOC_NO_WATERMARKS; + else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) + alloc_flags |= ALLOC_NO_WATERMARKS; + else if (!in_interrupt() && + ((current->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } - +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif return alloc_flags; } +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +{ + return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2231,6 +2485,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; bool sync_migration = false; bool deferred_compaction = false; + bool contended_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2251,13 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. 
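The equality test against GFP_THISNODE below works because GFP_THISNODE is not a single bit but the exact flag combination used for opportunistic node-local attempts; a caller that merely ORs in __GFP_THISNODE still gets the full slowpath:

	#ifdef CONFIG_NUMA
	#define GFP_THISNODE	(__GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN)
	#else
	#define GFP_THISNODE	((__force gfp_t)0)
	#endif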
*/ - if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (IS_ENABLED(CONFIG_NUMA) && + (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; restart: if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); /* * OK, we're below the kswapd watermark and have kicked background @@ -2284,16 +2539,31 @@ rebalance: /* Allocate without watermarks if the context allows */ if (alloc_flags & ALLOC_NO_WATERMARKS) { + /* + * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds + * the allocation is high priority and these type of + * allocations are system rather than user orientated + */ + zonelist = node_zonelist(numa_node_id(), gfp_mask); + page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - if (page) + if (page) { goto got_pg; + } } /* Atomic allocations - we can't balance anything */ - if (!wait) + if (!wait) { + /* + * All existing users of the deprecated __GFP_NOFAIL are + * blockable, so warn of any new users that actually allow this + * type of allocation to fail. + */ + WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); goto nopage; + } /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) @@ -2312,6 +2582,7 @@ rebalance: nodemask, alloc_flags, preferred_zone, migratetype, sync_migration, + &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2321,10 +2592,11 @@ rebalance: /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently failed. In this is the case and the caller - * has requested the system not be heavily disrupted, fail the - * allocation now instead of entering direct reclaim + * requested a movable allocation that does not heavily disrupt the + * system then fail the allocation instead of entering direct reclaim. */ - if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + if ((deferred_compaction || contended_compaction) && + (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; /* Try direct reclaim and then allocating */ @@ -2341,7 +2613,7 @@ rebalance: * running out of options and have to consider going OOM */ if (!did_some_progress) { - if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_gfp_allowed(gfp_mask)) { if (oom_killer_disabled) goto nopage; /* Coredumps can quickly deplete all memory reserves */ @@ -2395,6 +2667,7 @@ rebalance: nodemask, alloc_flags, preferred_zone, migratetype, sync_migration, + &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -2407,8 +2680,8 @@ nopage: got_pg: if (kmemcheck_enabled) kmemcheck_pagealloc_alloc(page, order, gfp_mask); - return page; + return page; } /* @@ -2423,6 +2696,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2441,8 +2716,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + /* + * Will only have any effect when __GFP_KMEMCG is set. 
This is + * verified in the (always inline) callee + */ + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, @@ -2451,14 +2733,42 @@ retry_cpuset: if (!preferred_zone) goto out; +#ifdef CONFIG_CMA + if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif +retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, + zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); - if (unlikely(!page)) + if (unlikely(!page)) { + /* + * The first pass makes sure allocations are spread + * fairly within the local node. However, the local + * node might have free pages left after the fairness + * batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without + * fairness, and include remote zones now, before + * entering the slowpath and waking kswapd: prefer + * spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + reset_alloc_batches(zonelist, high_zoneidx, + preferred_zone); + alloc_flags &= ~ALLOC_FAIR; + goto retry; + } + /* + * Runtime PM, block IO and its error handling path + * can deadlock because I/O on the device might not + * complete. + */ + gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + } trace_mm_page_alloc(page, order, gfp_mask, migratetype); @@ -2469,9 +2779,11 @@ out: * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. */ - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; + memcg_kmem_commit_charge(page, memcg, order); + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2524,6 +2836,31 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/* + * __free_memcg_kmem_pages and free_memcg_kmem_pages will free + * pages allocated with __GFP_KMEMCG. + * + * Those pages are accounted to a particular memcg, embedded in the + * corresponding page_cgroup. To avoid adding a hit in the allocator to search + * for that information only to find out that it is NULL for users who have no + * interest in that whatsoever, we provide these functions. + * + * The caller knows better which flags it relies on. 
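The charge and uncharge are meant to bracket the page's lifetime; roughly how the SLUB large-kmalloc path of this series pairs them (the uncharge is folded into the free helper defined just below):

	/* allocate: memcg_kmem_newpage_charge() runs inside the allocator */
	page = alloc_pages(flags | __GFP_COMP | __GFP_KMEMCG, order);

	/* free: uncharge the memcg, then release the pages themselves */
	__free_memcg_kmem_pages(page, compound_order(page));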
+ */ +void __free_memcg_kmem_pages(struct page *page, unsigned int order) +{ + memcg_kmem_uncharge_pages(page, order); + __free_pages(page, order); +} + +void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + } +} + static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) { if (addr) { @@ -2603,18 +2940,27 @@ void free_pages_exact(void *virt, size_t size) } EXPORT_SYMBOL(free_pages_exact); -static unsigned int nr_free_zone_pages(int offset) +/** + * nr_free_zone_pages - count number of pages beyond high watermark + * @offset: The zone index of the highest zone + * + * nr_free_zone_pages() counts the number of pages which are beyond the + * high watermark within all zones at or below a given zone index. For each + * zone, the number of pages is calculated as: + * managed_pages - high_pages + */ +static unsigned long nr_free_zone_pages(int offset) { struct zoneref *z; struct zone *zone; /* Just pick one node, since fallback list is circular */ - unsigned int sum = 0; + unsigned long sum = 0; struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); for_each_zone_zonelist(zone, z, zonelist, offset) { - unsigned long size = zone->present_pages; + unsigned long size = zone->managed_pages; unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; @@ -2623,26 +2969,32 @@ static unsigned int nr_free_zone_pages(int offset) return sum; } -/* - * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL +/** + * nr_free_buffer_pages - count number of pages beyond high watermark + * + * nr_free_buffer_pages() counts the number of pages which are beyond the high + * watermark within ZONE_DMA and ZONE_NORMAL. */ -unsigned int nr_free_buffer_pages(void) +unsigned long nr_free_buffer_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_USER)); } EXPORT_SYMBOL_GPL(nr_free_buffer_pages); -/* - * Amount of free RAM allocatable within all zones +/** + * nr_free_pagecache_pages - count number of pages beyond high watermark + * + * nr_free_pagecache_pages() counts the number of pages which are beyond the + * high watermark within all zones. 
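Concretely, the sum is max(managed_pages - high_pages, 0) over the eligible zones: a node whose DMA32 zone has 262144 managed pages with a high watermark of 1024 and whose Normal zone has 1048576 managed pages with a high watermark of 4096 reports (262144 - 1024) + (1048576 - 4096) = 1305600 pages from nr_free_buffer_pages().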
*/ -unsigned int nr_free_pagecache_pages(void) +unsigned long nr_free_pagecache_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); } static inline void show_node(struct zone *zone) { - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) printk("Node %d ", zone_to_nid(zone)); } @@ -2662,12 +3014,16 @@ EXPORT_SYMBOL(si_meminfo); #ifdef CONFIG_NUMA void si_meminfo_node(struct sysinfo *val, int nid) { + int zone_type; /* needs to be signed */ + unsigned long managed_pages = 0; pg_data_t *pgdat = NODE_DATA(nid); - val->totalram = pgdat->node_present_pages; + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += pgdat->node_zones[zone_type].managed_pages; + val->totalram = managed_pages; val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM - val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; + val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], NR_FREE_PAGES); #else @@ -2691,15 +3047,42 @@ bool skip_free_areas_node(unsigned int flags, int nid) goto out; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); out: return ret; } #define K(x) ((x) << (PAGE_SHIFT-10)) +static void show_migration_types(unsigned char type) +{ + static const char types[MIGRATE_TYPES] = { + [MIGRATE_UNMOVABLE] = 'U', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_MOVABLE] = 'M', + [MIGRATE_RESERVE] = 'R', +#ifdef CONFIG_CMA + [MIGRATE_CMA] = 'C', +#endif +#ifdef CONFIG_MEMORY_ISOLATION + [MIGRATE_ISOLATE] = 'I', +#endif + }; + char tmp[MIGRATE_TYPES + 1]; + char *p = tmp; + int i; + + for (i = 0; i < MIGRATE_TYPES; i++) { + if (type & (1 << i)) + *p++ = types[i]; + } + + *p = '\0'; + printk("(%s) ", tmp); +} + /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the @@ -2734,7 +3117,8 @@ void show_free_areas(unsigned int filter) " unevictable:%lu" " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_ISOLATED_ANON), @@ -2751,7 +3135,8 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FILE_MAPPED), global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE)); + global_page_state(NR_BOUNCE), + global_page_state(NR_FREE_CMA_PAGES)); for_each_populated_zone(zone) { int i; @@ -2772,6 +3157,7 @@ void show_free_areas(unsigned int filter) " isolated(anon):%lukB" " isolated(file):%lukB" " present:%lukB" + " managed:%lukB" " mlocked:%lukB" " dirty:%lukB" " writeback:%lukB" @@ -2783,6 +3169,7 @@ void show_free_areas(unsigned int filter) " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" + " free_cma:%lukB" " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? 
%s" @@ -2800,6 +3187,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_ISOLATED_ANON)), K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), + K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_FILE_DIRTY)), K(zone_page_state(zone, NR_WRITEBACK)), @@ -2812,9 +3200,10 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, - (zone->all_unreclaimable ? "yes" : "no") + (!zone_reclaimable(zone) ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) @@ -2823,7 +3212,8 @@ void show_free_areas(unsigned int filter) } for_each_populated_zone(zone) { - unsigned long nr[MAX_ORDER], flags, order, total = 0; + unsigned long nr[MAX_ORDER], flags, order, total = 0; + unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; @@ -2832,15 +3222,29 @@ void show_free_areas(unsigned int filter) spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - nr[order] = zone->free_area[order].nr_free; + struct free_area *area = &zone->free_area[order]; + int type; + + nr[order] = area->nr_free; total += nr[order] << order; + + types[order] = 0; + for (type = 0; type < MIGRATE_TYPES; type++) { + if (!list_empty(&area->free_list[type])) + types[order] |= 1 << type; + } } spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) + for (order = 0; order < MAX_ORDER; order++) { printk("%lu*%lukB ", nr[order], K(1UL) << order); + if (nr[order]) + show_migration_types(types[order]); + } printk("= %lukB\n", K(total)); } + hugetlb_show_meminfo(); + printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); show_swap_cache_info(); @@ -2858,12 +3262,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * Add all populated zones of a node to the zonelist. */ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones, enum zone_type zone_type) + int nr_zones) { struct zone *zone; - - BUG_ON(zone_type >= MAX_NR_ZONES); - zone_type++; + enum zone_type zone_type = MAX_NR_ZONES; do { zone_type--; @@ -2873,8 +3275,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } - } while (zone_type); + return nr_zones; } @@ -2958,23 +3360,30 @@ int numa_zonelist_order_handler(ctl_table *table, int write, static DEFINE_MUTEX(zl_order_mutex); mutex_lock(&zl_order_mutex); - if (write) - strcpy(saved_string, (char*)table->data); + if (write) { + if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { + ret = -EINVAL; + goto out; + } + strcpy(saved_string, (char *)table->data); + } ret = proc_dostring(table, write, buffer, length, ppos); if (ret) goto out; if (write) { int oldval = user_zonelist_order; - if (__parse_numa_zonelist_order((char*)table->data)) { + + ret = __parse_numa_zonelist_order((char *)table->data); + if (ret) { /* * bogus value. 
 restore saved string */ - strncpy((char*)table->data, saved_string, + strncpy((char *)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } } @@ -3005,7 +3414,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; - int best_node = -1; + int best_node = NUMA_NO_NODE; const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ @@ -3014,7 +3423,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) return node; } - for_each_node_state(n, N_HIGH_MEMORY) { + for_each_node_state(n, N_MEMORY) { /* Don't want a node to appear more than once */ if (node_isset(n, *used_node_mask)) @@ -3061,8 +3470,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) zonelist = &pgdat->node_zonelists[0]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3076,7 +3484,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[1]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3116,11 +3524,11 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) static int default_zonelist_order(void) { int nid, zone_type; - unsigned long low_kmem_size,total_size; + unsigned long low_kmem_size, total_size; struct zone *z; int average_size; /* - * ZONE_DMA and ZONE_DMA32 can be very small area in the system. + * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. * If they are really small and used heavily, the system can fall * into OOM very easily. * This function detects ZONE_DMA/DMA32 size and configures zone order. @@ -3133,8 +3541,8 @@ static int default_zonelist_order(void) z = &NODE_DATA(nid)->node_zones[zone_type]; if (populated_zone(z)) { if (zone_type < ZONE_NORMAL) - low_kmem_size += z->present_pages; - total_size += z->present_pages; + low_kmem_size += z->managed_pages; + total_size += z->managed_pages; } else if (zone_type == ZONE_NORMAL) { /* * If any node has only lowmem, then node order @@ -3152,11 +3560,11 @@ static int default_zonelist_order(void) return ZONELIST_ORDER_NODE; /* * look into each node's config. - * If there is a node whose DMA/DMA32 memory is very big area on - * local memory, NODE_ORDER may be suitable. - */ + * If there is a node whose DMA/DMA32 memory is a very big area of + * local memory, NODE_ORDER may be suitable. + */ average_size = total_size / - (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); + (nodes_weight(node_states[N_MEMORY]) + 1); for_each_online_node(nid) { low_kmem_size = 0; total_size = 0; @@ -3210,21 +3618,13 @@ static void build_zonelists(pg_data_t *pgdat) j = 0; while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { - int distance = node_distance(local_node, node); - - /* - * If another node is sufficiently far away then it is better - * to reclaim pages in a zone before going off node. 
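The ordering decision in the default_zonelist_order() hunk above amounts to comparing how much of the machine's memory is low (DMA/DMA32) memory. A toy model of that comparison follows; the threshold and names are invented for illustration and do not match the kernel's exact cutoffs:

#include <stdio.h>

enum zl_order { ORDER_ZONE, ORDER_NODE };

/* Toy decision: when low memory is scarce relative to the whole machine,
 * order zonelists by zone so remote normal memory is preferred over local
 * DMA/DMA32 memory; otherwise node order is acceptable. Threshold invented. */
static enum zl_order pick_order(unsigned long low_kmem_size,
				unsigned long total_size)
{
	if (low_kmem_size && low_kmem_size * 2 < total_size)
		return ORDER_ZONE;	/* protect the small DMA zones */
	return ORDER_NODE;
}

int main(void)
{
	printf("%s\n", pick_order(1024, 262144) == ORDER_ZONE ?
	       "zone order" : "node order");
	return 0;
}
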
- */ - if (distance > RECLAIM_DISTANCE) - zone_reclaim_mode = 1; - /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (distance != node_distance(local_node, prev_node)) + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) node_load[node] = load; prev_node = node; @@ -3292,7 +3692,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; zonelist = &pgdat->node_zonelists[0]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); /* * Now we build the zonelist so that it contains the zones @@ -3305,14 +3705,12 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } zonelist->_zonerefs[j].zone = NULL; @@ -3353,14 +3751,21 @@ static void setup_zone_pageset(struct zone *zone); DEFINE_MUTEX(zonelists_mutex); /* return values int ....just for stop_machine() */ -static __init_refok int __build_all_zonelists(void *data) +static int __build_all_zonelists(void *data) { int nid; int cpu; + pg_data_t *self = data; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); #endif + + if (self && !node_online(self->node_id)) { + build_zonelists(self); + build_zonelist_cache(self); + } + for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -3405,7 +3810,7 @@ static __init_refok int __build_all_zonelists(void *data) * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. */ -void __ref build_all_zonelists(void *data) +void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) { set_zonelist_order(); @@ -3414,13 +3819,13 @@ void __ref build_all_zonelists(void *data) mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { - /* we have to stop all cpus to guarantee there is no user - of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG - if (data) - setup_zone_pageset((struct zone *)data); + if (zone) + setup_zone_pageset(zone); #endif - stop_machine(__build_all_zonelists, NULL, NULL); + /* we have to stop all cpus to guarantee there is no user + of zonelist */ + stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); @@ -3513,8 +3918,6 @@ static inline unsigned long wait_table_bits(unsigned long size) return ffz(~size); } -#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) - /* * Check if a pageblock contains reserved pages */ @@ -3542,6 +3945,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) struct page *page; unsigned long block_migratetype; int reserve; + int old_reserve; /* * Get the start pfn, end pfn and the number of blocks to reserve @@ -3550,7 +3954,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) * the block. 
 */ start_pfn = zone->zone_start_pfn; - end_pfn = start_pfn + zone->spanned_pages; + end_pfn = zone_end_pfn(zone); start_pfn = roundup(start_pfn, pageblock_nr_pages); reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; @@ -3563,6 +3967,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) * future allocation of hugepages at runtime. */ reserve = min(2, reserve); + old_reserve = zone->nr_migrate_reserve_block; + + /* When memory hot-add, we almost always need to do nothing */ + if (reserve == old_reserve) + return; + zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) @@ -3600,6 +4010,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve--; continue; } + } else if (!old_reserve) { + /* + * At boot time we don't need to scan the whole zone + * for turning off MIGRATE_RESERVE. + */ + break; } /* @@ -3646,7 +4062,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, set_page_links(page, zone, nid, pfn); mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); - reset_page_mapcount(page); + page_mapcount_reset(page); + page_cpupid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -3663,7 +4080,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * pfn out of zone. */ if ((z->zone_start_pfn <= pfn) - && (pfn < z->zone_start_pfn + z->spanned_pages) + && (pfn < zone_end_pfn(z)) && !(pfn & (pageblock_nr_pages - 1))) set_pageblock_migratetype(page, MIGRATE_MOVABLE); @@ -3690,7 +4107,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int zone_batchsize(struct zone *zone) +static int __meminit zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -3701,7 +4118,7 @@ static int zone_batchsize(struct zone *zone) * * OK, so we don't know how big the cache is. So guess. */ - batch = zone->present_pages / 1024; + batch = zone->managed_pages / 1024; if (batch * PAGE_SIZE > 512 * 1024) batch = (512 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ @@ -3740,7 +4157,40 @@ static int zone_batchsize(struct zone *zone) #endif } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +/* + * pcp->high and pcp->batch values are related and dependent on one another: + * ->batch must never be higher than ->high. + * The following function updates them in a safe manner without read side + * locking. + * + * Any new users of pcp->batch and pcp->high should ensure they can cope with + * those fields changing asynchronously (according to the above rule). + * + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function + * outside of boot time (or some other assurance that no concurrent updaters + * exist). 
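The store ordering this comment describes can be sketched in standalone C11; the struct below is an invented stand-in for per_cpu_pages and the fences model smp_wmb():

#include <stdatomic.h>
#include <stdio.h>

/* Invented stand-in for struct per_cpu_pages. */
struct pcp_model {
	_Atomic unsigned long high;
	_Atomic unsigned long batch;
};

/* Same protocol as pageset_update(): park ->batch at a failsafe 1, publish
 * the new ->high, then publish the real ->batch. The stored pair never
 * violates batch <= high at any instant; readers snapshotting the two
 * fields at different times must still tolerate transient mismatches. */
static void pcp_update_model(struct pcp_model *p, unsigned long high,
			     unsigned long batch)
{
	atomic_store(&p->batch, 1);
	atomic_thread_fence(memory_order_seq_cst);	/* models smp_wmb() */
	atomic_store(&p->high, high);
	atomic_thread_fence(memory_order_seq_cst);	/* models smp_wmb() */
	atomic_store(&p->batch, batch);
}

int main(void)
{
	struct pcp_model p = { 6, 1 };

	pcp_update_model(&p, 186, 31);
	printf("high=%lu batch=%lu\n", atomic_load(&p.high),
	       atomic_load(&p.batch));
	return 0;
}
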
+ */ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, + unsigned long batch) +{ + /* start with a fail safe value for batch */ + pcp->batch = 1; + smp_wmb(); + + /* Update high, then batch, in order */ + pcp->high = high; + smp_wmb(); + + pcp->batch = batch; +} + +/* a companion to pageset_set_high() */ +static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); +} + +static void pageset_init(struct per_cpu_pageset *p) { struct per_cpu_pages *pcp; int migratetype; @@ -3749,45 +4199,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_init(p); + pageset_set_batch(p, batch); +} + /* - * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * pageset_set_high() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ - -static void setup_pagelist_highmark(struct per_cpu_pageset *p, +static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high) { - struct per_cpu_pages *pcp; + unsigned long batch = max(1UL, high / 4); + if ((high / 4) > (PAGE_SHIFT * 8)) + batch = PAGE_SHIFT * 8; - pcp = &p->pcp; - pcp->high = high; - pcp->batch = max(1UL, high/4); - if ((high/4) > (PAGE_SHIFT * 8)) - pcp->batch = PAGE_SHIFT * 8; + pageset_update(&p->pcp, high, batch); } -static void setup_zone_pageset(struct zone *zone) +static void __meminit pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { - int cpu; - - zone->pageset = alloc_percpu(struct per_cpu_pageset); + if (percpu_pagelist_fraction) + pageset_set_high(pcp, + (zone->managed_pages / + percpu_pagelist_fraction)); + else + pageset_set_batch(pcp, zone_batchsize(zone)); +} - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - setup_pageset(pcp, zone_batchsize(zone)); + pageset_init(pcp); + pageset_set_high_and_batch(zone, pcp); +} - if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, - (zone->present_pages / - percpu_pagelist_fraction)); - } +static void __meminit setup_zone_pageset(struct zone *zone) +{ + int cpu; + zone->pageset = alloc_percpu(struct per_cpu_pageset); + for_each_possible_cpu(cpu) + zone_pageset_init(zone, cpu); } /* @@ -3806,7 +4266,6 @@ static noinline __init_refok int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; - struct pglist_data *pgdat = zone->zone_pgdat; size_t alloc_size; /* @@ -3822,7 +4281,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) if (!slab_is_available()) { zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node_nopanic(pgdat, alloc_size); + memblock_virt_alloc_node_nopanic( + alloc_size, zone->zone_pgdat->node_id); } else { /* * This case means that a zone whose size was 0 gets new memory @@ -3839,38 +4299,12 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) if (!zone->wait_table) return -ENOMEM; - for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) + for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 
 init_waitqueue_head(zone->wait_table + i); return 0; } -static int __zone_pcp_update(void *data) -{ - struct zone *zone = data; - int cpu; - unsigned long batch = zone_batchsize(zone), flags; - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; - - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; - - local_irq_save(flags); - free_pcppages_bulk(zone, pcp->count, pcp); - setup_pageset(pset, batch); - local_irq_restore(flags); - } - return 0; -} - -void zone_pcp_update(struct zone *zone) -{ - stop_machine(__zone_pcp_update, zone, NULL); -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -3880,13 +4314,13 @@ static __meminit void zone_pcp_init(struct zone *zone) */ zone->pageset = &boot_pageset; - if (zone->present_pages) + if (populated_zone(zone)) printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", zone->name, zone->present_pages, zone_batchsize(zone)); } -__meminit int init_currently_empty_zone(struct zone *zone, +int __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size, enum memmap_context context) @@ -3922,13 +4356,25 @@ __meminit int init_currently_empty_zone(struct zone *zone, int __meminit __early_pfn_to_nid(unsigned long pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int nid; + /* + * NOTE: The following SMP-unsafe globals are only used early in boot + * when the kernel is running single-threaded. + */ + static unsigned long __meminitdata last_start_pfn, last_end_pfn; + static int __meminitdata last_nid; - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - if (start_pfn <= pfn && pfn < end_pfn) - return nid; - /* This is a memory hole */ - return -1; + if (last_start_pfn <= pfn && pfn < last_end_pfn) + return last_nid; + + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != -1) { + last_start_pfn = start_pfn; + last_end_pfn = end_pfn; + last_nid = nid; + } + + return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ @@ -3956,13 +4402,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) #endif /** - * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range + * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node + * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * * If an architecture guarantees that all ranges registered with * add_active_ranges() contain no holes and may be freed, - * this function may be used instead of calling free_bootmem() manually. + * this function may be used instead of calling memblock_free_early_nid() + * manually. 
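The last-range memoization added to __early_pfn_to_nid() above relies on boot being single-threaded. A self-contained model of the same caching pattern follows; the range table and lookup helper are invented:

#include <stdio.h>

struct pfn_range { unsigned long start, end; int nid; };

/* Invented range table, standing in for memblock's memory map. */
static const struct pfn_range ranges[] = {
	{ 0x00000, 0x10000, 0 },
	{ 0x10000, 0x20000, 1 },
};

/* Linear search, standing in for memblock_search_pfn_nid(). */
static int search_pfn_nid(unsigned long pfn, unsigned long *start,
			  unsigned long *end)
{
	unsigned int i;

	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
		if (ranges[i].start <= pfn && pfn < ranges[i].end) {
			*start = ranges[i].start;
			*end = ranges[i].end;
			return ranges[i].nid;
		}
	return -1;	/* memory hole */
}

/* Cache the last hit, as the patched __early_pfn_to_nid() does; this is
 * only safe while a single thread performs lookups. */
static int pfn_to_nid_cached(unsigned long pfn)
{
	static unsigned long last_start, last_end;
	static int last_nid = -1;
	int nid;

	if (last_start <= pfn && pfn < last_end)
		return last_nid;

	nid = search_pfn_nid(pfn, &last_start, &last_end);
	if (nid != -1)
		last_nid = nid;
	return nid;
}

int main(void)
{
	printf("%d %d %d\n", pfn_to_nid_cached(0x5), pfn_to_nid_cached(0x6),
	       pfn_to_nid_cached(0x15000));	/* prints: 0 0 1 */
	return 0;
}
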
*/ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { @@ -3974,9 +4421,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) end_pfn = min(end_pfn, max_low_pfn); if (start_pfn < end_pfn) - free_bootmem_node(NODE_DATA(this_nid), - PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT); + memblock_free_early_nid(PFN_PHYS(start_pfn), + (end_pfn - start_pfn) << PAGE_SHIFT, + this_nid); } } @@ -4089,13 +4536,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, */ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - /* Get the start and end of the node and zone */ - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, @@ -4150,14 +4597,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, /* Return the number of page frames in holes in a zone on a node */ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -4170,6 +4617,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zones_size) { return zones_size[zone_type]; @@ -4177,6 +4626,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, static inline unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zholes_size) { if (!zholes_size) @@ -4188,21 +4639,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zones_size, + unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; enum zone_type i; for (i = 0; i < MAX_NR_ZONES; i++) totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - zones_size); + node_start_pfn, + node_end_pfn, + zones_size); pgdat->node_spanned_pages = totalpages; realtotalpages = totalpages; for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zone_absent_pages_in_node(pgdat->node_id, i, - zholes_size); + node_start_pfn, node_end_pfn, + zholes_size); pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); @@ -4216,10 +4673,11 @@ static void 
__meminit calculate_node_totalpages(struct pglist_data *pgdat, * round what is now in bits to nearest long in bits, then return it in * bytes. */ -static unsigned long __init usemap_size(unsigned long zonesize) +static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) { unsigned long usemapsize; + zonesize += zone_start_pfn & (pageblock_nr_pages-1); usemapsize = roundup(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; @@ -4229,40 +4687,42 @@ static unsigned long __init usemap_size(unsigned long zonesize) } static void __init setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) + struct zone *zone, + unsigned long zone_start_pfn, + unsigned long zonesize) { - unsigned long usemapsize = usemap_size(zonesize); + unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; if (usemapsize) - zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, - usemapsize); + zone->pageblock_flags = + memblock_virt_alloc_node_nopanic(usemapsize, + pgdat->node_id); } #else -static inline void setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) {} +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, + unsigned long zone_start_pfn, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -/* Return a sensible default order for the pageblock size. */ -static inline int pageblock_default_order(void) -{ - if (HPAGE_SHIFT > PAGE_SHIFT) - return HUGETLB_PAGE_ORDER; - - return MAX_ORDER-1; -} - /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -static inline void __init set_pageblock_order(unsigned int order) +void __paginginit set_pageblock_order(void) { + unsigned int order; + /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; + if (HPAGE_SHIFT > PAGE_SHIFT) + order = HUGETLB_PAGE_ORDER; + else + order = MAX_ORDER - 1; + /* * Assume the largest contiguous order of interest is a huge page. - * This value may be variable depending on boot parameters on IA64 + * This value may be variable depending on boot parameters on IA64 and + * powerpc. */ pageblock_order = order; } @@ -4270,25 +4730,46 @@ static inline void __init set_pageblock_order(unsigned int order) /* * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() - * and pageblock_default_order() are unused as pageblock_order is set - * at compile-time. See include/linux/pageblock-flags.h for the values of - * pageblock_order based on the kernel config + * is unused as pageblock_order is set at compile-time. See + * include/linux/pageblock-flags.h for the values of pageblock_order based on + * the kernel config */ -static inline int pageblock_default_order(unsigned int order) +void __paginginit set_pageblock_order(void) { - return MAX_ORDER-1; } -#define set_pageblock_order(x) do {} while (0) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ +static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, + unsigned long present_pages) +{ + unsigned long pages = spanned_pages; + + /* + * Provide a more accurate estimation if there are holes within + * the zone and SPARSEMEM is in use. If there are holes within the + * zone, each populated memory region may cost us one or two extra + * memmap pages due to alignment because memmap pages for each + * populated region may not be naturally aligned on page boundaries. 
+ * So the (present_pages >> 4) heuristic is a tradeoff for that. + */ + if (spanned_pages > present_pages + (present_pages >> 4) && + IS_ENABLED(CONFIG_SPARSEMEM)) + pages = present_pages; + + return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; +} + /* * Set up the zone data structures: * - mark all pages reserved * - mark all memory queues empty * - clear the memory bitmaps + * + * NOTE: pgdat should get zeroed by caller. */ static void __paginginit free_area_init_core(struct pglist_data *pgdat, + unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, unsigned long *zholes_size) { enum zone_type j; @@ -4297,77 +4778,87 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, int ret; pgdat_resize_init(pgdat); - pgdat->nr_zones = 0; +#ifdef CONFIG_NUMA_BALANCING + spin_lock_init(&pgdat->numabalancing_migrate_lock); + pgdat->numabalancing_migrate_nr_pages = 0; + pgdat->numabalancing_migrate_next_window = jiffies; +#endif init_waitqueue_head(&pgdat->kswapd_wait); - pgdat->kswapd_max_order = 0; + init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_cgroup_init(pgdat); - + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize, memmap_pages; - enum lru_list lru; + unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, zones_size); - realsize = size - zone_absent_pages_in_node(nid, j, + size = zone_spanned_pages_in_node(nid, j, node_start_pfn, + node_end_pfn, zones_size); + realsize = freesize = size - zone_absent_pages_in_node(nid, j, + node_start_pfn, + node_end_pfn, zholes_size); /* - * Adjust realsize so that it accounts for how much memory + * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ - memmap_pages = - PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; - if (realsize >= memmap_pages) { - realsize -= memmap_pages; + memmap_pages = calc_memmap_size(size, realsize); + if (freesize >= memmap_pages) { + freesize -= memmap_pages; if (memmap_pages) printk(KERN_DEBUG " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else printk(KERN_WARNING - " %s zone: %lu pages exceeds realsize %lu\n", - zone_names[j], memmap_pages, realsize); + " %s zone: %lu pages exceeds freesize %lu\n", + zone_names[j], memmap_pages, freesize); /* Account for reserved pages */ - if (j == 0 && realsize > dma_reserve) { - realsize -= dma_reserve; + if (j == 0 && freesize > dma_reserve) { + freesize -= dma_reserve; printk(KERN_DEBUG " %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } if (!is_highmem_idx(j)) - nr_kernel_pages += realsize; - nr_all_pages += realsize; + nr_kernel_pages += freesize; + /* Charge for highmem memmap if there are enough kernel pages */ + else if (nr_kernel_pages > memmap_pages * 2) + nr_kernel_pages -= memmap_pages; + nr_all_pages += freesize; zone->spanned_pages = size; zone->present_pages = realsize; + /* + * Set an approximate value for lowmem here, it will be adjusted + * when the bootmem allocator frees pages into the buddy system. + * And all highmem pages will be managed by the buddy system. + */ + zone->managed_pages = is_highmem_idx(j) ? 
realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) + zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) / 100; - zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; + zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); zone->zone_pgdat = pgdat; - zone_pcp_init(zone); - for_each_lru(lru) - INIT_LIST_HEAD(&zone->lruvec.lists[lru]); - zone->reclaim_stat.recent_rotated[0] = 0; - zone->reclaim_stat.recent_rotated[1] = 0; - zone->reclaim_stat.recent_scanned[0] = 0; - zone->reclaim_stat.recent_scanned[1] = 0; - zap_zone_vm_stats(zone); - zone->flags = 0; + + /* For bootup, initialized properly in watermark setup */ + mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); + + lruvec_init(&zone->lruvec); if (!size) continue; - set_pageblock_order(pageblock_default_order()); - setup_usemap(pgdat, zone, size); + set_pageblock_order(); + setup_usemap(pgdat, zone, zone_start_pfn, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); BUG_ON(ret); @@ -4394,12 +4885,13 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) * for the buddy allocator to function correctly. */ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); - end = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) - map = alloc_bootmem_node_nopanic(pgdat, size); + map = memblock_virt_alloc_node_nopanic(size, + pgdat->node_id); pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -4421,10 +4913,21 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = 0; + unsigned long end_pfn = 0; + + /* pg_data_t should be reset to zero when it's allocated */ + WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - calculate_node_totalpages(pgdat, zones_size, zholes_size); + if (node_state(nid, N_MEMORY)) + init_zone_allows_reclaim(nid); +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); +#endif + calculate_node_totalpages(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); alloc_node_mem_map(pgdat); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -4433,7 +4936,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, zones_size, zholes_size); + free_area_init_core(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -4442,7 +4946,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* * Figure out the number of possible node ids. 
*/ -static void __init setup_nr_node_ids(void) +void __init setup_nr_node_ids(void) { unsigned int node; unsigned int highest = 0; @@ -4451,10 +4955,6 @@ static void __init setup_nr_node_ids(void) highest = node; nr_node_ids = highest + 1; } -#else -static inline void setup_nr_node_ids(void) -{ -} #endif /** @@ -4540,7 +5040,7 @@ unsigned long __init find_min_pfn_with_active_regions(void) /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. - * Populate N_HIGH_MEMORY for calculating usable_nodes. + * Populate N_MEMORY for calculating usable_nodes. */ static unsigned long __init early_calculate_totalpages(void) { @@ -4553,9 +5053,9 @@ static unsigned long __init early_calculate_totalpages(void) totalpages += pages; if (pages) - node_set_state(nid, N_HIGH_MEMORY); + node_set_state(nid, N_MEMORY); } - return totalpages; + return totalpages; } /* @@ -4570,12 +5070,36 @@ static void __init find_zone_movable_pfns_for_nodes(void) unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; /* save the state before borrow the nodemask */ - nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; + nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); - int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); + int usable_nodes = nodes_weight(node_states[N_MEMORY]); + struct memblock_region *r; + + /* Need to find movable_zone earlier when movable_node is specified. */ + find_usable_zone_for_movable(); + + /* + * If movable_node is specified, ignore kernelcore and movablecore + * options. + */ + if (movable_node_is_enabled()) { + for_each_memblock(memory, r) { + if (!memblock_is_hotpluggable(r)) + continue; + + nid = r->nid; + + usable_startpfn = PFN_DOWN(r->base); + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + goto out2; + } /* - * If movablecore was specified, calculate what size of + * If movablecore=nn[KMG] was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore @@ -4601,13 +5125,12 @@ static void __init find_zone_movable_pfns_for_nodes(void) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ - find_usable_zone_for_movable(); usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; restart: /* Spread kernelcore memory as evenly as possible throughout nodes */ kernelcore_node = required_kernelcore / usable_nodes; - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; /* @@ -4672,7 +5195,7 @@ restart: /* * Some kernelcore has been met, update counts and * break if the kernelcore for this node has been - * satisified + * satisfied */ required_kernelcore -= min(required_kernelcore, size_pages); @@ -4686,12 +5209,13 @@ restart: * If there is still required_kernelcore, we do another pass with one * less node in the count. 
This will push zone_movable_pfn[nid] further * along on the nodes that still have memory until kernelcore is - * satisified + * satisfied */ usable_nodes--; if (usable_nodes && required_kernelcore > usable_nodes) goto restart; +out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = @@ -4699,23 +5223,27 @@ restart: out: /* restore the node_state */ - node_states[N_HIGH_MEMORY] = saved_node_state; + node_states[N_MEMORY] = saved_node_state; } -/* Any regular memory on that node ? */ -static void check_for_regular_memory(pg_data_t *pgdat) +/* Any regular or high memory on that node ? */ +static void check_for_memory(pg_data_t *pgdat, int nid) { -#ifdef CONFIG_HIGHMEM enum zone_type zone_type; - for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { + if (N_MEMORY == N_NORMAL_MEMORY) + return; + + for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - if (zone->present_pages) { - node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + if (populated_zone(zone)) { + node_set_state(nid, N_HIGH_MEMORY); + if (N_NORMAL_MEMORY != N_HIGH_MEMORY && + zone_type <= ZONE_NORMAL) + node_set_state(nid, N_NORMAL_MEMORY); break; } } -#endif } /** @@ -4759,31 +5287,34 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ - printk("Zone PFN ranges:\n"); + printk("Zone ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s ", zone_names[i]); + printk(KERN_CONT " %-8s ", zone_names[i]); if (arch_zone_lowest_possible_pfn[i] == arch_zone_highest_possible_pfn[i]) - printk("empty\n"); + printk(KERN_CONT "empty\n"); else - printk("%0#10lx -> %0#10lx\n", - arch_zone_lowest_possible_pfn[i], - arch_zone_highest_possible_pfn[i]); + printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", + arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, + (arch_zone_highest_possible_pfn[i] + << PAGE_SHIFT) - 1); } /* Print out the PFNs ZONE_MOVABLE begins at in each node */ - printk("Movable zone start PFN for each node\n"); + printk("Movable zone start for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i]) - printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); + printk(" Node %d: %#010lx\n", i, + zone_movable_pfn[i] << PAGE_SHIFT); } - /* Print out the early_node_map[] */ - printk("Early memory PFN ranges\n"); + /* Print out the early node map */ + printk("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); + printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, + start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); /* Initialise every node */ mminit_verify_pageflags_layout(); @@ -4795,8 +5326,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Any memory on that node */ if (pgdat->node_present_pages) - node_set_state(nid, N_HIGH_MEMORY); - check_for_regular_memory(pgdat); + node_set_state(nid, N_MEMORY); + check_for_memory(pgdat, nid); } } @@ -4838,6 +5369,103 @@ early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +void adjust_managed_page_count(struct page *page, long count) +{ + spin_lock(&managed_page_count_lock); + page_zone(page)->managed_pages += count; + totalram_pages += count; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages += count; +#endif + 
spin_unlock(&managed_page_count_lock); +} +EXPORT_SYMBOL(adjust_managed_page_count); + +unsigned long free_reserved_area(void *start, void *end, int poison, char *s) +{ + void *pos; + unsigned long pages = 0; + + start = (void *)PAGE_ALIGN((unsigned long)start); + end = (void *)((unsigned long)end & PAGE_MASK); + for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { + if ((unsigned int)poison <= 0xFF) + memset(pos, poison, PAGE_SIZE); + free_reserved_page(virt_to_page(pos)); + } + + if (pages && s) + pr_info("Freeing %s memory: %ldK (%p - %p)\n", + s, pages << (PAGE_SHIFT - 10), start, end); + + return pages; +} +EXPORT_SYMBOL(free_reserved_area); + +#ifdef CONFIG_HIGHMEM +void free_highmem_page(struct page *page) +{ + __free_reserved_page(page); + totalram_pages++; + page_zone(page)->managed_pages++; + totalhigh_pages++; +} +#endif + + +void __init mem_init_print_info(const char *str) +{ + unsigned long physpages, codesize, datasize, rosize, bss_size; + unsigned long init_code_size, init_data_size; + + physpages = get_num_physpages(); + codesize = _etext - _stext; + datasize = _edata - _sdata; + rosize = __end_rodata - __start_rodata; + bss_size = __bss_stop - __bss_start; + init_data_size = __init_end - __init_begin; + init_code_size = _einittext - _sinittext; + + /* + * Detect special cases and adjust section sizes accordingly: + * 1) .init.* may be embedded into .data sections + * 2) .init.text.* may be out of [__init_begin, __init_end], + * please refer to arch/tile/kernel/vmlinux.lds.S. + * 3) .rodata.* may be embedded into .text or .data sections. + */ +#define adj_init_size(start, end, size, pos, adj) \ + do { \ + if (start <= pos && pos < end && size > adj) \ + size -= adj; \ + } while (0) + + adj_init_size(__init_begin, __init_end, init_data_size, + _sinittext, init_code_size); + adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); + adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); + adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); + adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); + +#undef adj_init_size + + printk("Memory: %luK/%luK available " + "(%luK kernel code, %luK rwdata, %luK rodata, " + "%luK init, %luK bss, %luK reserved" +#ifdef CONFIG_HIGHMEM + ", %luK highmem" +#endif + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages) << (PAGE_SHIFT-10), +#ifdef CONFIG_HIGHMEM + totalhigh_pages << (PAGE_SHIFT-10), +#endif + str ? ", " : "", str ? str : ""); +} + /** * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved @@ -4884,7 +5512,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, * This is only okay since the processor is dead and cannot * race with what we are doing. */ - refresh_cpu_vm_stats(cpu); + cpu_vm_stats_fold(cpu); } return NOTIFY_OK; } @@ -4918,8 +5546,8 @@ static void calculate_totalreserve_pages(void) /* we treat the high watermark as reserved pages. 
*/ max += high_wmark_pages(zone); - if (max > zone->present_pages) - max = zone->present_pages; + if (max > zone->managed_pages) + max = zone->managed_pages; reserve_pages += max; /* * Lowmem reserves are not available to @@ -4951,7 +5579,7 @@ static void setup_per_zone_lowmem_reserve(void) for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long present_pages = zone->present_pages; + unsigned long managed_pages = zone->managed_pages; zone->lowmem_reserve[j] = 0; @@ -4965,9 +5593,9 @@ static void setup_per_zone_lowmem_reserve(void) sysctl_lowmem_reserve_ratio[idx] = 1; lower_zone = pgdat->node_zones + idx; - lower_zone->lowmem_reserve[j] = present_pages / + lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; - present_pages += lower_zone->present_pages; + managed_pages += lower_zone->managed_pages; } } } @@ -4976,14 +5604,7 @@ static void setup_per_zone_lowmem_reserve(void) calculate_totalreserve_pages(); } -/** - * setup_per_zone_wmarks - called when min_free_kbytes changes - * or when memory is hot-{added|removed} - * - * Ensures that the watermark[min,low,high] values for each zone are set - * correctly with respect to min_free_kbytes. - */ -void setup_per_zone_wmarks(void) +static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4993,14 +5614,14 @@ void setup_per_zone_wmarks(void) /* Calculate total number of !ZONE_HIGHMEM pages */ for_each_zone(zone) { if (!is_highmem(zone)) - lowmem_pages += zone->present_pages; + lowmem_pages += zone->managed_pages; } for_each_zone(zone) { u64 tmp; spin_lock_irqsave(&zone->lock, flags); - tmp = (u64)pages_min * zone->present_pages; + tmp = (u64)pages_min * zone->managed_pages; do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* @@ -5012,13 +5633,10 @@ void setup_per_zone_wmarks(void) * deltas controls asynch page reclaim, and so should * not be capped for highmem. */ - int min_pages; + unsigned long min_pages; - min_pages = zone->present_pages / 1024; - if (min_pages < SWAP_CLUSTER_MAX) - min_pages = SWAP_CLUSTER_MAX; - if (min_pages > 128) - min_pages = 128; + min_pages = zone->managed_pages / 1024; + min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); zone->watermark[WMARK_MIN] = min_pages; } else { /* @@ -5030,6 +5648,12 @@ void setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + + __mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - + low_wmark_pages(zone) - + zone_page_state(zone, NR_ALLOC_BATCH)); + setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -5038,6 +5662,20 @@ void setup_per_zone_wmarks(void) calculate_totalreserve_pages(); } +/** + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-{added|removed} + * + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. 
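The proportional split that __setup_per_zone_wmarks() applies above can be checked with a standalone calculation; the sample numbers are invented:

#include <stdio.h>
#include <stdint.h>

/* Mirrors the non-highmem branch: a zone's share of pages_min is scaled by
 * its share of lowmem, then low/high sit 25%/50% above min. */
static void print_wmarks(unsigned long pages_min, unsigned long managed,
			 unsigned long lowmem_total)
{
	uint64_t tmp = (uint64_t)pages_min * managed / lowmem_total;
	unsigned long min = (unsigned long)tmp;

	printf("min=%lu low=%lu high=%lu\n", min,
	       min + (unsigned long)(tmp >> 2),
	       min + (unsigned long)(tmp >> 1));
}

int main(void)
{
	/* e.g. min_free_kbytes worth 1024 pages, a zone owning half of lowmem */
	print_wmarks(1024, 131072, 262144);	/* min=512 low=640 high=768 */
	return 0;
}
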
+ */ +void setup_per_zone_wmarks(void) +{ + mutex_lock(&zonelists_mutex); + __setup_per_zone_wmarks(); + mutex_unlock(&zonelists_mutex); +} + /* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance @@ -5064,7 +5702,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone) unsigned int gb, ratio; /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); + gb = zone->managed_pages >> (30 - PAGE_SHIFT); if (gb) ratio = int_sqrt(10 * gb); else @@ -5088,7 +5726,7 @@ static void __meminit setup_per_zone_inactive_ratio(void) * we want it large (64MB max). But it is not linear, because network * bandwidth does not increase linearly with machine size. We use * - * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: + * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: * min_free_kbytes = sqrt(lowmem_kbytes * 16) * * which yields @@ -5108,14 +5746,21 @@ static void __meminit setup_per_zone_inactive_ratio(void) int __meminit init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; + int new_min_free_kbytes; lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); - - min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 65536) - min_free_kbytes = 65536; + new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + + if (new_min_free_kbytes > user_min_free_kbytes) { + min_free_kbytes = new_min_free_kbytes; + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + } else { + pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + new_min_free_kbytes, user_min_free_kbytes); + } setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); @@ -5125,16 +5770,23 @@ int __meminit init_per_zone_wmark_min(void) module_init(init_per_zone_wmark_min) /* - * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); - if (write) + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) { + user_min_free_kbytes = min_free_kbytes; setup_per_zone_wmarks(); + } return 0; } @@ -5150,7 +5802,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return rc; for_each_zone(zone) - zone->min_unmapped_pages = (zone->present_pages * + zone->min_unmapped_pages = (zone->managed_pages * sysctl_min_unmapped_ratio) / 100; return 0; } @@ -5166,7 +5818,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, return rc; for_each_zone(zone) - zone->min_slab_pages = (zone->present_pages * + zone->min_slab_pages = (zone->managed_pages * sysctl_min_slab_ratio) / 100; return 0; } @@ -5191,10 +5843,9 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, /* * percpu_pagelist_fraction - changes the pcp->high for each zone on each - * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist - * can have before it gets flushed back to buddy allocator. 
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu + * pagelist can have before it gets flushed back to buddy allocator. */ - int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5205,14 +5856,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret < 0)) return ret; + + mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - for_each_possible_cpu(cpu) { - unsigned long high; - high = zone->present_pages / percpu_pagelist_fraction; - setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); - } + unsigned long high; + high = zone->managed_pages / percpu_pagelist_fraction; + for_each_possible_cpu(cpu) + pageset_set_high(per_cpu_ptr(zone->pageset, cpu), + high); } + mutex_unlock(&pcp_batch_high_lock); return 0; } @@ -5242,9 +5895,10 @@ void *__init alloc_large_system_hash(const char *tablename, int flags, unsigned int *_hash_shift, unsigned int *_hash_mask, - unsigned long limit) + unsigned long low_limit, + unsigned long high_limit) { - unsigned long long max = limit; + unsigned long long max = high_limit; unsigned long log2qty, size; void *table = NULL; @@ -5252,9 +5906,10 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; - numentries += (1UL << (20 - PAGE_SHIFT)) - 1; - numentries >>= 20 - PAGE_SHIFT; - numentries <<= 20 - PAGE_SHIFT; + + /* It isn't necessary when PAGE_SIZE >= 1MB */ + if (PAGE_SHIFT < 20) + numentries = round_up(numentries, (1<<20)/PAGE_SIZE); /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) @@ -5282,6 +5937,8 @@ void *__init alloc_large_system_hash(const char *tablename, } max = min(max, 0x80000000ULL); + if (numentries < low_limit) + numentries = low_limit; if (numentries > max) numentries = max; @@ -5290,7 +5947,7 @@ void *__init alloc_large_system_hash(const char *tablename, do { size = bucketsize << log2qty; if (flags & HASH_EARLY) - table = alloc_bootmem_nopanic(size); + table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { @@ -5340,7 +5997,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) pfn &= (PAGES_PER_SECTION-1); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #else - pfn = pfn - zone->zone_start_pfn; + pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #endif /* CONFIG_SPARSEMEM */ } @@ -5392,8 +6049,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); - VM_BUG_ON(pfn < zone->zone_start_pfn); - VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); + VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) if (flags & value) @@ -5403,24 +6059,28 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, } /* - * This is designed as sub function...plz see page_isolation.c also. - * set/clear page block's type to be ISOLATE. - * page allocater never alloc memory from ISOLATE block. + * This function checks whether pageblock includes unmovable pages or not. 
+ * If @count is not zero, it is okay to include up to @count unmovable pages. + * + * PageLRU check without isolation or lru_lock could race so that + * MIGRATE_MOVABLE block might include unmovable pages. It means you can't + * expect this function to be exact. */ - -static int -__count_immobile_pages(struct zone *zone, struct page *page, int count) +bool has_unmovable_pages(struct zone *zone, struct page *page, int count, + bool skip_hwpoisoned_pages) { unsigned long pfn, iter, found; + int mt; + /* * To avoid noisy data, lru_add_drain_all() should be called - * If ZONE_MOVABLE, the zone never contains immobile pages + * If ZONE_MOVABLE, the zone never contains unmovable pages */ if (zone_idx(zone) == ZONE_MOVABLE) - return true; - - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) - return true; + return false; + mt = get_pageblock_migratetype(page); + if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) + return false; pfn = page_to_pfn(page); for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { @@ -5430,11 +6090,36 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) continue; page = pfn_to_page(check); + + /* + * Hugepages are not in LRU lists, but they're movable. + * We need not scan over tail pages because we don't + * handle each tail page individually in migration. + */ + if (PageHuge(page)) { + iter = round_up(iter + 1, 1<<compound_order(page)) - 1; + continue; + } + + /* + * We can't use page_count without pinning the page + * because another CPU can free the compound page. + * This check already skips compound tails of THP + * because their page->_count is zero at all times. + */ + if (!atomic_read(&page->_count)) { if (PageBuddy(page)) iter += (1 << page_order(page)) - 1; continue; } + + /* + * The HWPoisoned page may not be in the buddy system, and + * page_count() is not 0. + */ + if (skip_hwpoisoned_pages && PageHWPoison(page)) + continue; + if (!PageLRU(page)) found++; /* @@ -5451,9 +6136,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) * page at boot. */ if (found > count) - return false; + return true; } - return true; + return false; } bool is_pageblock_removable_nolock(struct page *page) @@ -5473,81 +6158,246 @@ bool is_pageblock_removable_nolock(struct page *page) zone = page_zone(page); pfn = page_to_pfn(page); - if (zone->zone_start_pfn > pfn || - zone->zone_start_pfn + zone->spanned_pages <= pfn) + if (!zone_spans_pfn(zone, pfn)) return false; - return __count_immobile_pages(zone, page, 0); + return !has_unmovable_pages(zone, page, 0, true); } -int set_migratetype_isolate(struct page *page) +#ifdef CONFIG_CMA + +static unsigned long pfn_max_align_down(unsigned long pfn) { - struct zone *zone; - unsigned long flags, pfn; - struct memory_isolate_notify arg; - int notifier_ret; - int ret = -EBUSY; + return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages) - 1); +} - zone = page_zone(page); +static unsigned long pfn_max_align_up(unsigned long pfn) +{ + return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages)); +} - spin_lock_irqsave(&zone->lock, flags); +/* [start, end) must belong to a single zone. */ +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end) +{ + /* This function is based on compact_zone() from compaction.c. 
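The two alignment helpers introduced above, pfn_max_align_down() and pfn_max_align_up(), round a PFN to the larger of the max-order and pageblock granularities. A sketch with invented constants:

#include <stdio.h>

/* Invented values; the real ones derive from MAX_ORDER and pageblock_order. */
#define MAX_ORDER_NR_PAGES_X	1024UL
#define PAGEBLOCK_NR_PAGES_X	512UL
#define MAX_ALIGN_X	(MAX_ORDER_NR_PAGES_X > PAGEBLOCK_NR_PAGES_X ? \
			 MAX_ORDER_NR_PAGES_X : PAGEBLOCK_NR_PAGES_X)

static unsigned long pfn_align_down(unsigned long pfn)
{
	return pfn & ~(MAX_ALIGN_X - 1);	/* as pfn_max_align_down() */
}

static unsigned long pfn_align_up(unsigned long pfn)
{
	/* ALIGN(pfn, MAX_ALIGN_X) expanded for a power-of-two alignment */
	return (pfn + MAX_ALIGN_X - 1) & ~(MAX_ALIGN_X - 1);
}

int main(void)
{
	printf("%lu %lu\n", pfn_align_down(1500), pfn_align_up(1500));
	/* prints: 1024 2048 */
	return 0;
}
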
 */ + unsigned long nr_reclaimed; + unsigned long pfn = start; + unsigned int tries = 0; + int ret = 0; - pfn = page_to_pfn(page); - arg.start_pfn = pfn; - arg.nr_pages = pageblock_nr_pages; - arg.pages_found = 0; + migrate_prep(); + + while (pfn < end || !list_empty(&cc->migratepages)) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + if (list_empty(&cc->migratepages)) { + cc->nr_migratepages = 0; + pfn = isolate_migratepages_range(cc->zone, cc, + pfn, end, true); + if (!pfn) { + ret = -EINTR; + break; + } + tries = 0; + } else if (++tries == 5) { + ret = ret < 0 ? ret : -EBUSY; + break; + } + + nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, + &cc->migratepages); + cc->nr_migratepages -= nr_reclaimed; + + ret = migrate_pages(&cc->migratepages, alloc_migrate_target, + 0, MIGRATE_SYNC, MR_CMA); + } + if (ret < 0) { + putback_movable_pages(&cc->migratepages); + return ret; + } + return 0; +} + +/** + * alloc_contig_range() -- tries to allocate a given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * @migratetype: migratetype of the underlying pageblocks (either + * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks + * in range must have the same migratetype and it must + * be either of the two. + * + * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES + * aligned, however it's the caller's responsibility to guarantee that + * we are the only thread that changes migrate type of pageblocks the + * pages fall in. + * + * The PFN range must belong to a single zone. + * + * Returns zero on success or negative error code. On success all + * pages whose PFN is in [start, end) are allocated for the caller and + * need to be freed with free_contig_range(). + */ +int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype) +{ + unsigned long outer_start, outer_end; + int ret = 0, order; + + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), + .sync = true, + .ignore_skip_hint = true, + }; + INIT_LIST_HEAD(&cc.migratepages); /* + * What we do here is we mark all pageblocks in range as + * MIGRATE_ISOLATE. Because pageblock and max order pages may + * have different sizes, and due to the way the page allocator + * works, we align the range to the biggest of the two so + * that the page allocator won't try to merge buddies from + * different pageblocks and change MIGRATE_ISOLATE to some + * other migration type. + * + * Once the pageblocks are marked as MIGRATE_ISOLATE, we + * migrate the pages from an unaligned range (i.e. pages that + * we are interested in). 
This will put all the pages in + * range back to the page allocator as MIGRATE_ISOLATE. + * + * When this is done, we take the pages in range from the page + * allocator, removing them from the buddy system. This way the + * page allocator will never consider using them. + * + * This lets us mark the pageblocks back as + * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the + * aligned range but not in the unaligned, original range are + * put back to the page allocator so that buddy can use them. */ - if (__count_immobile_pages(zone, page, arg.pages_found)) - ret = 0; + + ret = start_isolate_page_range(pfn_max_align_down(start), + pfn_max_align_up(end), migratetype, + false); + if (ret) + return ret; + + ret = __alloc_contig_migrate_range(&cc, start, end); + if (ret) + goto done; /* - * immobile means "not-on-lru" paes. If immobile is larger than - * removable-by-driver pages reported by notifier, we'll fail. + * Pages from [start, end) are within MAX_ORDER_NR_PAGES + * aligned blocks that are marked as MIGRATE_ISOLATE. What's + * more, all pages in [start, end) are free in page allocator. + * What we are going to do is to allocate all pages from + * [start, end) (that is remove them from page allocator). + * + * The only problem is that pages at the beginning and at the + * end of the interesting range may not be aligned with pages that + * the page allocator holds, i.e. they can be part of higher order + * pages. Because of this, we reserve the bigger range and + * once this is done free the pages we are not interested in. + * + * We don't have to hold zone->lock here because the pages are + * isolated thus they won't get removed from buddy. */ -out: - if (!ret) { - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - move_freepages_block(zone, page, MIGRATE_ISOLATE); + lru_add_drain_all(); + drain_all_pages(); + + order = 0; + outer_start = start; + while (!PageBuddy(pfn_to_page(outer_start))) { + if (++order >= MAX_ORDER) { + ret = -EBUSY; + goto done; + } + outer_start &= ~0UL << order; } - spin_unlock_irqrestore(&zone->lock, flags); - if (!ret) - drain_all_pages(); + /* Make sure the range is really isolated. */ + if (test_pages_isolated(outer_start, end, false)) { + pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", + outer_start, end); + ret = -EBUSY; + goto done; + } + + + /* Grab isolated pages from freelists. */ + outer_end = isolate_freepages_range(&cc, outer_start, end); + if (!outer_end) { + ret = -EBUSY; + goto done; + } + + /* Free head and tail (if any) */ + if (start != outer_start) + free_contig_range(outer_start, start - outer_start); + if (end != outer_end) + free_contig_range(end, outer_end - end); + +done: + undo_isolate_page_range(pfn_max_align_down(start), + pfn_max_align_up(end), migratetype); return ret; } -void unset_migratetype_isolate(struct page *page) +void free_contig_range(unsigned long pfn, unsigned nr_pages) +{ + unsigned int count = 0; + + for (; nr_pages--; pfn++) { + struct page *page = pfn_to_page(pfn); + + count += page_count(page) != 1; + __free_page(page); + } + WARN(count != 0, "%d pages are still in use!\n", count); +} +#endif + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalculated.
+ */ +void __meminit zone_pcp_update(struct zone *zone) +{ + unsigned cpu; + mutex_lock(&pcp_batch_high_lock); + for_each_possible_cpu(cpu) + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); + mutex_unlock(&pcp_batch_high_lock); +} +#endif + +void zone_pcp_reset(struct zone *zone) { - struct zone *zone; unsigned long flags; - zone = page_zone(page); - spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) - goto out; - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); -out: - spin_unlock_irqrestore(&zone->lock, flags); + int cpu; + struct per_cpu_pageset *pset; + + /* avoid races with drain_pages() */ + local_irq_save(flags); + if (zone->pageset != &boot_pageset) { + for_each_online_cpu(cpu) { + pset = per_cpu_ptr(zone->pageset, cpu); + drain_zonestat(zone, pset); + } + free_percpu(zone->pageset); + zone->pageset = &boot_pageset; + } + local_irq_restore(flags); } #ifdef CONFIG_MEMORY_HOTREMOVE @@ -5577,6 +6427,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) continue; } page = pfn_to_page(pfn); + /* + * The HWPoisoned page may not be in the buddy system, and + * page_count() is not 0. + */ + if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { + pfn++; + SetPageReserved(page); + continue; + } + BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); order = page_order(page); @@ -5587,8 +6447,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) list_del(&page->lru); rmv_page_order(page); zone->free_area[order].nr_free--; - __mod_zone_page_state(zone, NR_FREE_PAGES, - - (1UL << order)); for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); @@ -5618,7 +6476,7 @@ bool is_free_buddy_page(struct page *page) } #endif -static struct trace_print_flags pageflag_names[] = { +static const struct trace_print_flags pageflag_names[] = { {1UL << PG_locked, "locked" }, {1UL << PG_error, "error" }, {1UL << PG_referenced, "referenced" }, @@ -5653,7 +6511,9 @@ static struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_MEMORY_FAILURE {1UL << PG_hwpoison, "hwpoison" }, #endif - {-1UL, NULL }, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + {1UL << PG_compound_lock, "compound_lock" }, +#endif }; static void dump_page_flags(unsigned long flags) @@ -5662,12 +6522,14 @@ static void dump_page_flags(unsigned long flags) unsigned long mask; int i; + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); + printk(KERN_ALERT "page flags: %#lx(", flags); /* remove zone id */ flags &= (1UL << NR_PAGEFLAGS) - 1; - for (i = 0; pageflag_names[i].name && flags; i++) { + for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { mask = pageflag_names[i].mask; if ((flags & mask) != mask) @@ -5685,12 +6547,25 @@ static void dump_page_flags(unsigned long flags) printk(")\n"); } -void dump_page(struct page *page) +void dump_page_badflags(struct page *page, const char *reason, + unsigned long badflags) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); + if (reason) + pr_alert("page dumped because: %s\n", reason); + if (page->flags & badflags) { + pr_alert("bad because of flags:\n"); + dump_page_flags(page->flags & badflags); + } mem_cgroup_print_bad_page(page); } + +void dump_page(struct page *page, const char *reason) +{ + dump_page_badflags(page, reason, 0); +} +EXPORT_SYMBOL(dump_page); diff --git
a/mm/page_cgroup.c b/mm/page_cgroup.c index 1ccbd714059c..3708264d2833 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid) table_size = sizeof(struct page_cgroup) * nr_pages; - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + base = memblock_virt_alloc_try_nid_nopanic( + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_cgroup = base; @@ -174,7 +175,7 @@ static void free_page_cgroup(void *addr) } } -void __free_page_cgroup(unsigned long pfn) +static void __free_page_cgroup(unsigned long pfn) { struct mem_section *ms; struct page_cgroup *base; @@ -187,9 +188,9 @@ void __free_page_cgroup(unsigned long pfn) ms->page_cgroup = NULL; } -int __meminit online_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, - int nid) +static int __meminit online_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, + int nid) { unsigned long start, end, pfn; int fail = 0; @@ -222,8 +223,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn, return -ENOMEM; } -int __meminit offline_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, int nid) +static int __meminit offline_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, int nid) { unsigned long start, end, pfn; @@ -251,6 +252,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, mn->nr_pages, mn->status_change_nid); break; case MEM_CANCEL_ONLINE: + offline_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; case MEM_GOING_OFFLINE: break; case MEM_ONLINE: @@ -271,7 +275,7 @@ void __init page_cgroup_init(void) if (mem_cgroup_disabled()) return; - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; start_pfn = node_start_pfn(nid); @@ -317,7 +321,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP static DEFINE_MUTEX(swap_cgroup_mutex); struct swap_cgroup_ctrl { @@ -392,7 +396,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, /** * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. - * @end: swap entry to be cmpxchged + * @ent: swap entry to be cmpxchged * @old: old id * @new: new id * @@ -422,7 +426,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, /** * swap_cgroup_record - record mem_cgroup for this swp_entry. * @ent: swap entry to be recorded into - * @mem: mem_cgroup to be recorded + * @id: mem_cgroup to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) @@ -448,7 +452,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * - * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) + * Returns ID of mem_cgroup at success. 0 at failure. 
(0 is invalid ID) */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { diff --git a/mm/page_io.c b/mm/page_io.c index dc76b4d0611e..7c59ef681381 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -17,7 +17,11 @@ #include <linux/swap.h> #include <linux/bio.h> #include <linux/swapops.h> +#include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/frontswap.h> +#include <linux/aio.h> +#include <linux/blkdev.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -27,20 +31,19 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, bio = bio_alloc(gfp_flags, 1); if (bio) { - bio->bi_sector = map_swap_page(page, &bio->bi_bdev); - bio->bi_sector <<= PAGE_SHIFT - 9; + bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); + bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; bio->bi_io_vec[0].bv_page = page; bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; bio->bi_vcnt = 1; - bio->bi_idx = 0; - bio->bi_size = PAGE_SIZE; + bio->bi_iter.bi_size = PAGE_SIZE; bio->bi_end_io = end_io; } return bio; } -static void end_swap_bio_write(struct bio *bio, int err) +void end_swap_bio_write(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; @@ -59,7 +62,7 @@ static void end_swap_bio_write(struct bio *bio, int err) printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } end_page_writeback(page); @@ -77,28 +80,224 @@ void end_swap_bio_read(struct bio *bio, int err) printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_sector); - } else { - SetPageUptodate(page); + (unsigned long long)bio->bi_iter.bi_sector); + goto out; + } + + SetPageUptodate(page); + + /* + * There is no guarantee that the page is in swap cache - the software + * suspend code (at least) uses end_swap_bio_read() against a non- + * swapcache page. So we must check PG_swapcache before proceeding with + * this optimization. + */ + if (likely(PageSwapCache(page))) { + struct swap_info_struct *sis; + + sis = page_swap_info(page); + if (sis->flags & SWP_BLKDEV) { + /* + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. + * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if an SSD is used as the swap device. + * But if an in-memory swap device (e.g. zram) is used, + * this causes a duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it.
+ */ + struct gendisk *disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; + + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); + } + } } + +out: unlock_page(page); bio_put(bio); } +int generic_swapfile_activate(struct swap_info_struct *sis, + struct file *swap_file, + sector_t *span) +{ + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + unsigned blocks_per_page; + unsigned long page_no; + unsigned blkbits; + sector_t probe_block; + sector_t last_block; + sector_t lowest_block = -1; + sector_t highest_block = 0; + int nr_extents = 0; + int ret; + + blkbits = inode->i_blkbits; + blocks_per_page = PAGE_SIZE >> blkbits; + + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. + */ + probe_block = 0; + page_no = 0; + last_block = i_size_read(inode) >> blkbits; + while ((probe_block + blocks_per_page) <= last_block && + page_no < sis->max) { + unsigned block_in_page; + sector_t first_block; + + first_block = bmap(inode, probe_block); + if (first_block == 0) + goto bad_bmap; + + /* + * It must be PAGE_SIZE aligned on-disk + */ + if (first_block & (blocks_per_page - 1)) { + probe_block++; + goto reprobe; + } + + for (block_in_page = 1; block_in_page < blocks_per_page; + block_in_page++) { + sector_t block; + + block = bmap(inode, probe_block + block_in_page); + if (block == 0) + goto bad_bmap; + if (block != first_block + block_in_page) { + /* Discontiguity */ + probe_block++; + goto reprobe; + } + } + + first_block >>= (PAGE_SHIFT - blkbits); + if (page_no) { /* exclude the header page */ + if (first_block < lowest_block) + lowest_block = first_block; + if (first_block > highest_block) + highest_block = first_block; + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, page_no, 1, first_block); + if (ret < 0) + goto out; + nr_extents += ret; + page_no++; + probe_block += blocks_per_page; +reprobe: + continue; + } + ret = nr_extents; + *span = 1 + highest_block - lowest_block; + if (page_no == 0) + page_no = 1; /* force Empty message */ + sis->max = page_no; + sis->pages = page_no - 1; + sis->highest_bit = page_no - 1; +out: + return ret; +bad_bmap: + printk(KERN_ERR "swapon: swapfile has holes\n"); + ret = -EINVAL; + goto out; +} + /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. 
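[Editor's aside] The extent-mapping loop in generic_swapfile_activate() above is easy to reproduce from user space: the FIBMAP ioctl exposes the same block mapping that bmap() gives the kernel. The following stand-alone sketch is not part of the patch; it assumes a filesystem that implements bmap, needs CAP_SYS_RAWIO for FIBMAP, and checks a candidate swapfile for holes and for page-aligned, contiguous runs, much as the kernel routine does.

/* Stand-alone sketch (assumptions above): mirrors the hole and
 * contiguity checks generic_swapfile_activate() performs via bmap(). */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <linux/fs.h>		/* FIBMAP, FIGETBSZ */

int main(int argc, char **argv)
{
	int fd, blksz, blocks_per_page, i, nblocks;
	int first = 0, blk;
	struct stat st;
	long pagesz = sysconf(_SC_PAGESIZE);

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (ioctl(fd, FIGETBSZ, &blksz) < 0 || fstat(fd, &st) < 0)
		return 1;
	blocks_per_page = pagesz / blksz;
	if (blocks_per_page < 1)	/* block larger than a page */
		return 1;
	nblocks = st.st_size / blksz;

	for (i = 0; i < nblocks; i++) {
		blk = i;
		if (ioctl(fd, FIBMAP, &blk) < 0)
			return 1;
		if (blk == 0) {		/* a hole: swapon rejects these */
			printf("hole at block %d\n", i);
			return 1;
		}
		if (i % blocks_per_page == 0) {
			first = blk;	/* must start a PAGE_SIZE run */
			if (blk % blocks_per_page)
				printf("page %d not aligned on disk\n",
				       i / blocks_per_page);
		} else if (blk != first + i % blocks_per_page) {
			printf("discontiguity inside page %d\n",
			       i / blocks_per_page);
		}
	}
	printf("no holes found\n");
	close(fd);
	return 0;
}

A file that prints "hole at block N" here is the same file the kernel code rejects with "swapon: swapfile has holes".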
*/ int swap_writepage(struct page *page, struct writeback_control *wbc) { - struct bio *bio; - int ret = 0, rw = WRITE; + int ret = 0; if (try_to_free_swap(page)) { unlock_page(page); goto out; } - bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); + if (frontswap_store(page) == 0) { + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + goto out; + } + ret = __swap_writepage(page, wbc, end_swap_bio_write); +out: + return ret; +} + +int __swap_writepage(struct page *page, struct writeback_control *wbc, + void (*end_write_func)(struct bio *, int)) +{ + struct bio *bio; + int ret = 0, rw = WRITE; + struct swap_info_struct *sis = page_swap_info(page); + + if (sis->flags & SWP_FILE) { + struct kiocb kiocb; + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct iovec iov = { + .iov_base = kmap(page), + .iov_len = PAGE_SIZE, + }; + + init_sync_kiocb(&kiocb, swap_file); + kiocb.ki_pos = page_file_offset(page); + kiocb.ki_nbytes = PAGE_SIZE; + + set_page_writeback(page); + unlock_page(page); + ret = mapping->a_ops->direct_IO(KERNEL_WRITE, + &kiocb, &iov, + kiocb.ki_pos, 1); + kunmap(page); + if (ret == PAGE_SIZE) { + count_vm_event(PSWPOUT); + ret = 0; + } else { + /* + * In the case of swap-over-nfs, this can be a + * temporary failure if the system has limited + * memory for allocating transmit buffers. + * Mark the page dirty and avoid + * rotate_reclaimable_page; rate-limit the + * messages, but do not flag PageError as in + * the normal direct-to-bio case, since the + * failure could be temporary. + */ + set_page_dirty(page); + ClearPageReclaim(page); + pr_err_ratelimited("Write error on dio swapfile (%Lu)\n", + page_file_offset(page)); + } + end_page_writeback(page); + return ret; + } + + bio = get_swap_bio(GFP_NOIO, page, end_write_func); if (bio == NULL) { set_page_dirty(page); unlock_page(page); @@ -119,9 +318,26 @@ int swap_readpage(struct page *page) { struct bio *bio; int ret = 0; + struct swap_info_struct *sis = page_swap_info(page); + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageUptodate(page), page); + if (frontswap_load(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } + + if (sis->flags & SWP_FILE) { + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + + ret = mapping->a_ops->readpage(swap_file, page); + if (!ret) + count_vm_event(PSWPIN); + return ret; + } - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageUptodate(page)); bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); @@ -133,3 +349,15 @@ int swap_readpage(struct page *page) out: return ret; } + +int swap_set_page_dirty(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + + if (sis->flags & SWP_FILE) { + struct address_space *mapping = sis->swap_file->f_mapping; + return mapping->a_ops->set_page_dirty(page); + } else { + return __set_page_dirty_no_writeback(page); + } +} diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 4ae42bb40892..d1473b2e9481 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -5,8 +5,88 @@ #include <linux/mm.h> #include <linux/page-isolation.h> #include <linux/pageblock-flags.h> +#include <linux/memory.h> +#include <linux/hugetlb.h> #include "internal.h" +int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) +{ + struct zone *zone; + unsigned long flags, pfn; + struct memory_isolate_notify arg; + int notifier_ret; + int ret = -EBUSY; +
+ zone = page_zone(page); + + spin_lock_irqsave(&zone->lock, flags); + + pfn = page_to_pfn(page); + arg.start_pfn = pfn; + arg.nr_pages = pageblock_nr_pages; + arg.pages_found = 0; + + /* + * It may be possible to isolate a pageblock even if the + * migratetype is not MIGRATE_MOVABLE. The memory isolation + * notifier chain is used by balloon drivers to return the + * number of pages in a range that are held by the balloon + * driver to shrink memory. If all the pages are accounted for + * by balloons, are free, or on the LRU, isolation can continue. + * Later, for example, when memory hotplug notifier runs, these + * pages reported as "can be isolated" should be isolated (freed) + * by the balloon driver through the memory notifier chain. + */ + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); + notifier_ret = notifier_to_errno(notifier_ret); + if (notifier_ret) + goto out; + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. + * We just check MOVABLE pages. + */ + if (!has_unmovable_pages(zone, page, arg.pages_found, + skip_hwpoisoned_pages)) + ret = 0; + + /* + * immobile means "not-on-lru" pages. If immobile is larger than + * removable-by-driver pages reported by the notifier, we'll fail. + */ + +out: + if (!ret) { + unsigned long nr_pages; + int migratetype = get_pageblock_migratetype(page); + + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + + __mod_zone_freepage_state(zone, -nr_pages, migratetype); + } + + spin_unlock_irqrestore(&zone->lock, flags); + if (!ret) + drain_all_pages(); + return ret; +} + +void unset_migratetype_isolate(struct page *page, unsigned migratetype) +{ + struct zone *zone; + unsigned long flags, nr_pages; + + zone = page_zone(page); + spin_lock_irqsave(&zone->lock, flags); + if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + goto out; + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_freepage_state(zone, nr_pages, migratetype); + set_pageblock_migratetype(page, migratetype); +out: + spin_unlock_irqrestore(&zone->lock, flags); +} + static inline struct page * __first_valid_page(unsigned long pfn, unsigned long nr_pages) { @@ -24,6 +104,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * to be MIGRATE_ISOLATE. * @start_pfn: The lower PFN of the range to be isolated. * @end_pfn: The upper PFN of the range to be isolated. + * @migratetype: migrate type to set in error recovery. * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the @@ -32,8 +113,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * start_pfn/end_pfn must be aligned to pageblock_order. * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
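[Editor's aside] The notifier chain consulted in set_migratetype_isolate() above is the one drivers register against with register_memory_isolate_notifier(). A minimal sketch of such a client follows; it is illustrative only, and balloon_holds_pfn() is a hypothetical stub standing in for a real driver's page tracking.

/* Illustrative memory-isolate notifier client, in the style of the
 * balloon drivers the comment above refers to (sketch, not a real driver). */
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/notifier.h>

static bool balloon_holds_pfn(unsigned long pfn)
{
	return false;	/* stub: a real driver would check its page list */
}

static int balloon_isolate_notify(struct notifier_block *self,
				  unsigned long action, void *arg)
{
	struct memory_isolate_notify *mn = arg;
	unsigned long pfn;

	if (action != MEM_ISOLATE_COUNT)
		return NOTIFY_OK;

	/*
	 * Report how many pages in [start_pfn, start_pfn + nr_pages)
	 * are merely held by this driver; set_migratetype_isolate()
	 * passes the total to has_unmovable_pages() as @count.
	 */
	for (pfn = mn->start_pfn; pfn < mn->start_pfn + mn->nr_pages; pfn++)
		if (balloon_holds_pfn(pfn))
			mn->pages_found++;
	return NOTIFY_OK;
}

static struct notifier_block balloon_isolate_nb = {
	.notifier_call = balloon_isolate_notify,
};

static int __init balloon_demo_init(void)
{
	return register_memory_isolate_notifier(&balloon_isolate_nb);
}
module_init(balloon_demo_init);
MODULE_LICENSE("GPL");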
*/ -int -start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) +int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + unsigned migratetype, bool skip_hwpoisoned_pages) { unsigned long pfn; unsigned long undo_pfn; @@ -46,7 +127,8 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && set_migratetype_isolate(page)) { + if (page && + set_migratetype_isolate(page, skip_hwpoisoned_pages)) { undo_pfn = pfn; goto undo; } @@ -56,7 +138,7 @@ undo: for (pfn = start_pfn; pfn < undo_pfn; pfn += pageblock_nr_pages) - unset_migratetype_isolate(pfn_to_page(pfn)); + unset_migratetype_isolate(pfn_to_page(pfn), migratetype); return -EBUSY; } @@ -64,8 +146,8 @@ undo: /* * Make isolated pages available again. */ -int -undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) +int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + unsigned migratetype) { unsigned long pfn; struct page *page; @@ -77,7 +159,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) page = __first_valid_page(pfn, pageblock_nr_pages); if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) continue; - unset_migratetype_isolate(page); + unset_migratetype_isolate(page, migratetype); } return 0; } @@ -86,10 +168,11 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 1 if all pages in the range is isolated. + * Returns 1 if all pages in the range are isolated. */ static int -__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) +__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, + bool skip_hwpoisoned_pages) { struct page *page; @@ -99,11 +182,34 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) continue; } page = pfn_to_page(pfn); - if (PageBuddy(page)) + if (PageBuddy(page)) { + /* + * If a race between isolation and allocation happens, + * some free pages could be in MIGRATE_MOVABLE list + * although pageblock's migration type of the page + * is MIGRATE_ISOLATE. Catch it and move the page into + * MIGRATE_ISOLATE list. + */ + if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { + struct page *end_page; + + end_page = page + (1 << page_order(page)) - 1; + move_freepages(page_zone(page), page, end_page, + MIGRATE_ISOLATE); + } pfn += 1 << page_order(page); + } else if (page_count(page) == 0 && - page_private(page) == MIGRATE_ISOLATE) + get_freepage_migratetype(page) == MIGRATE_ISOLATE) pfn += 1; + else if (skip_hwpoisoned_pages && PageHWPoison(page)) { + /* + * The HWPoisoned page may not be in the buddy + * system, and page_count() is not 0. + */ + pfn++; + continue; + } else break; } @@ -112,7 +218,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) return 1; } -int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) +int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, + bool skip_hwpoisoned_pages) { unsigned long pfn, flags; struct page *page; @@ -120,9 +227,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) int ret; /* - * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page - * is not aligned to pageblock_nr_pages. - * Then we just check pagetype fist.
+ * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages + * are not aligned to pageblock_nr_pages. + * Then we just check migratetype first. */ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); @@ -132,10 +239,35 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) page = __first_valid_page(start_pfn, end_pfn - start_pfn); if ((pfn < end_pfn) || !page) return -EBUSY; - /* Check all pages are free or Marked as ISOLATED */ + /* Check all pages are free or marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); + ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, + skip_hwpoisoned_pages); spin_unlock_irqrestore(&zone->lock, flags); return ret ? 0 : -EBUSY; } + +struct page *alloc_migrate_target(struct page *page, unsigned long private, + int **resultp) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + /* + * TODO: allocate a destination hugepage from a nearest neighbor node, + * in accordance with the memory policy of the user process if possible. For + * now as a simple work-around, we use the next node for destination. + */ + if (PageHuge(page)) { + nodemask_t src = nodemask_of_node(page_to_nid(page)); + nodemask_t dst; + nodes_complement(dst, src); + return alloc_huge_page_node(page_hstate(compound_head(page)), + next_node(page_to_nid(page), dst)); + } + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); +} diff --git a/mm/pagewalk.c b/mm/pagewalk.c index aa9701e12714..2beeabf502c5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -58,7 +58,7 @@ again: if (!walk->pte_entry) continue; - split_huge_page_pmd(walk->mm, pmd); + split_huge_page_pmd_mm(walk->mm, addr, pmd); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); @@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. */ - if (!walk->hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); - vma = find_vma(walk->mm, addr); - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - return NULL; -} - static int walk_hugetlb_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -162,7 +141,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, /** * walk_page_range - walk a memory map's page tables with a callback - * @mm: memory map to walk * @addr: starting address * @end: ending address * @walk: set of callbacks to invoke for each level of the tree @@ -199,30 +177,53 @@ int walk_page_range(unsigned long addr, unsigned long end, if (!walk->mm) return -EINVAL; + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* - * handle hugetlb vma individually because pagetable walk for - * the hugetlb page is dependent on the architecture and - * we can't handled it in the same manner as non-huge pages. + * This function was not intended to be vma based.
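[Editor's aside] After this change walk_page_range() looks up vmas itself and asserts that mmap_sem is already held, so a caller only fills in struct mm_walk and takes the lock. A minimal sketch of such a caller follows; count_present() and count_present_pages() are made-up names for illustration, not kernel functions.

/* Illustrative walk_page_range() caller, assuming the post-patch
 * contract: caller holds mmap_sem, walker finds vmas itself. */
#include <linux/mm.h>
#include <linux/sched.h>

static int count_present(pte_t *pte, unsigned long addr,
			 unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;		/* a non-zero return aborts the walk */
}

static unsigned long count_present_pages(struct mm_struct *mm,
					 unsigned long start,
					 unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pte_entry	= count_present,
		.mm		= mm,
		.private	= &count,
	};

	down_read(&mm->mmap_sem);	/* the walker now asserts this */
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);
	return count;
}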
+ * But there are vma special cases to be handled: + * - hugetlb vma's + * - VM_PFNMAP vma's */ - vma = hugetlb_vma(addr, walk); + vma = find_vma(walk->mm, addr); if (vma) { - if (vma->vm_end < next) + /* + * There are no page structures backing a VM_PFNMAP + * range, so do not allow split_huge_page_pmd(). + */ + if ((vma->vm_start <= addr) && + (vma->vm_flags & VM_PFNMAP)) { next = vma->vm_end; + pgd = pgd_offset(walk->mm, next); + continue; + } /* - * Hugepage is very tightly coupled with vma, so - * walk through hugetlb entries within a given vma. + * Handle hugetlb vma individually because pagetable + * walk for the hugetlb page is dependent on the + * architecture and we can't handle it in the same + * manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; + if (walk->hugetlb_entry && (vma->vm_start <= addr) && + is_vm_hugetlb_page(vma)) { + if (vma->vm_end < next) + next = vma->vm_end; + /* + * Hugepage is very tightly coupled with vma, + * so walk through hugetlb entries within a + * given vma. + */ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + break; + pgd = pgd_offset(walk->mm, next); + continue; + } } if (pgd_none_or_clear_bad(pgd)) { @@ -241,7 +242,7 @@ int walk_page_range(unsigned long addr, unsigned long end, if (err) break; pgd++; - } while (addr = next, addr != end); + } while (addr = next, addr < end); return err; } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 405d331804c3..3707c71ae4cd 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -360,7 +360,6 @@ err_free: * @chunk: chunk to depopulate * @off: offset to the area to depopulate * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. If @flush is true, vcache is flushed before unmapping diff --git a/mm/percpu.c b/mm/percpu.c index bb4be7435ce3..2ddf9a990dbd 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -102,10 +102,11 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ - int map_used; /* # of map entries used */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ void *data; /* chunk data */ + int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ unsigned long populated[]; /* populated bitmap */ }; @@ -356,11 +357,11 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk) { int new_alloc; - if (chunk->map_alloc >= chunk->map_used + 2) + if (chunk->map_alloc >= chunk->map_used + 3) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 2) + while (new_alloc < chunk->map_used + 3) new_alloc *= 2; return new_alloc; } @@ -418,48 +419,6 @@ out_unlock: } /** - * pcpu_split_block - split a map block - * @chunk: chunk of interest - * @i: index of map block to split - * @head: head size in bytes (can be 0) - * @tail: tail size in bytes (can be 0) - * - * Split the @i'th map block into two or three blocks. If @head is - * non-zero, @head bytes block is inserted before block @i moving it - * to @i+1 and reducing its size by @head bytes. - * - * If @tail is non-zero, the target block, which can be @i or @i+1 - * depending on @head, is reduced by @tail bytes and @tail byte block - * is inserted after the target block.
- * - * @chunk->map must have enough free slots to accommodate the split. - * - * CONTEXT: - * pcpu_lock. - */ -static void pcpu_split_block(struct pcpu_chunk *chunk, int i, - int head, int tail) -{ - int nr_extra = !!head + !!tail; - - BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); - - /* insert new subblocks */ - memmove(&chunk->map[i + nr_extra], &chunk->map[i], - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - chunk->map[i + 1] = chunk->map[i] - head; - chunk->map[i++] = head; - } - if (tail) { - chunk->map[i++] -= tail; - chunk->map[i] = tail; - } -} - -/** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes @@ -483,19 +442,27 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; int i, off; + bool seen_free = false; + int *p; - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { - bool is_last = i + 1 == chunk->map_used; + for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) { int head, tail; + int this_size; + + off = *p; + if (off & 1) + continue; /* extra for alignment requirement */ head = ALIGN(off, align) - off; - BUG_ON(i == 0 && head != 0); - if (chunk->map[i] < 0) - continue; - if (chunk->map[i] < head + size) { - max_contig = max(chunk->map[i], max_contig); + this_size = (p[1] & ~1) - off; + if (this_size < head + size) { + if (!seen_free) { + chunk->first_free = i; + seen_free = true; + } + max_contig = max(this_size, max_contig); continue; } @@ -505,44 +472,59 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * than sizeof(int), which is very small but isn't too * uncommon for percpu allocations. 
*/ - if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { - if (chunk->map[i - 1] > 0) - chunk->map[i - 1] += head; - else { - chunk->map[i - 1] -= head; + if (head && (head < sizeof(int) || !(p[-1] & 1))) { + *p = off += head; + if (p[-1] & 1) chunk->free_size -= head; - } - chunk->map[i] -= head; - off += head; + else + max_contig = max(*p - p[-1], max_contig); + this_size -= head; head = 0; } /* if tail is small, just keep it around */ - tail = chunk->map[i] - head - size; - if (tail < sizeof(int)) + tail = this_size - head - size; + if (tail < sizeof(int)) { tail = 0; + size = this_size - head; + } /* split if warranted */ if (head || tail) { - pcpu_split_block(chunk, i, head, tail); + int nr_extra = !!head + !!tail; + + /* insert new subblocks */ + memmove(p + nr_extra + 1, p + 1, + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + if (head) { - i++; - off += head; - max_contig = max(chunk->map[i - 1], max_contig); + if (!seen_free) { + chunk->first_free = i; + seen_free = true; + } + *++p = off += head; + ++i; + max_contig = max(head, max_contig); + } + if (tail) { + p[1] = off + size; + max_contig = max(tail, max_contig); } - if (tail) - max_contig = max(chunk->map[i + 1], max_contig); } + if (!seen_free) + chunk->first_free = i + 1; + /* update hint and mark allocated */ - if (is_last) + if (i + 1 == chunk->map_used) chunk->contig_hint = max_contig; /* fully scanned */ else chunk->contig_hint = max(chunk->contig_hint, max_contig); - chunk->free_size -= chunk->map[i]; - chunk->map[i] = -chunk->map[i]; + chunk->free_size -= size; + *p |= 1; pcpu_chunk_relocate(chunk, oslot); return off; @@ -570,34 +552,50 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) { int oslot = pcpu_chunk_slot(chunk); - int i, off; - - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) - if (off == freeme) - break; + int off = 0; + unsigned i, j; + int to_free = 0; + int *p; + + freeme |= 1; /* we are searching for <given offset, in use> pair */ + + i = 0; + j = chunk->map_used; + while (i != j) { + unsigned k = (i + j) / 2; + off = chunk->map[k]; + if (off < freeme) + i = k + 1; + else if (off > freeme) + j = k; + else + i = j = k; + } BUG_ON(off != freeme); - BUG_ON(chunk->map[i] > 0); - chunk->map[i] = -chunk->map[i]; - chunk->free_size += chunk->map[i]; + if (i < chunk->first_free) + chunk->first_free = i; + p = chunk->map + i; + *p = off &= ~1; + chunk->free_size += (p[1] & ~1) - off; + + /* merge with next? */ + if (!(p[1] & 1)) + to_free++; /* merge with previous? */ - if (i > 0 && chunk->map[i - 1] >= 0) { - chunk->map[i - 1] += chunk->map[i]; - chunk->map_used--; - memmove(&chunk->map[i], &chunk->map[i + 1], - (chunk->map_used - i) * sizeof(chunk->map[0])); + if (i > 0 && !(p[-1] & 1)) { + to_free++; i--; + p--; } - /* merge with next? 
*/ - if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { - chunk->map[i] += chunk->map[i + 1]; - chunk->map_used--; - memmove(&chunk->map[i + 1], &chunk->map[i + 2], - (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + if (to_free) { + chunk->map_used -= to_free; + memmove(p + 1, p + 1 + to_free, + (chunk->map_used - i) * sizeof(chunk->map[0])); } - chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); pcpu_chunk_relocate(chunk, oslot); } @@ -612,12 +610,14 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); if (!chunk->map) { - kfree(chunk); + pcpu_mem_free(chunk, pcpu_chunk_struct_size); return NULL; } chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = pcpu_unit_size; + chunk->map[0] = 0; + chunk->map[1] = pcpu_unit_size | 1; + chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; @@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) if (!chunk) return; pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); - kfree(chunk); + pcpu_mem_free(chunk, pcpu_chunk_struct_size); } /* @@ -713,6 +713,16 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) unsigned long flags; void __percpu *ptr; + /* + * We want the lowest bit of offset available for in-use/free + * indicator, so force >= 16bit alignment and make size even. + */ + if (unlikely(align < 2)) + align = 2; + + if (unlikely(size & 1)) + size++; + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); @@ -1063,7 +1073,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); - ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); if (!ptr) return NULL; ai = ptr; @@ -1088,7 +1098,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { - free_bootmem(__pa(ai), ai->__ai_size); + memblock_free_early(__pa(ai), ai->__ai_size); } /** @@ -1246,10 +1256,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); - group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); - unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); - unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); + group_offsets = memblock_virt_alloc(ai->nr_groups * + sizeof(group_offsets[0]), 0); + group_sizes = memblock_virt_alloc(ai->nr_groups * + sizeof(group_sizes[0]), 0); + unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); + unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -1311,7 +1323,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * empty chunks. 
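[Editor's aside] The new encoding is easiest to see outside the kernel: map[] stores area offsets rather than lengths, bit 0 of each entry marks the area as in use, and a sentinel entry at map[map_used] carries the end offset, so an area's size is the delta between neighbouring entries. The following stand-alone model is illustrative only, not kernel code.

/* Stand-alone model of the new chunk->map encoding: offset | in-use
 * bit per entry, sentinel at map[map_used] holding the unit size. */
#include <stdio.h>

#define IN_USE 1

int main(void)
{
	/* a 16-byte unit: [0,4) in use, [4,12) free, [12,16) in use */
	int map[] = { 0 | IN_USE, 4, 12 | IN_USE, 16 | IN_USE };
	int map_used = 3;	/* entries before the sentinel */
	int i;

	for (i = 0; i < map_used; i++) {
		int off  = map[i] & ~IN_USE;
		int size = (map[i + 1] & ~IN_USE) - off;

		printf("area at %2d, size %2d, %s\n", off, size,
		       (map[i] & IN_USE) ? "in use" : "free");
	}
	return 0;
}

Freeing an area in this scheme is just clearing bit 0 of its entry and, as pcpu_free_area() above does, merging with a free neighbour by dropping the neighbour's entry; the sentinel is why pcpu_need_to_extend() now reserves map_used + 3 slots instead of + 2.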
*/ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + pcpu_slot = memblock_virt_alloc( + pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); @@ -1322,7 +1335,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * covers static area + reserved area (mostly used for module * static percpu allocation). */ - schunk = alloc_bootmem(pcpu_chunk_struct_size); + schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); schunk->base_addr = base_addr; schunk->map = smap; @@ -1340,13 +1353,17 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -ai->static_size; + schunk->map[0] = 1; + schunk->map[1] = ai->static_size; + schunk->map_used = 1; if (schunk->free_size) - schunk->map[schunk->map_used++] = schunk->free_size; + schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size); + else + schunk->map[1] |= 1; /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(pcpu_chunk_struct_size); + dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); dchunk->base_addr = base_addr; dchunk->map = dmap; @@ -1355,8 +1372,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; - dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; - dchunk->map[dchunk->map_used++] = dchunk->free_size; + dchunk->map[0] = 1; + dchunk->map[1] = pcpu_reserved_chunk_limit; + dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; + dchunk->map_used = 2; } /* link the first chunk in */ @@ -1370,7 +1389,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, #ifdef CONFIG_SMP -const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { +const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", @@ -1380,6 +1399,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; static int __init percpu_alloc_setup(char *str) { + if (!str) + return -EINVAL; + if (0) /* nada */; #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK @@ -1623,7 +1645,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - areas = alloc_bootmem_nopanic(areas_size); + areas = memblock_virt_alloc_nopanic(areas_size, 0); if (!areas) { rc = -ENOMEM; goto out_free; @@ -1683,10 +1705,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, max_distance += ai->unit_size; /* warn if maximum distance is further than 75% of vmalloc space */ - if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { + if (max_distance > VMALLOC_TOTAL * 3 / 4) { pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " "space 0x%lx\n", max_distance, - (unsigned long)(VMALLOC_END - VMALLOC_START)); + VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; @@ -1703,12 +1725,13 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, out_free_areas: for (group = 0; group < ai->nr_groups; group++) - free_fn(areas[group], - ai->groups[group].nr_units * ai->unit_size); + if (areas[group]) + 
free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); if (areas) - free_bootmem(__pa(areas), areas_size); + memblock_free_early(__pa(areas), areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ @@ -1756,7 +1779,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = alloc_bootmem(pages_size); + pages = memblock_virt_alloc(pages_size, 0); /* allocate pages */ j = 0; @@ -1819,7 +1842,7 @@ enomem: free_fn(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: - free_bootmem(__pa(pages), pages_size); + memblock_free_early(__pa(pages), pages_size); pcpu_free_alloc_info(ai); return rc; } @@ -1844,12 +1867,13 @@ EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); + return memblock_virt_alloc_from_nopanic( + size, align, __pa(MAX_DMA_ADDRESS)); } static void __init pcpu_dfl_fc_free(void *ptr, size_t size) { - free_bootmem(__pa(ptr), size); + memblock_free_early(__pa(ptr), size); } void __init setup_per_cpu_areas(void) @@ -1892,7 +1916,9 @@ void __init setup_per_cpu_areas(void) void *fc; ai = pcpu_alloc_alloc_info(1, 1); - fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + fc = memblock_virt_alloc_from_nopanic(unit_size, + PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5a74fea182f1..a8b919925934 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,10 +10,34 @@ #include <asm/tlb.h> #include <asm-generic/pgtable.h> +/* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. + */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* - * Only sets the access flags (dirty, accessed, and - * writable). Furthermore, we know it always gets set to a "more + * Only sets the access flags (dirty, accessed), as well as write + * permission. Furthermore, we know it always gets set to a "more * permissive" setting, which allows most architectures to optimize * this. We return whether the PTE actually changed, which in turn * instructs the caller to do things like update__mmu_cache. 
This @@ -27,7 +51,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(*ptep, entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); - flush_tlb_page(vma, address); + flush_tlb_fix_spurious_fault(vma, address); } return changed; } @@ -86,9 +110,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + struct mm_struct *mm = (vma)->vm_mm; pte_t pte; - pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); - flush_tlb_page(vma, address); + pte = ptep_get_and_clear(mm, address, ptep); + if (pte_accessible(mm, pte)) + flush_tlb_page(vma, address); return pte; } #endif @@ -109,8 +135,8 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE -pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) { pmd_t pmd = pmd_mksplitting(*pmdp); VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -120,3 +146,57 @@ pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif + +#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); + pmd_huge_pte(mm, pmdp) = pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + if (list_empty(&pgtable->lru)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PMDP_INVALIDATE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t entry = *pmdp; + if (pmd_numa(entry)) + entry = pmd_mknonnuma(entry); + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd7cd8c..000000000000 --- a/mm/prio_tree.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * mm/prio_tree.c - priority search tree for mapping->i_mmap - * - * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu> - * - * This file is released under the GPL v2. - * - * Based on the radix priority search tree proposed by Edward M. McCreight - * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 - * - * 02Feb2004 Initial version - */ - -#include <linux/mm.h> -#include <linux/prio_tree.h> -#include <linux/prefetch.h> - -/* - * See lib/prio_tree.c for details on the general radix priority search tree - * code. - */ - -/* - * The following #defines are mirrored from lib/prio_tree.c. 
They're only used - * for debugging, and should be removed (along with the debugging code using - * them) when switching also VMAs to the regular prio_tree code. - */ - -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - -/* - * Radix priority search tree for address_space->i_mmap - * - * For each vma that map a unique set of file pages i.e., unique [radix_index, - * heap_index] value, we have a corresponding priority search tree node. If - * multiple vmas have identical [radix_index, heap_index] value, then one of - * them is used as a tree node and others are stored in a vm_set list. The tree - * node points to the first vma (head) of the list using vm_set.head. - * - * prio_tree_root - * | - * A vm_set.head - * / \ / - * L R -> H-I-J-K-M-N-O-P-Q-S - * ^ ^ <-- vm_set.list --> - * tree nodes - * - * We need some way to identify whether a vma is a tree node, head of a vm_set - * list, or just a member of a vm_set list. We cannot use vm_flags to store - * such information. The reason is, in the above figure, it is possible that - * vm_flags' of R and H are covered by the different mmap_sems. When R is - * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold - * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. - * That's why some trick involving shared.vm_set.parent is used for identifying - * tree nodes and list head nodes. - * - * vma radix priority search tree node rules: - * - * vma->shared.vm_set.parent != NULL ==> a tree node - * vma->shared.vm_set.head != NULL ==> list of others mapping same range - * vma->shared.vm_set.head == NULL ==> no others map the same range - * - * vma->shared.vm_set.parent == NULL - * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range - * vma->shared.vm_set.head == NULL ==> a list node - */ - -/* - * Add a new vma known to map the same set of pages as the old vma: - * useful for fork's dup_mmap as well as vma_prio_tree_insert below. - * Note that it just happens to work correctly on i_mmap_nonlinear too. 
- */ -void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) -{ - /* Leave these BUG_ONs till prio_tree patch stabilizes */ - BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); - BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); - - vma->shared.vm_set.head = NULL; - vma->shared.vm_set.parent = NULL; - - if (!old->shared.vm_set.parent) - list_add(&vma->shared.vm_set.list, - &old->shared.vm_set.list); - else if (old->shared.vm_set.head) - list_add_tail(&vma->shared.vm_set.list, - &old->shared.vm_set.head->shared.vm_set.list); - else { - INIT_LIST_HEAD(&vma->shared.vm_set.list); - vma->shared.vm_set.head = old; - old->shared.vm_set.head = vma; - } -} - -void vma_prio_tree_insert(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *old; - - vma->shared.vm_set.head = NULL; - - ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); - if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { - old = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - vma_prio_tree_add(vma, old); - } -} - -void vma_prio_tree_remove(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct vm_area_struct *node, *head, *new_head; - - if (!vma->shared.vm_set.head) { - if (!vma->shared.vm_set.parent) - list_del_init(&vma->shared.vm_set.list); - else - raw_prio_tree_remove(root, &vma->shared.prio_tree_node); - } else { - /* Leave this BUG_ON till prio_tree patch stabilizes */ - BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); - if (vma->shared.vm_set.parent) { - head = vma->shared.vm_set.head; - if (!list_empty(&head->shared.vm_set.list)) { - new_head = list_entry( - head->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&head->shared.vm_set.list); - } else - new_head = NULL; - - raw_prio_tree_replace(root, &vma->shared.prio_tree_node, - &head->shared.prio_tree_node); - head->shared.vm_set.head = new_head; - if (new_head) - new_head->shared.vm_set.head = head; - - } else { - node = vma->shared.vm_set.head; - if (!list_empty(&vma->shared.vm_set.list)) { - new_head = list_entry( - vma->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&vma->shared.vm_set.list); - node->shared.vm_set.head = new_head; - new_head->shared.vm_set.head = node; - } else - node->shared.vm_set.head = NULL; - } - } -} - -/* - * Helper function to enumerate vmas that map a given file page or a set of - * contiguous file pages. The function returns vmas that at least map a single - * page in the given range of contiguous file pages. 
- */ -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *next; - - if (!vma) { - /* - * First call is with NULL vma - */ - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; - } - - if (vma->shared.vm_set.parent) { - if (vma->shared.vm_set.head) { - next = vma->shared.vm_set.head; - prefetch(next->shared.vm_set.list.next); - return next; - } - } else { - next = list_entry(vma->shared.vm_set.list.next, - struct vm_area_struct, shared.vm_set.list); - if (!next->shared.vm_set.head) { - prefetch(next->shared.vm_set.list.next); - return next; - } - } - - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; -} diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index c20ff48994c2..8505c9262b35 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -23,129 +23,44 @@ /** * process_vm_rw_pages - read/write pages from task specified - * @task: task to read/write from - * @mm: mm for task - * @process_pages: struct pages area that can store at least - * nr_pages_to_copy struct page pointers - * @pa: address of page in task to start copying from/to + * @pages: array of pointers to pages we want to copy * @start_offset: offset in page to start copying from/to * @len: number of bytes to copy - * @lvec: iovec array specifying where to copy to/from - * @lvec_cnt: number of elements in iovec array - * @lvec_current: index in iovec array we are up to - * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @iter: where to copy to/from locally * @vm_write: 0 means copy from, 1 means copy to - * @nr_pages_to_copy: number of pages to copy - * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success, error code otherwise */ -static int process_vm_rw_pages(struct task_struct *task, - struct mm_struct *mm, - struct page **process_pages, - unsigned long pa, - unsigned long start_offset, - unsigned long len, - const struct iovec *lvec, - unsigned long lvec_cnt, - unsigned long *lvec_current, - size_t *lvec_offset, - int vm_write, - unsigned int nr_pages_to_copy, - ssize_t *bytes_copied) +static int process_vm_rw_pages(struct page **pages, + unsigned offset, + size_t len, + struct iov_iter *iter, + int vm_write) { - int pages_pinned; - void *target_kaddr; - int pgs_copied = 0; - int j; - int ret; - ssize_t bytes_to_copy; - ssize_t rc = 0; - - *bytes_copied = 0; - - /* Get the pages we're interested in */ - down_read(&mm->mmap_sem); - pages_pinned = get_user_pages(task, mm, pa, - nr_pages_to_copy, - vm_write, 0, process_pages, NULL); - up_read(&mm->mmap_sem); - - if (pages_pinned != nr_pages_to_copy) { - rc = -EFAULT; - goto end; - } - /* Do the copy for each page */ - for (pgs_copied = 0; - (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); - pgs_copied++) { - /* Make sure we have a non zero length iovec */ - while (*lvec_current < lvec_cnt - && lvec[*lvec_current].iov_len == 0) - (*lvec_current)++; - if (*lvec_current == lvec_cnt) - break; - - /* - * Will copy smallest of: - * - bytes remaining in page - * - bytes remaining in destination iovec - */ - bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, - len - *bytes_copied); - 
bytes_to_copy = min_t(ssize_t, bytes_to_copy, - lvec[*lvec_current].iov_len - - *lvec_offset); - - target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; - - if (vm_write) - ret = copy_from_user(target_kaddr, - lvec[*lvec_current].iov_base - + *lvec_offset, - bytes_to_copy); - else - ret = copy_to_user(lvec[*lvec_current].iov_base - + *lvec_offset, - target_kaddr, bytes_to_copy); - kunmap(process_pages[pgs_copied]); - if (ret) { - *bytes_copied += bytes_to_copy - ret; - pgs_copied++; - rc = -EFAULT; - goto end; - } - *bytes_copied += bytes_to_copy; - *lvec_offset += bytes_to_copy; - if (*lvec_offset == lvec[*lvec_current].iov_len) { - /* - * Need to copy remaining part of page into the - * next iovec if there are any bytes left in page - */ - (*lvec_current)++; - *lvec_offset = 0; - start_offset = (start_offset + bytes_to_copy) - % PAGE_SIZE; - if (start_offset) - pgs_copied--; + while (len && iov_iter_count(iter)) { + struct page *page = *pages++; + size_t copy = PAGE_SIZE - offset; + size_t copied; + + if (copy > len) + copy = len; + + if (vm_write) { + if (copy > iov_iter_count(iter)) + copy = iov_iter_count(iter); + copied = iov_iter_copy_from_user(page, iter, + offset, copy); + iov_iter_advance(iter, copied); + set_page_dirty_lock(page); } else { - start_offset = 0; - } - } - -end: - if (vm_write) { - for (j = 0; j < pages_pinned; j++) { - if (j < pgs_copied) - set_page_dirty_lock(process_pages[j]); - put_page(process_pages[j]); + copied = copy_page_to_iter(page, offset, copy, iter); } - } else { - for (j = 0; j < pages_pinned; j++) - put_page(process_pages[j]); + len -= copied; + if (copied < copy && iov_iter_count(iter)) + return -EFAULT; + offset = 0; } - - return rc; + return 0; } /* Maximum number of pages kmalloc'd to hold struct page's during copy */ @@ -155,67 +70,60 @@ end: * process_vm_rw_single_vec - read/write pages from task specified * @addr: start memory address of target process * @len: size of area to copy to/from - * @lvec: iovec array specifying where to copy to/from locally - * @lvec_cnt: number of elements in iovec array - * @lvec_current: index in iovec array we are up to - * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @iter: where to copy to/from locally * @process_pages: struct pages area that can store at least * nr_pages_to_copy struct page pointers * @mm: mm for task * @task: task to read/write from * @vm_write: 0 means copy from, 1 means copy to - * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success or on failure error code */ static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, - const struct iovec *lvec, - unsigned long lvec_cnt, - unsigned long *lvec_current, - size_t *lvec_offset, + struct iov_iter *iter, struct page **process_pages, struct mm_struct *mm, struct task_struct *task, - int vm_write, - ssize_t *bytes_copied) + int vm_write) { unsigned long pa = addr & PAGE_MASK; unsigned long start_offset = addr - pa; unsigned long nr_pages; - ssize_t bytes_copied_loop; ssize_t rc = 0; - unsigned long nr_pages_copied = 0; - unsigned long nr_pages_to_copy; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - *bytes_copied = 0; - /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; - while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { - nr_pages_to_copy = min(nr_pages - nr_pages_copied, - max_pages_per_loop); + while (!rc && nr_pages && 
iov_iter_count(iter)) { + int pages = min(nr_pages, max_pages_per_loop); + size_t bytes; - rc = process_vm_rw_pages(task, mm, process_pages, pa, - start_offset, len, - lvec, lvec_cnt, - lvec_current, lvec_offset, - vm_write, nr_pages_to_copy, - &bytes_copied_loop); - start_offset = 0; - *bytes_copied += bytes_copied_loop; + /* Get the pages we're interested in */ + down_read(&mm->mmap_sem); + pages = get_user_pages(task, mm, pa, pages, + vm_write, 0, process_pages, NULL); + up_read(&mm->mmap_sem); - if (rc < 0) { - return rc; - } else { - len -= bytes_copied_loop; - nr_pages_copied += nr_pages_to_copy; - pa += nr_pages_to_copy * PAGE_SIZE; - } + if (pages <= 0) + return -EFAULT; + + bytes = pages * PAGE_SIZE - start_offset; + if (bytes > len) + bytes = len; + + rc = process_vm_rw_pages(process_pages, + start_offset, bytes, iter, + vm_write); + len -= bytes; + start_offset = 0; + nr_pages -= pages; + pa += pages * PAGE_SIZE; + while (pages) + put_page(process_pages[--pages]); } return rc; @@ -228,8 +136,7 @@ static int process_vm_rw_single_vec(unsigned long addr, /** * process_vm_rw_core - core of reading/writing pages from task specified * @pid: PID of process to read/write from/to - * @lvec: iovec array specifying where to copy to/from locally - * @liovcnt: size of lvec array + * @iter: where to copy to/from locally * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused @@ -238,8 +145,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * return less bytes than expected if an error occurs during the copying * process. */ -static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, - unsigned long liovcnt, +static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) @@ -250,13 +156,10 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, struct mm_struct *mm; unsigned long i; ssize_t rc = 0; - ssize_t bytes_copied_loop; - ssize_t bytes_copied = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; - unsigned long iov_l_curr_idx = 0; - size_t iov_l_curr_offset = 0; ssize_t iov_len; + size_t total_len = iov_iter_count(iter); /* * Work out how many pages of struct pages we're going to need @@ -310,24 +213,20 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto put_task_struct; } - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { + for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++) rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, - lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, - process_pages, mm, task, vm_write, &bytes_copied_loop); - bytes_copied += bytes_copied_loop; - if (rc != 0) { - /* If we have managed to copy any data at all then - we return the number of bytes copied. Otherwise - we return the error code */ - if (bytes_copied) - rc = bytes_copied; - goto put_mm; - } - } + iter, process_pages, mm, task, vm_write); + + /* copied = space before - space after */ + total_len -= iov_iter_count(iter); + + /* If we have managed to copy any data at all then + we return the number of bytes copied. 
Otherwise + we return the error code */ + if (total_len) + rc = total_len; - rc = bytes_copied; -put_mm: mmput(mm); put_task_struct: @@ -363,6 +262,7 @@ static ssize_t process_vm_rw(pid_t pid, struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; + struct iov_iter iter; ssize_t rc; if (flags != 0) @@ -371,20 +271,21 @@ static ssize_t process_vm_rw(pid_t pid, /* Check iovecs */ if (vm_write) rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, - iovstack_l, &iov_l, 1); + iovstack_l, &iov_l); else rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, - iovstack_l, &iov_l, 1); + iovstack_l, &iov_l); if (rc <= 0) goto free_iovecs; - rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, - iovstack_r, &iov_r, 0); + iov_iter_init(&iter, iov_l, liovcnt, rc, 0); + + rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, + iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; - rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, - vm_write); + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) @@ -412,7 +313,7 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, #ifdef CONFIG_COMPAT -asmlinkage ssize_t +static ssize_t compat_process_vm_rw(compat_pid_t pid, const struct compat_iovec __user *lvec, unsigned long liovcnt, @@ -424,65 +325,56 @@ compat_process_vm_rw(compat_pid_t pid, struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; + struct iov_iter iter; ssize_t rc = -EFAULT; if (flags != 0) return -EINVAL; - if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) - goto out; - - if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) - goto out; - if (vm_write) rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, iovstack_l, - &iov_l, 1); + &iov_l); else rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, iovstack_l, - &iov_l, 1); + &iov_l); if (rc <= 0) goto free_iovecs; - rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, + iov_iter_init(&iter, iov_l, liovcnt, rc, 0); + rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, - &iov_r, 0); + &iov_r); if (rc <= 0) goto free_iovecs; - rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, - vm_write); + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) kfree(iov_r); if (iov_l != iovstack_l) kfree(iov_l); - -out: return rc; } -asmlinkage ssize_t -compat_sys_process_vm_readv(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); } -asmlinkage ssize_t -compat_sys_process_vm_writev(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { 
return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); diff --git a/mm/readahead.c b/mm/readahead.c index cbcbb02f3e28..0ca36a7770b1 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,15 +8,17 @@ */ #include <linux/kernel.h> -#include <linux/fs.h> #include <linux/gfp.h> -#include <linux/mm.h> #include <linux/export.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagevec.h> #include <linux/pagemap.h> +#include <linux/syscalls.h> +#include <linux/file.h> + +#include "internal.h" /* * Initialise a struct file's readahead state. Assumes that the caller has @@ -46,7 +48,7 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping, if (!trylock_page(page)) BUG(); page->mapping = mapping; - do_invalidatepage(page, 0); + do_invalidatepage(page, 0, PAGE_CACHE_SIZE); page->mapping = NULL; unlock_page(page); } @@ -147,8 +149,7 @@ out: * * Returns the number of pages requested, or the maximum amount of I/O allowed. */ -static int -__do_page_cache_readahead(struct address_space *mapping, struct file *filp, +int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size) { @@ -177,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); rcu_read_unlock(); - if (page) + if (page && !radix_tree_exceptional_entry(page)) continue; page = page_cache_alloc_readahead(mapping); @@ -209,8 +210,6 @@ out: int force_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read) { - int ret = 0; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; @@ -224,39 +223,23 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, this_chunk = nr_to_read; err = __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0); - if (err < 0) { - ret = err; - break; - } - ret += err; + if (err < 0) + return err; + offset += this_chunk; nr_to_read -= this_chunk; } - return ret; + return 0; } +#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) /* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); -} - -/* - * Submit IO for the read-ahead request in file_ra_state. 
- */ -unsigned long ra_submit(struct file_ra_state *ra, - struct address_space *mapping, struct file *filp) -{ - int actual; - - actual = __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); - - return actual; + return min(nr, MAX_READAHEAD); } /* @@ -349,7 +332,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + head = page_cache_prev_hole(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -369,10 +352,10 @@ static int try_context_readahead(struct address_space *mapping, size = count_history_pages(mapping, ra, offset, max); /* - * no history pages: + * not enough history pages: * it could be a random read */ - if (!size) + if (size <= req_size) return 0; /* @@ -383,8 +366,8 @@ static int try_context_readahead(struct address_space *mapping, size *= 2; ra->start = offset; - ra->size = get_init_ra_size(size + req_size, max); - ra->async_size = ra->size; + ra->size = min(size + req_size, max); + ra->async_size = 1; return 1; } @@ -399,6 +382,7 @@ ondemand_readahead(struct address_space *mapping, unsigned long req_size) { unsigned long max = max_sane_readahead(ra->ra_pages); + pgoff_t prev_offset; /* * start of file @@ -428,7 +412,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); + start = page_cache_next_hole(mapping, offset + 1, max); rcu_read_unlock(); if (!start || start - offset > max) @@ -450,8 +434,11 @@ ondemand_readahead(struct address_space *mapping, /* * sequential cache miss + * trivial case: (offset - prev_offset) == 1 + * unaligned reads: (offset - prev_offset) == 0 */ - if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) + prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; + if (offset - prev_offset <= 1UL) goto initial_readahead; /* @@ -562,3 +549,33 @@ page_cache_async_readahead(struct address_space *mapping, ondemand_readahead(mapping, ra, filp, true, offset, req_size); } EXPORT_SYMBOL_GPL(page_cache_async_readahead); + +static ssize_t +do_readahead(struct address_space *mapping, struct file *filp, + pgoff_t index, unsigned long nr) +{ + if (!mapping || !mapping->a_ops) + return -EINVAL; + + return force_page_cache_readahead(mapping, filp, index, nr); +} + +SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) +{ + ssize_t ret; + struct fd f; + + ret = -EBADF; + f = fdget(fd); + if (f.file) { + if (f.file->f_mode & FMODE_READ) { + struct address_space *mapping = f.file->f_mapping; + pgoff_t start = offset >> PAGE_CACHE_SHIFT; + pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + unsigned long len = end - start + 1; + ret = do_readahead(mapping, f.file, start, len); + } + fdput(f); + } + return ret; +} diff --git a/mm/rmap.c b/mm/rmap.c index 5b5ad584ffb7..83bfafabb47b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * mm->mmap_sem * page->flags PG_locked (lock_page) * mapping->i_mmap_mutex - * anon_vma->mutex + * anon_vma->rwsem * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) @@ -37,7 +37,7 @@ * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * - * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) + * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock 
*/ @@ -56,6 +56,7 @@ #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> +#include <linux/backing-dev.h> #include <asm/tlbflush.h> @@ -86,25 +87,25 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* - * Synchronize against page_lock_anon_vma() such that + * Synchronize against page_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by - * mutex_trylock() from page_lock_anon_vma(). This orders: + * down_read_trylock() from page_lock_anon_vma_read(). This orders: * - * page_lock_anon_vma() VS put_anon_vma() - * mutex_trylock() atomic_dec_and_test() + * page_lock_anon_vma_read() VS put_anon_vma() + * down_read_trylock() atomic_dec_and_test() * LOCK MB - * atomic_read() mutex_is_locked() + * atomic_read() rwsem_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ - if (mutex_is_locked(&anon_vma->root->mutex)) { - anon_vma_lock(anon_vma); - anon_vma_unlock(anon_vma); + if (rwsem_is_locked(&anon_vma->root->rwsem)) { + anon_vma_lock_write(anon_vma); + anon_vma_unlock_write(anon_vma); } kmem_cache_free(anon_vma_cachep, anon_vma); @@ -127,12 +128,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - - /* - * It's critical to add new vmas to the tail of the anon_vma, - * see comment in huge_memory.c:__split_huge_page(). - */ - list_add_tail(&avc->same_anon_vma, &anon_vma->head); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** @@ -150,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * allocate a new one. * * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma() + * optimistically looked up an anon_vma in page_lock_anon_vma_read() * and that may actually touch the spinlock even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). 
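The mutex-to-rwsem conversion in the surrounding hunks touches every anon_vma lock site in this file. A minimal sketch of what the new helpers are assumed to expand to (the include/linux/rmap.h side of this series, which is not part of this diff):

	static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
	{
		down_write(&anon_vma->root->rwsem);
	}

	static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
	{
		down_read(&anon_vma->root->rwsem);
	}

Read-mostly walkers such as page_referenced() can then traverse the same anon_vma concurrently, while writers linking or unlinking vmas still take the root rwsem exclusively.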
@@ -185,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) allocated = anon_vma; } - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { @@ -195,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) avc = NULL; } spin_unlock(&mm->page_table_lock); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); if (unlikely(allocated)) put_anon_vma(allocated); @@ -223,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct struct anon_vma *new_root = anon_vma->root; if (new_root != root) { if (WARN_ON_ONCE(root)) - mutex_unlock(&root->mutex); + up_write(&root->rwsem); root = new_root; - mutex_lock(&root->mutex); + down_write(&root->rwsem); } return root; } @@ -233,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct static inline void unlock_anon_vma_root(struct anon_vma *root) { if (root) - mutex_unlock(&root->mutex); + up_write(&root->rwsem); } /* @@ -269,51 +265,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) } /* - * Some rmap walk that needs to find all ptes/hugepmds without false - * negatives (like migrate and split_huge_page) running concurrent - * with operations that copy or move pagetables (like mremap() and - * fork()) to be safe. They depend on the anon_vma "same_anon_vma" - * list to be in a certain order: the dst_vma must be placed after the - * src_vma in the list. This is always guaranteed by fork() but - * mremap() needs to call this function to enforce it in case the - * dst_vma isn't newly allocated and chained with the anon_vma_clone() - * function but just an extension of a pre-existing vma through - * vma_merge. - * - * NOTE: the same_anon_vma list can still be changed by other - * processes while mremap runs because mremap doesn't hold the - * anon_vma mutex to prevent modifications to the list while it - * runs. All we need to enforce is that the relative order of this - * process vmas isn't changing (we don't care about other vmas - * order). Each vma corresponds to an anon_vma_chain structure so - * there's no risk that other processes calling anon_vma_moveto_tail() - * and changing the same_anon_vma list under mremap() will screw with - * the relative order of this process vmas in the list, because we - * they can't alter the order of any vma that belongs to this - * process. And there can't be another anon_vma_moveto_tail() running - * concurrently with mremap() coming from this process because we hold - * the mmap_sem for the whole mremap(). fork() ordering dependency - * also shouldn't be affected because fork() only cares that the - * parent vmas are placed in the list before the child vmas and - * anon_vma_moveto_tail() won't reorder vmas from either the fork() - * parent or child. - */ -void anon_vma_moveto_tail(struct vm_area_struct *dst) -{ - struct anon_vma_chain *pavc; - struct anon_vma *root = NULL; - - list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { - struct anon_vma *anon_vma = pavc->anon_vma; - VM_BUG_ON(pavc->vma != dst); - root = lock_anon_vma_root(root, anon_vma); - list_del(&pavc->same_anon_vma); - list_add_tail(&pavc->same_anon_vma, &anon_vma->head); - } - unlock_anon_vma_root(root); -} - -/* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. * Returns 0 on success, non-zero on failure. 
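anon_vma_moveto_tail() can be deleted above because the ordering it enforced on the old same_anon_vma list no longer matters: anon_vma_chain_link() now inserts each chain into an interval tree keyed by page offset, and walkers query by range instead of relying on list position. A sketch of the resulting lookup pattern (the one rmap_walk_anon() uses later in this diff), for a page that maps a single pgoff:

	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct anon_vma_chain *avc;

	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);
		/* act on the mapping of the page at (vma, address) */
	}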
@@ -355,9 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); - anon_vma_unlock(anon_vma); + anon_vma_unlock_write(anon_vma); return 0; @@ -381,13 +332,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) struct anon_vma *anon_vma = avc->anon_vma; root = lock_anon_vma_root(root, anon_vma); - list_del(&avc->same_anon_vma); + anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (list_empty(&anon_vma->head)) + if (RB_EMPTY_ROOT(&anon_vma->rb_root)) continue; list_del(&avc->same_vma); @@ -398,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() - * needing to acquire the anon_vma->root->mutex. + * needing to write-acquire the anon_vma->root->rwsem. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; @@ -414,9 +365,9 @@ static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; - mutex_init(&anon_vma->mutex); + init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); - INIT_LIST_HEAD(&anon_vma->head); + anon_vma->rb_root = RB_ROOT; } void __init anon_vma_init(void) @@ -491,7 +442,7 @@ out: * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with page_get_anon_vma() and then block on the mutex. */ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma_read(struct page *page) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -506,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = ACCESS_ONCE(anon_vma->root); - if (mutex_trylock(&root_anon_vma->mutex)) { + if (down_read_trylock(&root_anon_vma->rwsem)) { /* * If the page is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!page_mapped(page)) { - mutex_unlock(&root_anon_vma->mutex); + up_read(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; @@ -533,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); - anon_vma_lock(anon_vma); + anon_vma_lock_read(anon_vma); if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because - * we'll deadlock on the anon_vma_lock() recursion. + * we'll deadlock on the anon_vma_lock_write() recursion. */ - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } @@ -553,29 +504,33 @@ out: return anon_vma; } -void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma_read(struct anon_vma *anon_vma) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } /* * At what user virtual address is page expected in @vma? - * Returns virtual address or -EFAULT if page's index/offset is not - * within the range mapped the @vma. 
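+ * The -EFAULT convention is gone: __vma_address() computes the address
+ * unconditionally, vma_address() asserts that the page lies inside
+ * @vma, and callers that may legitimately see an out-of-range page
+ * (page_address_in_vma() and page_mapped_in_vma() below) use
+ * __vma_address() and bounds-check the result themselves.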
*/ -inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +static inline unsigned long +__vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long address; if (unlikely(is_vm_hugetlb_page(vma))) pgoff = page->index << huge_page_order(page_hstate(page)); - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { - /* page should be within @vma mapping range */ - return -EFAULT; - } + + return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +} + +inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address = __vma_address(page, vma); + + /* page should be within @vma mapping range */ + VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + return address; } @@ -585,6 +540,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { + unsigned long address; if (PageAnon(page)) { struct anon_vma *page__anon_vma = page_anon_vma(page); /* @@ -600,7 +556,31 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } else return -EFAULT; - return vma_address(page, vma); + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + return -EFAULT; + return address; +} + +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + pmd = NULL; +out: + return pmd; } /* @@ -615,29 +595,24 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) pte_t *__page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp, int sync) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; if (unlikely(PageHuge(page))) { + /* when pud is not present, pte will be NULL */ pte = huge_pte_offset(mm, address); - ptl = &mm->page_table_lock; + if (!pte) + return NULL; + + ptl = huge_pte_lockptr(page_hstate(page), mm, pte); goto check; } - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return NULL; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) + pmd = mm_find_pmd(mm, address); + if (!pmd) return NULL; - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; if (pmd_trans_huge(*pmd)) return NULL; @@ -674,8 +649,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) pte_t *pte; spinlock_t *ptl; - address = vma_address(page, vma); - if (address == -EFAULT) /* out of vma range */ + address = __vma_address(page, vma); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) return 0; pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); if (!pte) /* the page is not in this mm */ @@ -685,46 +660,47 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) return 1; } +struct page_referenced_arg { + int mapcount; + int referenced; + unsigned long vm_flags; + struct mem_cgroup *memcg; +}; /* - * Subfunctions of page_referenced: page_referenced_one called - * repeatedly from either page_referenced_anon or page_referenced_file. 
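+ * Called once per mapping vma by rmap_walk(); accumulates its result in
+ * the page_referenced_arg handed to it, returning SWAP_AGAIN to keep
+ * walking and SWAP_SUCCESS or SWAP_FAIL to cut the walk short.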
+ * arg: page_referenced_arg will be passed */ int page_referenced_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, unsigned int *mapcount, - unsigned long *vm_flags) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int referenced = 0; + struct page_referenced_arg *pra = arg; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; - spin_lock(&mm->page_table_lock); /* * rmap might return false positives; we must filter * these out using page_check_address_pmd(). */ pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG); - if (!pmd) { - spin_unlock(&mm->page_table_lock); - goto out; - } + PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); + if (!pmd) + return SWAP_AGAIN; if (vma->vm_flags & VM_LOCKED) { - spin_unlock(&mm->page_table_lock); - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; + spin_unlock(ptl); + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } else { pte_t *pte; - spinlock_t *ptl; /* * rmap might return false positives; we must filter @@ -732,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, */ pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) - goto out; + return SWAP_AGAIN; if (vma->vm_flags & VM_LOCKED) { pte_unmap_unlock(pte, ptl); - *mapcount = 0; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out; + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ } if (ptep_clear_flush_young_notify(vma, address, pte)) { @@ -749,128 +724,33 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, * mapping is already gone, the unmap path will have * set PG_referenced or activated the page. */ - if (likely(!VM_SequentialReadHint(vma))) + if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } pte_unmap_unlock(pte, ptl); } - /* Pretend the page is referenced if the task has the - swap token and is in the middle of a page fault. 
*/ - if (mm != current->mm && has_swap_token(mm) && - rwsem_is_locked(&mm->mmap_sem)) - referenced++; + if (referenced) { + pra->referenced++; + pra->vm_flags |= vma->vm_flags; + } - (*mapcount)--; + pra->mapcount--; + if (!pra->mapcount) + return SWAP_SUCCESS; /* To break the loop */ - if (referenced) - *vm_flags |= vma->vm_flags; -out: - return referenced; + return SWAP_AGAIN; } -static int page_referenced_anon(struct page *page, - struct mem_cgroup *memcg, - unsigned long *vm_flags) +static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) { - unsigned int mapcount; - struct anon_vma *anon_vma; - struct anon_vma_chain *avc; - int referenced = 0; + struct page_referenced_arg *pra = arg; + struct mem_cgroup *memcg = pra->memcg; - anon_vma = page_lock_anon_vma(page); - if (!anon_vma) - return referenced; - - mapcount = page_mapcount(page); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { - struct vm_area_struct *vma = avc->vma; - unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - referenced += page_referenced_one(page, vma, address, - &mapcount, vm_flags); - if (!mapcount) - break; - } - - page_unlock_anon_vma(anon_vma); - return referenced; -} - -/** - * page_referenced_file - referenced check for object-based rmap - * @page: the page we're checking references on. - * @memcg: target memory control group - * @vm_flags: collect encountered vma->vm_flags who actually referenced the page - * - * For an object-based mapped page, find all the places it is mapped and - * check/clear the referenced flag. This is done by following the page->mapping - * pointer, then walking the chain of vmas it holds. It returns the number - * of references it found. - * - * This function is only called from page_referenced for object-based pages. - */ -static int page_referenced_file(struct page *page, - struct mem_cgroup *memcg, - unsigned long *vm_flags) -{ - unsigned int mapcount; - struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma; - struct prio_tree_iter iter; - int referenced = 0; - - /* - * The caller's checks on page->mapping and !PageAnon have made - * sure that this is a file page: the check for page->mapping - * excludes the case just before it gets set on an anon page. - */ - BUG_ON(PageAnon(page)); - - /* - * The page lock not only makes sure that page->mapping cannot - * suddenly be NULLified by truncation, it makes sure that the - * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_mutex. - */ - BUG_ON(!PageLocked(page)); - - mutex_lock(&mapping->i_mmap_mutex); - - /* - * i_mmap_mutex does not stabilize mapcount at all, but mapcount - * is more likely to be accurate if we note it after spinning. 
- */ - mapcount = page_mapcount(page); - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; - /* - * If we are reclaiming on behalf of a cgroup, skip - * counting on behalf of references from different - * cgroups - */ - if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) - continue; - referenced += page_referenced_one(page, vma, address, - &mapcount, vm_flags); - if (!mapcount) - break; - } + if (!mm_match_cgroup(vma->vm_mm, memcg)) + return true; - mutex_unlock(&mapping->i_mmap_mutex); - return referenced; + return false; } /** @@ -888,44 +768,57 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int referenced = 0; + int ret; int we_locked = 0; + struct page_referenced_arg pra = { + .mapcount = page_mapcount(page), + .memcg = memcg, + }; + struct rmap_walk_control rwc = { + .rmap_one = page_referenced_one, + .arg = (void *)&pra, + .anon_lock = page_lock_anon_vma_read, + }; *vm_flags = 0; - if (page_mapped(page) && page_rmapping(page)) { - if (!is_locked && (!PageAnon(page) || PageKsm(page))) { - we_locked = trylock_page(page); - if (!we_locked) { - referenced++; - goto out; - } - } - if (unlikely(PageKsm(page))) - referenced += page_referenced_ksm(page, memcg, - vm_flags); - else if (PageAnon(page)) - referenced += page_referenced_anon(page, memcg, - vm_flags); - else if (page->mapping) - referenced += page_referenced_file(page, memcg, - vm_flags); - if (we_locked) - unlock_page(page); - - if (page_test_and_clear_young(page_to_pfn(page))) - referenced++; + if (!page_mapped(page)) + return 0; + + if (!page_rmapping(page)) + return 0; + + if (!is_locked && (!PageAnon(page) || PageKsm(page))) { + we_locked = trylock_page(page); + if (!we_locked) + return 1; } -out: - return referenced; + + /* + * If we are reclaiming on behalf of a cgroup, skip + * counting on behalf of references from different + * cgroups + */ + if (memcg) { + rwc.invalid_vma = invalid_page_referenced_vma; + } + + ret = rmap_walk(page, &rwc); + *vm_flags = pra.vm_flags; + + if (we_locked) + unlock_page(page); + + return pra.referenced; } static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, - unsigned long address) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; pte_t *pte; spinlock_t *ptl; int ret = 0; + int *cleaned = arg; pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) @@ -935,7 +828,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush_notify(vma, address, pte); + entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -943,48 +836,45 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } pte_unmap_unlock(pte, ptl); + + if (ret) { + mmu_notifier_invalidate_page(mm, address); + (*cleaned)++; + } out: - return ret; + return SWAP_AGAIN; } -static int page_mkclean_file(struct address_space *mapping, struct page *page) +static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma; - struct prio_tree_iter iter; - int ret = 0; - - BUG_ON(PageAnon(page)); + if (vma->vm_flags & VM_SHARED) + return false; - mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, 
&mapping->i_mmap, pgoff, pgoff) { - if (vma->vm_flags & VM_SHARED) { - unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; - ret += page_mkclean_one(page, vma, address); - } - } - mutex_unlock(&mapping->i_mmap_mutex); - return ret; + return true; } int page_mkclean(struct page *page) { - int ret = 0; + int cleaned = 0; + struct address_space *mapping; + struct rmap_walk_control rwc = { + .arg = (void *)&cleaned, + .rmap_one = page_mkclean_one, + .invalid_vma = invalid_mkclean_vma, + }; BUG_ON(!PageLocked(page)); - if (page_mapped(page)) { - struct address_space *mapping = page_mapping(page); - if (mapping) { - ret = page_mkclean_file(mapping, page); - if (page_test_and_clear_dirty(page_to_pfn(page), 1)) - ret = 1; - } - } + if (!page_mapped(page)) + return 0; - return ret; + mapping = page_mapping(page); + if (!mapping) + return 0; + + rmap_walk(page, &rwc); + + return cleaned; } EXPORT_SYMBOL_GPL(page_mkclean); @@ -1004,9 +894,9 @@ void page_move_anon_rmap(struct page *page, { struct anon_vma *anon_vma = vma->anon_vma; - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON(!anon_vma); - VM_BUG_ON(page->index != linear_page_index(vma, address)); + VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; @@ -1096,16 +986,16 @@ void do_page_add_anon_rmap(struct page *page, { int first = atomic_inc_and_test(&page->_mapcount); if (first) { - if (!PageTransHuge(page)) - __inc_zone_page_state(page, NR_ANON_PAGES); - else + if (PageTransHuge(page)) __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, + hpage_nr_pages(page)); } if (unlikely(PageKsm(page))) return; - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); /* address might be in next vma when migration races vma_adjust */ if (first) __page_set_anon_rmap(page, vma, address, exclusive); @@ -1129,14 +1019,15 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - if (!PageTransHuge(page)) - __inc_zone_page_state(page, NR_ANON_PAGES); - else + if (PageTransHuge(page)) __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, + hpage_nr_pages(page)); __page_set_anon_rmap(page, vma, address, 1); - if (page_evictable(page, vma)) - lru_cache_add_lru(page, LRU_ACTIVE_ANON); - else + if (!mlocked_vma_newpage(vma, page)) { + SetPageActive(page); + lru_cache_add(page); + } else add_page_to_unevictable_list(page); } @@ -1154,7 +1045,7 @@ void page_add_file_rmap(struct page *page) mem_cgroup_begin_update_page_stat(page, &locked, &flags); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); } mem_cgroup_end_update_page_stat(page, &locked, &flags); } @@ -1184,16 +1075,6 @@ void page_remove_rmap(struct page *page) goto out; /* - * Now that the last pte has gone, s390 must transfer dirty - * flag from storage key to struct page. We can usually skip - * this if the page is anon, so about to be freed; but perhaps - * not if it's in swapcache - there might be another pte slot - * containing the swap entry, but page not yet written to swap. 
- */ - if ((!anon || PageSwapCache(page)) && - page_test_and_clear_dirty(page_to_pfn(page), 1)) - set_page_dirty(page); - /* * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED * and not charged by memcg for now. */ @@ -1201,15 +1082,18 @@ void page_remove_rmap(struct page *page) goto out; if (anon) { mem_cgroup_uncharge_page(page); - if (!PageTransHuge(page)) - __dec_zone_page_state(page, NR_ANON_PAGES); - else + if (PageTransHuge(page)) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, + -hpage_nr_pages(page)); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_end_update_page_stat(page, &locked, &flags); } + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1219,23 +1103,24 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ + return; out: if (!anon) mem_cgroup_end_update_page_stat(page, &locked, &flags); } /* - * Subfunctions of try_to_unmap: try_to_unmap_one called - * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. + * @arg: enum ttu_flags will be passed to this argument */ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, enum ttu_flags flags) + unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; pte_t *pte; pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; + enum ttu_flags flags = (enum ttu_flags)arg; pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) @@ -1262,7 +1147,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -1272,14 +1157,27 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (!PageHuge(page)) { + if (PageAnon(page)) + dec_mm_counter(mm, MM_ANONPAGES); + else + dec_mm_counter(mm, MM_FILEPAGES); + } + set_pte_at(mm, address, pte, + swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (pte_unused(pteval)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. 
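+ * (pte_unused() is an arch hook; on most architectures it is
+ * constant false, s390 being the user this path was added for.)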
+ */ if (PageAnon(page)) dec_mm_counter(mm, MM_ANONPAGES); else dec_mm_counter(mm, MM_FILEPAGES); - set_pte_at(mm, address, pte, - swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; + pte_t swp_pte; if (PageSwapCache(page)) { /* @@ -1308,7 +1206,10 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (TTU_ACTION(flags) == TTU_MIGRATION)) { @@ -1324,6 +1225,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); + if (ret != SWAP_FAIL) + mmu_notifier_invalidate_page(mm, address); out: return ret; @@ -1334,7 +1237,7 @@ out_mlock: /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->mutex or mapping->i_mmap_mutex. + * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. @@ -1380,14 +1283,14 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, struct vm_area_struct *vma, struct page *check_page) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte; pte_t pteval; spinlock_t *ptl; struct page *page; unsigned long address; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ unsigned long end; int ret = SWAP_AGAIN; int locked_vma = 0; @@ -1399,17 +1302,13 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (end > vma->vm_end) end = vma->vm_end; - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) return ret; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return ret; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return ret; + mmun_start = address; + mmun_end = end; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, @@ -1433,9 +1332,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, BUG_ON(!page || PageAnon(page)); if (locked_vma) { - mlock_vma_page(page); /* no-op if already mlocked */ - if (page == check_page) + if (page == check_page) { + /* we know we have check_page locked */ + mlock_vma_page(page); ret = SWAP_MLOCK; + } else if (trylock_page(page)) { + /* + * If we can lock the page, perform mlock. + * Otherwise leave the page alone, it will be + * eventually encountered again later. + */ + mlock_vma_page(page); + unlock_page(page); + } continue; /* don't unmap */ } @@ -1444,11 +1353,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); + pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. 
*/ - if (page->index != linear_page_index(vma, address)) - set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); + if (page->index != linear_page_index(vma, address)) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(pteval)) + pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, address, pte, ptfile); + } /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -1460,129 +1373,25 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (locked_vma) up_read(&vma->vm_mm->mmap_sem); return ret; } -bool is_vma_temporary_stack(struct vm_area_struct *vma) +static int try_to_unmap_nonlinear(struct page *page, + struct address_space *mapping, void *arg) { - int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); - - if (!maybe_stack) - return false; - - if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == - VM_STACK_INCOMPLETE_SETUP) - return true; - - return false; -} - -/** - * try_to_unmap_anon - unmap or unlock anonymous page using the object-based - * rmap method - * @page: the page to unmap/unlock - * @flags: action and flags - * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the anon_vma struct it points to. - * - * This function is only called from try_to_unmap/try_to_munlock for - * anonymous pages. - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * 'LOCKED. - */ -static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) -{ - struct anon_vma *anon_vma; - struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; - - anon_vma = page_lock_anon_vma(page); - if (!anon_vma) - return ret; - - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { - struct vm_area_struct *vma = avc->vma; - unsigned long address; - - /* - * During exec, a temporary VMA is setup and later moved. - * The VMA is moved under the anon_vma lock but not the - * page tables leading to a race where migration cannot - * find the migration ptes. Rather than increasing the - * locking requirements of exec(), migration skips - * temporary VMAs until after exec() completes. - */ - if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && - is_vma_temporary_stack(vma)) - continue; - - address = vma_address(page, vma); - if (address == -EFAULT) - continue; - ret = try_to_unmap_one(page, vma, address, flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - break; - } - - page_unlock_anon_vma(anon_vma); - return ret; -} - -/** - * try_to_unmap_file - unmap/unlock file page using the object-based rmap method - * @page: the page to unmap/unlock - * @flags: action and flags - * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the address_space struct it points to. - * - * This function is only called from try_to_unmap/try_to_munlock for - * object-based pages. - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * 'LOCKED. 
- */ -static int try_to_unmap_file(struct page *page, enum ttu_flags flags) -{ - struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; unsigned long max_nl_size = 0; unsigned int mapcount; - mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - unsigned long address = vma_address(page, vma); - if (address == -EFAULT) - continue; - ret = try_to_unmap_one(page, vma, address, flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - goto out; - } - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto out; - - /* - * We don't bother to try to find the munlocked page in nonlinears. - * It's costly. Instead, later, page reclaim logic may call - * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. - */ - if (TTU_ACTION(flags) == TTU_MUNLOCK) - goto out; + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1592,8 +1401,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) } if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ - ret = SWAP_FAIL; - goto out; + return SWAP_FAIL; } /* @@ -1605,7 +1413,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) */ mapcount = page_mapcount(page); if (!mapcount) - goto out; + return ret; + cond_resched(); max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; @@ -1613,10 +1422,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) max_nl_cursor = CLUSTER_SIZE; do { - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + cursor = (unsigned long) vma->vm_private_data; - while ( cursor < max_nl_cursor && + while (cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { if (try_to_unmap_cluster(cursor, &mapcount, vma, page) == SWAP_MLOCK) @@ -1624,7 +1434,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) cursor += CLUSTER_SIZE; vma->vm_private_data = (void *) cursor; if ((int)mapcount <= 0) - goto out; + return ret; } vma->vm_private_data = (void *) max_nl_cursor; } @@ -1637,13 +1447,36 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. 
*/ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) vma->vm_private_data = NULL; -out: - mutex_unlock(&mapping->i_mmap_mutex); + return ret; } +bool is_vma_temporary_stack(struct vm_area_struct *vma) +{ + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); + + if (!maybe_stack) + return false; + + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == + VM_STACK_INCOMPLETE_SETUP) + return true; + + return false; +} + +static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) +{ + return is_vma_temporary_stack(vma); +} + +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped @@ -1661,16 +1494,29 @@ out: int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)flags, + .done = page_not_mapped, + .file_nonlinear = try_to_unmap_nonlinear, + .anon_lock = page_lock_anon_vma_read, + }; - BUG_ON(!PageLocked(page)); - VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); + VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); + + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. + */ + if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) + rwc.invalid_vma = invalid_migration_vma; + + ret = rmap_walk(page, &rwc); - if (unlikely(PageKsm(page))) - ret = try_to_unmap_ksm(page, flags); - else if (PageAnon(page)) - ret = try_to_unmap_anon(page, flags); - else - ret = try_to_unmap_file(page, flags); if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1693,103 +1539,166 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) */ int try_to_munlock(struct page *page) { - VM_BUG_ON(!PageLocked(page) || PageLRU(page)); + int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)TTU_MUNLOCK, + .done = page_not_mapped, + /* + * We don't bother to try to find the munlocked page in + * nonlinears. It's costly. Instead, later, page reclaim logic + * may call try_to_unmap() and recover PG_mlocked lazily. + */ + .file_nonlinear = NULL, + .anon_lock = page_lock_anon_vma_read, - if (unlikely(PageKsm(page))) - return try_to_unmap_ksm(page, TTU_MUNLOCK); - else if (PageAnon(page)) - return try_to_unmap_anon(page, TTU_MUNLOCK); - else - return try_to_unmap_file(page, TTU_MUNLOCK); + }; + + VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); + + ret = rmap_walk(page, &rwc); + return ret; } void __put_anon_vma(struct anon_vma *anon_vma) { struct anon_vma *root = anon_vma->root; + anon_vma_free(anon_vma); if (root != anon_vma && atomic_dec_and_test(&root->refcount)) anon_vma_free(root); - - anon_vma_free(anon_vma); } -#ifdef CONFIG_MIGRATION -/* - * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): - * Called by migrate.c to remove migration ptes, but might be used more later. 
- */ -static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +static struct anon_vma *rmap_walk_anon_lock(struct page *page, + struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; - struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; + + if (rwc->anon_lock) + return rwc->anon_lock(page); /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma() + * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_sem. Users without mmap_sem are required to * take a reference count to prevent the anon_vma disappearing */ anon_vma = page_anon_vma(page); if (!anon_vma) + return NULL; + + anon_vma_lock_read(anon_vma); + return anon_vma; +} + +/* + * rmap_walk_anon - do something to anonymous page using the object-based + * rmap method + * @page: the page to be handled + * @rwc: control variable according to each walk type + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. + * + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * LOCKED. + */ +static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) +{ + struct anon_vma *anon_vma; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct anon_vma_chain *avc; + int ret = SWAP_AGAIN; + + anon_vma = rmap_walk_anon_lock(page, rwc); + if (!anon_vma) return ret; - anon_vma_lock(anon_vma); - list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); - if (address == -EFAULT) + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rmap_one(page, vma, address, arg); + + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) break; + if (rwc->done && rwc->done(page)) + break; } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); return ret; } -static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +/* + * rmap_walk_file - do something to file page using the object-based rmap method + * @page: the page to be handled + * @rwc: control variable according to each walk type + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * LOCKED. 
+ */ +static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page->index << compound_order(page); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; + /* + * The page lock not only makes sure that page->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the + * structure at mapping cannot be freed and reused yet, + * so we can safely take mapping->i_mmap_mutex. + */ + VM_BUG_ON(!PageLocked(page)); + if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); - if (address == -EFAULT) + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rmap_one(page, vma, address, arg); + + ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) - break; + goto done; + if (rwc->done && rwc->done(page)) + goto done; } - /* - * No nonlinear handling: being always shared, nonlinear vmas - * never contain migration ptes. Decide what to do about this - * limitation to linear when we need rmap_walk() on nonlinear. - */ + + if (!rwc->file_nonlinear) + goto done; + + if (list_empty(&mapping->i_mmap_nonlinear)) + goto done; + + ret = rwc->file_nonlinear(page, mapping, rwc->arg); + +done: mutex_unlock(&mapping->i_mmap_mutex); return ret; } -int rmap_walk(struct page *page, int (*rmap_one)(struct page *, - struct vm_area_struct *, unsigned long, void *), void *arg) +int rmap_walk(struct page *page, struct rmap_walk_control *rwc) { - VM_BUG_ON(!PageLocked(page)); - if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rmap_one, arg); + return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rmap_one, arg); + return rmap_walk_anon(page, rwc); else - return rmap_walk_file(page, rmap_one, arg); + return rmap_walk_file(page, rwc); } -#endif /* CONFIG_MIGRATION */ #ifdef CONFIG_HUGETLB_PAGE /* diff --git a/mm/shmem.c b/mm/shmem.c index f99ff3e50bd6..9f70e02111c6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -25,11 +25,13 @@ #include <linux/init.h> #include <linux/vfs.h> #include <linux/mount.h> +#include <linux/ramfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/swap.h> +#include <linux/aio.h> static struct vfsmount *shm_mnt; @@ -43,7 +45,7 @@ static struct vfsmount *shm_mnt; #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> -#include <linux/generic_acl.h> +#include <linux/posix_acl_xattr.h> #include <linux/mman.h> #include <linux/string.h> #include <linux/slab.h> @@ -53,6 +55,7 @@ static struct vfsmount *shm_mnt; #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/percpu_counter.h> +#include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> #include <linux/swapops.h> @@ -76,11 +79,16 @@ static struct vfsmount *shm_mnt; /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 -struct shmem_xattr { - struct list_head list; /* anchored by shmem_inode_info->xattr_list */ - char *name; /* xattr name */ - size_t size; - char value[0]; +/* + * shmem_fallocate and shmem_writepage communicate via inode->i_private + * (with i_mutex making sure that it has only one user at a 
time): + * we would prefer not to enlarge the shmem inode just for that. + */ +struct shmem_falloc { + pgoff_t start; /* start of range currently being fallocated */ + pgoff_t next; /* the next page offset to be fallocated */ + pgoff_t nr_falloced; /* how many new pages have been fallocated */ + pgoff_t nr_unswapped; /* how often writepage refused to swap out */ }; /* Flag allocation requirements to shmem_getpage */ @@ -88,7 +96,8 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ - SGP_WRITE, /* may exceed i_size, may allocate page */ + SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ + SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; #ifdef CONFIG_TMPFS @@ -103,6 +112,9 @@ static unsigned long shmem_default_max_inodes(void) } #endif +static bool shmem_should_replace_page(struct page *page, gfp_t gfp); +static int shmem_replace_page(struct page **pagep, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); @@ -230,63 +242,70 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { void **pslot; - void *item = NULL; + void *item; VM_BUG_ON(!expected); + VM_BUG_ON(!replacement); pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (pslot) - item = radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock); + if (!pslot) + return -ENOENT; + item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - if (replacement) - radix_tree_replace_slot(pslot, replacement); - else - radix_tree_delete(&mapping->page_tree, index); + radix_tree_replace_slot(pslot, replacement); return 0; } /* + * Sometimes, before we decide whether to proceed or to fail, we must check + * that an entry was not already brought back from swap by a racing thread. + * + * Checking page is not enough: by the time a SwapCache page is locked, it + * might be reused, and again be SwapCache, using the same swap as before. + */ +static bool shmem_confirm_swap(struct address_space *mapping, + pgoff_t index, swp_entry_t swap) +{ + void *item; + + rcu_read_lock(); + item = radix_tree_lookup(&mapping->page_tree, index); + rcu_read_unlock(); + return item == swp_to_radix_entry(swap); +} + +/* * Like add_to_page_cache_locked, but error if expected item has gone. 
*/ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp, void *expected) { - int error = 0; + int error; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(!PageSwapBacked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + page_cache_get(page); + page->mapping = mapping; + page->index = index; + + spin_lock_irq(&mapping->tree_lock); if (!expected) - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + error = radix_tree_insert(&mapping->page_tree, index, page); + else + error = shmem_radix_tree_replace(mapping, index, expected, + page); if (!error) { - page_cache_get(page); - page->mapping = mapping; - page->index = index; - - spin_lock_irq(&mapping->tree_lock); - if (!expected) - error = radix_tree_insert(&mapping->page_tree, - index, page); - else - error = shmem_radix_tree_replace(mapping, index, - expected, page); - if (!error) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); - } else { - page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); - } - if (!expected) - radix_tree_preload_end(); + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + } else { + page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); } - if (error) - mem_cgroup_uncharge_cache_page(page); return error; } @@ -310,85 +329,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) } /* - * Like find_get_pages, but collecting swap entries as well as pages. - */ -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages, pgoff_t *indices) -{ - unsigned int i; - unsigned int ret; - unsigned int nr_found; - - rcu_read_lock(); -restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, indices, start, nr_pages); - ret = 0; - for (i = 0; i < nr_found; i++) { - struct page *page; -repeat: - page = radix_tree_deref_slot((void **)pages[i]); - if (unlikely(!page)) - continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; - /* - * Otherwise, we must be storing a swap entry - * here as an exceptional entry: so return it - * without attempting to raise page count. - */ - goto export; - } - if (!page_cache_get_speculative(page)) - goto repeat; - - /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { - page_cache_release(page); - goto repeat; - } -export: - indices[ret] = indices[i]; - pages[ret] = page; - ret++; - } - if (unlikely(!ret && nr_found)) - goto restart; - rcu_read_unlock(); - return ret; -} - -/* * Remove swap entry from radix tree, free the swap and its page cache. */ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { - int error; + void *old; spin_lock_irq(&mapping->tree_lock); - error = shmem_radix_tree_replace(mapping, index, radswap, NULL); + old = radix_tree_delete_item(&mapping->page_tree, index, radswap); spin_unlock_irq(&mapping->tree_lock); - if (!error) - free_swap_and_cache(radix_to_swp_entry(radswap)); - return error; -} - -/* - * Pagevec may contain swap entries, so shuffle up pages before releasing. 
- */ -static void shmem_deswap_pagevec(struct pagevec *pvec) -{ - int i, j; - - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) - pvec->pages[j++] = page; - } - pvec->nr = j; + if (old != radswap) + return -ENOENT; + free_swap_and_cache(radix_to_swp_entry(radswap)); + return 0; } /* @@ -409,12 +363,12 @@ void shmem_unlock_mapping(struct address_space *mapping) * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it * has finished, if it hits a row of PAGEVEC_SIZE swap entries. */ - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + PAGEVEC_SIZE, pvec.pages, indices); if (!pvec.nr) break; index = indices[pvec.nr - 1] + 1; - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(pvec.pages, pvec.nr); pagevec_release(&pvec); cond_resched(); @@ -423,28 +377,32 @@ void shmem_unlock_mapping(struct address_space *mapping) /* * Remove range of pages and swap entries from radix tree, and free them. + * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, + bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); - pgoff_t end = (lend >> PAGE_CACHE_SHIFT); + pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; + unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); + unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; long nr_swaps_freed = 0; pgoff_t index; int i; - BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + if (lend == -1) + end = -1; /* unsigned, so actually very big */ pagevec_init(&pvec, 0); index = start; - while (index <= end) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, - pvec.pages, indices); + while (index < end) { + pvec.nr = find_get_entries(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + pvec.pages, indices); if (!pvec.nr) break; mem_cgroup_uncharge_start(); @@ -452,10 +410,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct page *page = pvec.pages[i]; index = indices[i]; - if (index > end) + if (index >= end) break; if (radix_tree_exceptional_entry(page)) { + if (unfalloc) + continue; nr_swaps_freed += !shmem_free_swap(mapping, index, page); continue; @@ -463,44 +423,64 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) if (!trylock_page(page)) continue; - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + if (!unfalloc || !PageUptodate(page)) { + if (page->mapping == mapping) { + VM_BUG_ON_PAGE(PageWriteback(page), page); + truncate_inode_page(mapping, page); + } } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); index++; } - if (partial) { + if (partial_start) { struct page *page = NULL; shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); if (page) { - zero_user_segment(page, partial, PAGE_CACHE_SIZE); + unsigned int top = 
PAGE_CACHE_SIZE; + if (start > end) { + top = partial_end; + partial_end = 0; + } + zero_user_segment(page, partial_start, top); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + } + } + if (partial_end) { + struct page *page = NULL; + shmem_getpage(inode, end, &page, SGP_READ, NULL); + if (page) { + zero_user_segment(page, 0, partial_end); set_page_dirty(page); unlock_page(page); page_cache_release(page); } } + if (start >= end) + return; index = start; for ( ; ; ) { cond_resched(); - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, - pvec.pages, indices); + + pvec.nr = find_get_entries(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + pvec.pages, indices); if (!pvec.nr) { - if (index == start) + if (index == start || unfalloc) break; index = start; continue; } - if (index == start && indices[0] > end) { - shmem_deswap_pagevec(&pvec); + if ((index == start || unfalloc) && indices[0] >= end) { + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } @@ -509,23 +489,27 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct page *page = pvec.pages[i]; index = indices[i]; - if (index > end) + if (index >= end) break; if (radix_tree_exceptional_entry(page)) { + if (unfalloc) + continue; nr_swaps_freed += !shmem_free_swap(mapping, index, page); continue; } lock_page(page); - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + if (!unfalloc || !PageUptodate(page)) { + if (page->mapping == mapping) { + VM_BUG_ON_PAGE(PageWriteback(page), page); + truncate_inode_page(mapping, page); + } } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -535,7 +519,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) info->swapped -= nr_swaps_freed; shmem_recalc_inode(inode); spin_unlock(&info->lock); +} +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -567,17 +555,14 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) } setattr_copy(inode, attr); -#ifdef CONFIG_TMPFS_POSIX_ACL if (attr->ia_valid & ATTR_MODE) - error = generic_acl_chmod(inode); -#endif + error = posix_acl_chmod(inode, inode->i_mode); return error; } static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_xattr *xattr, *nxattr; if (inode->i_mapping->a_ops == &shmem_aops) { shmem_unacct_size(info->flags, inode->i_size); @@ -591,25 +576,23 @@ static void shmem_evict_inode(struct inode *inode) } else kfree(info->symlink); - list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { - kfree(xattr->name); - kfree(xattr); - } - BUG_ON(inode->i_blocks); + simple_xattrs_free(&info->xattrs); + WARN_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); - end_writeback(inode); + clear_inode(inode); } /* * If swap found in inode, free it and move page from swapcache to filecache. 
*/ static int shmem_unuse_inode(struct shmem_inode_info *info, - swp_entry_t swap, struct page *page) + swp_entry_t swap, struct page **pagep) { struct address_space *mapping = info->vfs_inode.i_mapping; void *radswap; pgoff_t index; - int error; + gfp_t gfp; + int error = 0; radswap = swp_to_radix_entry(swap); index = radix_tree_locate_item(&mapping->page_tree, radswap); @@ -625,22 +608,48 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, if (shmem_swaplist.next != &info->swaplist) list_move_tail(&shmem_swaplist, &info->swaplist); + gfp = mapping_gfp_mask(mapping); + if (shmem_should_replace_page(*pagep, gfp)) { + mutex_unlock(&shmem_swaplist_mutex); + error = shmem_replace_page(pagep, gfp, info, index); + mutex_lock(&shmem_swaplist_mutex); + /* + * We needed to drop mutex to make that restrictive page + * allocation, but the inode might have been freed while we + * dropped it: although a racing shmem_evict_inode() cannot + * complete without emptying the radix_tree, our page lock + * on this swapcache page is not enough to prevent that - + * free_swap_and_cache() of our swap entry will only + * trylock_page(), removing swap from radix_tree whatever. + * + * We must not proceed to shmem_add_to_page_cache() if the + * inode has been freed, but of course we cannot rely on + * inode or mapping or info to check that. However, we can + * safely check if our swap entry is still in use (and here + * it can't have got reused for another page): if it's still + * in use, then the inode cannot have been freed yet, and we + * can safely proceed (if it's no longer in use, that tells + * nothing about the inode, but we don't need to unuse swap). + */ + if (!page_swapcount(*pagep)) + error = -ENOENT; + } + /* * We rely on shmem_swaplist_mutex, not only to protect the swaplist, * but also to hold up shmem_evict_inode(): so inode cannot be freed * beneath us (pagelock doesn't help until the page is in pagecache). */ - error = shmem_add_to_page_cache(page, mapping, index, + if (!error) + error = shmem_add_to_page_cache(*pagep, mapping, index, GFP_NOWAIT, radswap); - /* which does mem_cgroup_uncharge_cache_page on error */ - if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which * only does trylock page: if we raced, best clean up here. */ - delete_from_swap_cache(page); - set_page_dirty(page); + delete_from_swap_cache(*pagep); + set_page_dirty(*pagep); if (!error) { spin_lock(&info->lock); info->swapped--; @@ -660,14 +669,21 @@ int shmem_unuse(swp_entry_t swap, struct page *page) struct list_head *this, *next; struct shmem_inode_info *info; int found = 0; - int error; + int error = 0; + + /* + * There's a faint possibility that swap page was replaced before + * caller locked it: caller will come back later with the right page. + */ + if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) + goto out; /* * Charge page using GFP_KERNEL while we can wait, before taking * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. 
*/ - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); + error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -676,7 +692,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); if (info->swapped) - found = shmem_unuse_inode(info, swap, page); + found = shmem_unuse_inode(info, swap, &page); else list_del_init(&info->swaplist); cond_resched(); @@ -685,8 +701,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) } mutex_unlock(&shmem_swaplist_mutex); - if (!found) - mem_cgroup_uncharge_cache_page(page); if (found < 0) error = found; out: @@ -727,6 +741,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ goto redirty; } + + /* + * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC + * value into swapfile.c, the only way we can correctly account for a + * fallocated page arriving here is now to initialize it and write it. + * + * That's okay for a page already fallocated earlier, but if we have + * not yet completed the fallocation, then (a) we want to keep track + * of this page in case we have to undo it, and (b) it may not be a + * good idea to continue anyway, once we're pushing into swap. So + * reactivate the page, and let shmem_fallocate() quit when too many. + */ + if (!PageUptodate(page)) { + if (inode->i_private) { + struct shmem_falloc *shmem_falloc; + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + index >= shmem_falloc->start && + index < shmem_falloc->next) + shmem_falloc->nr_unswapped++; + else + shmem_falloc = NULL; + spin_unlock(&inode->i_lock); + if (shmem_falloc) + goto redirty; + } + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + swap = get_swap_page(); if (!swap.val) goto redirty; @@ -777,7 +823,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ - mpol_to_str(buffer, sizeof(buffer), mpol, 1); + mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } @@ -798,35 +844,43 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct mempolicy mpol, *spol; struct vm_area_struct pvma; - - spol = mpol_cond_copy(&mpol, - mpol_shared_policy_lookup(&info->policy, index)); + struct page *page; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = index; + /* Bias interleave by inode number to distribute better across nodes */ + pvma.vm_pgoff = index + info->vfs_inode.i_ino; pvma.vm_ops = NULL; - pvma.vm_policy = spol; - return swapin_readahead(swap, gfp, &pvma, 0); + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); + + page = swapin_readahead(swap, gfp, &pvma, 0); + + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(pvma.vm_policy); + + return page; } static struct page *shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; + struct page *page; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = index; + /* Bias interleave by inode number to distribute better 
across nodes */ + pvma.vm_pgoff = index + info->vfs_inode.i_ino; pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - /* - * alloc_page_vma() will drop the shared policy reference - */ - return alloc_page_vma(gfp, &pvma, 0); + page = alloc_page_vma(gfp, &pvma, 0); + + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(pvma.vm_policy); + + return page; } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS @@ -856,6 +910,89 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) #endif /* + * When a page is moved from swapcache to shmem filecache (either by the + * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of + * shmem_unuse_inode()), it may have been read in earlier from swap, in + * ignorance of the mapping it belongs to. If that mapping has special + * constraints (like the gma500 GEM driver, which requires RAM below 4GB), + * we may need to copy to a suitable page before moving to filecache. + * + * In a future release, this may well be extended to respect cpuset and + * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); + * but for now it is a simple matter of zone. + */ +static bool shmem_should_replace_page(struct page *page, gfp_t gfp) +{ + return page_zonenum(page) > gfp_zone(gfp); +} + +static int shmem_replace_page(struct page **pagep, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct page *oldpage, *newpage; + struct address_space *swap_mapping; + pgoff_t swap_index; + int error; + + oldpage = *pagep; + swap_index = page_private(oldpage); + swap_mapping = page_mapping(oldpage); + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success by further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + newpage = shmem_alloc_page(gfp, info, index); + if (!newpage) + return -ENOMEM; + + page_cache_get(newpage); + copy_highpage(newpage, oldpage); + flush_dcache_page(newpage); + + __set_page_locked(newpage); + SetPageUptodate(newpage); + SetPageSwapBacked(newpage); + set_page_private(newpage, swap_index); + SetPageSwapCache(newpage); + + /* + * Our caller will very soon move newpage out of swapcache, but it's + * a nice clean interface for us to replace oldpage by newpage there. + */ + spin_lock_irq(&swap_mapping->tree_lock); + error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, + newpage); + if (!error) { + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + } + spin_unlock_irq(&swap_mapping->tree_lock); + + if (unlikely(error)) { + /* + * Is this possible? I think not, now that our callers check + * both PageSwapCache and page_private after getting page lock; + * but be defensive. Reverse old to newpage for clear and free. + */ + oldpage = newpage; + } else { + mem_cgroup_replace_page_cache(oldpage, newpage); + lru_cache_add_anon(newpage); + *pagep = newpage; + } + + ClearPageSwapCache(oldpage); + set_page_private(oldpage, 0); + + unlock_page(oldpage); + page_cache_release(oldpage); + page_cache_release(oldpage); + return error; +} + +/* * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. 
That's up to the @@ -872,30 +1009,33 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, swp_entry_t swap; int error; int once = 0; + int alloced = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; repeat: swap.val = 0; - page = find_lock_page(mapping, index); + page = find_lock_entry(mapping, index); if (radix_tree_exceptional_entry(page)) { swap = radix_to_swp_entry(page); page = NULL; } - if (sgp != SGP_WRITE && + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto failed; } + /* fallocated page? */ + if (page && !PageUptodate(page)) { + if (sgp != SGP_READ) + goto clear; + unlock_page(page); + page_cache_release(page); + page = NULL; + } if (page || (sgp == SGP_READ && !swap.val)) { - /* - * Once we can get the page lock, it must be uptodate: - * if there were an error in reading back from swap, - * the page would not be inserted into the filecache. - */ - BUG_ON(page && !PageUptodate(page)); *pagep = page; return 0; } @@ -923,26 +1063,43 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); + if (!PageSwapCache(page) || page_private(page) != swap.val || + !shmem_confirm_swap(mapping, index, swap)) { + error = -EEXIST; /* try again */ + goto unlock; + } if (!PageUptodate(page)) { error = -EIO; goto failed; } wait_on_page_writeback(page); - /* Someone may have already done it for us */ - if (page->mapping) { - if (page->mapping == mapping && - page->index == index) - goto done; - error = -EEXIST; - goto failed; + if (shmem_should_replace_page(page, gfp)) { + error = shmem_replace_page(&page, gfp, info, index); + if (error) + goto failed; } - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); - if (!error) + if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, swp_to_radix_entry(swap)); + /* + * We already confirmed swap under page lock, and make + * no memory allocation here, so usually no possibility + * of error; but free_swap_and_cache() only trylocks a + * page, so it is just possible that the entry has been + * truncated or holepunched since swap was confirmed. + * shmem_undo_range() will have done some of the + * unaccounting, now delete_from_swap_cache() will do + * the rest (including mem_cgroup_uncharge_swapcache). + * Reset swap.val? No, leave it so "failed" goes back to + * "repeat": reading a hole and writing should succeed. + */ + if (error) + delete_from_swap_cache(page); + } if (error) goto failed; @@ -977,13 +1134,20 @@ repeat: SetPageSwapBacked(page); __set_page_locked(page); - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); - if (!error) - error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); if (error) goto decused; + error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); + if (!error) { + error = shmem_add_to_page_cache(page, mapping, index, + gfp, NULL); + radix_tree_preload_end(); + } + if (error) { + mem_cgroup_uncharge_cache_page(page); + goto decused; + } lru_cache_add_anon(page); spin_lock(&info->lock); @@ -991,19 +1155,36 @@ repeat: inode->i_blocks += BLOCKS_PER_PAGE; shmem_recalc_inode(inode); spin_unlock(&info->lock); + alloced = true; - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + /* + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 
+ */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; +clear: + /* + * Let SGP_WRITE caller clear ends if write does not fill page; + * but SGP_FALLOC on a page fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee. + */ + if (sgp != SGP_WRITE) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } if (sgp == SGP_DIRTY) set_page_dirty(page); } -done: + /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; - goto trunc; + if (alloced) + goto trunc; + else + goto failed; } *pagep = page; return 0; @@ -1012,6 +1193,7 @@ done: * Error recovery. */ trunc: + info = SHMEM_I(inode); ClearPageDirty(page); delete_from_page_cache(page); spin_lock(&info->lock); @@ -1019,19 +1201,16 @@ trunc: inode->i_blocks -= BLOCKS_PER_PAGE; spin_unlock(&info->lock); decused: + sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) percpu_counter_add(&sbinfo->used_blocks, -1); unacct: shmem_unacct_blocks(info->flags, 1); failed: - if (swap.val && error != -EINVAL) { - struct page *test = find_get_page(mapping, index); - if (test && !radix_tree_exceptional_entry(test)) - page_cache_release(test); - /* Have another try if the entry has changed */ - if (test != swp_to_radix_entry(swap)) - error = -EEXIST; - } + if (swap.val && error != -EINVAL && + !shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; +unlock: if (page) { unlock_page(page); page_cache_release(page); @@ -1043,14 +1222,14 @@ failed: spin_unlock(&info->lock); goto repeat; } - if (error == -EEXIST) + if (error == -EEXIST) /* from above or from radix_tree_insert */ goto repeat; return error; } static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); int error; int ret = VM_FAULT_LOCKED; @@ -1068,14 +1247,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); pgoff_t index; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -1085,7 +1264,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, int shmem_lock(struct file *file, int lock, struct user_struct *user) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; @@ -1112,7 +1291,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &shmem_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -1139,7 +1317,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); - INIT_LIST_HEAD(&info->xattr_list); + simple_xattrs_init(&info->xattrs); cache_no_acl(inode); switch (mode & S_IFMT) { @@ -1174,6 +1352,11 @@ static struct inode 
*shmem_get_inode(struct super_block *sb, const struct inode return inode; } +bool shmem_mapping(struct address_space *mapping) +{ + return mapping->backing_dev_info == &shmem_backing_dev_info; +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; @@ -1204,6 +1387,14 @@ shmem_write_end(struct file *file, struct address_space *mapping, if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); + if (!PageUptodate(page)) { + if (copied < PAGE_CACHE_SIZE) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + zero_user_segments(page, 0, from, + from + copied, PAGE_CACHE_SIZE); + } + SetPageUptodate(page); + } set_page_dirty(page); unlock_page(page); page_cache_release(page); @@ -1211,13 +1402,25 @@ shmem_write_end(struct file *file, struct address_space *mapping, return copied; } -static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) +static ssize_t shmem_file_aio_read(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; enum sgp_type sgp = SGP_READ; + int error = 0; + ssize_t retval; + size_t count; + loff_t *ppos = &iocb->ki_pos; + struct iov_iter iter; + + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; + iov_iter_init(&iter, iov, nr_segs, count, 0); /* * Might this read be for a stacking filesystem? Then when reading @@ -1245,10 +1448,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ break; } - desc->error = shmem_getpage(inode, index, &page, sgp, NULL); - if (desc->error) { - if (desc->error == -EINVAL) - desc->error = 0; + error = shmem_getpage(inode, index, &page, sgp, NULL); + if (error) { + if (error == -EINVAL) + error = 0; break; } if (page) @@ -1292,61 +1495,26 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). 
*/ - ret = actor(desc, page, offset, nr); + ret = copy_page_to_iter(page, offset, nr, &iter); + retval += ret; offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; page_cache_release(page); - if (ret != nr || !desc->count) + if (!iov_iter_count(&iter)) break; - + if (ret < nr) { + error = -EFAULT; + break; + } cond_resched(); } *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - file_accessed(filp); -} - -static ssize_t shmem_file_aio_read(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg; - size_t count; - loff_t *ppos = &iocb->ki_pos; - - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; - - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_shmem_file_read(filp, ppos, &desc, file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } - return retval; + file_accessed(file); + return retval ? retval : error; } static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, @@ -1365,6 +1533,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, @@ -1384,7 +1553,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, pipe->buffers); + nr_pages = min(req_pages, spd.nr_pages_max); spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); @@ -1453,7 +1622,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); if (error > 0) { *ppos += error; @@ -1462,6 +1631,195 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } +/* + * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 
+ */ +static pgoff_t shmem_seek_hole_data(struct address_space *mapping, + pgoff_t index, pgoff_t end, int whence) +{ + struct page *page; + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + bool done = false; + int i; + + pagevec_init(&pvec, 0); + pvec.nr = 1; /* start small: we may be there already */ + while (!done) { + pvec.nr = find_get_entries(mapping, index, + pvec.nr, pvec.pages, indices); + if (!pvec.nr) { + if (whence == SEEK_DATA) + index = end; + break; + } + for (i = 0; i < pvec.nr; i++, index++) { + if (index < indices[i]) { + if (whence == SEEK_HOLE) { + done = true; + break; + } + index = indices[i]; + } + page = pvec.pages[i]; + if (page && !radix_tree_exceptional_entry(page)) { + if (!PageUptodate(page)) + page = NULL; + } + if (index >= end || + (page && whence == SEEK_DATA) || + (!page && whence == SEEK_HOLE)) { + done = true; + break; + } + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + pvec.nr = PAGEVEC_SIZE; + cond_resched(); + } + return index; +} + +static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + pgoff_t start, end; + loff_t new_offset; + + if (whence != SEEK_DATA && whence != SEEK_HOLE) + return generic_file_llseek_size(file, offset, whence, + MAX_LFS_FILESIZE, i_size_read(inode)); + mutex_lock(&inode->i_mutex); + /* We're holding i_mutex so we can access i_size directly */ + + if (offset < 0) + offset = -EINVAL; + else if (offset >= inode->i_size) + offset = -ENXIO; + else { + start = offset >> PAGE_CACHE_SHIFT; + end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + new_offset = shmem_seek_hole_data(mapping, start, end, whence); + new_offset <<= PAGE_CACHE_SHIFT; + if (new_offset > offset) { + if (new_offset < inode->i_size) + offset = new_offset; + else if (whence == SEEK_DATA) + offset = -ENXIO; + else + offset = inode->i_size; + } + } + + if (offset >= 0) + offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); + mutex_unlock(&inode->i_mutex); + return offset; +} + +static long shmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_falloc shmem_falloc; + pgoff_t start, index, end; + int error; + + mutex_lock(&inode->i_mutex); + + if (mode & FALLOC_FL_PUNCH_HOLE) { + struct address_space *mapping = file->f_mapping; + loff_t unmap_start = round_up(offset, PAGE_SIZE); + loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + shmem_truncate_range(inode, offset, offset + len - 1); + /* No need to unmap again: hole-punching leaves COWed pages */ + error = 0; + goto out; + } + + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + start = offset >> PAGE_CACHE_SHIFT; + end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* Try to avoid a swapstorm if len is impossible to satisfy */ + if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { + error = -ENOSPC; + goto out; + } + + shmem_falloc.start = start; + shmem_falloc.next = start; + shmem_falloc.nr_falloced = 0; + shmem_falloc.nr_unswapped = 0; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + + for (index = start; index < end; index++) { + struct 
page *page; + + /* + * Good, the fallocate(2) manpage permits EINTR: we may have + * been interrupted because we are using up too much memory. + */ + if (signal_pending(current)) + error = -EINTR; + else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) + error = -ENOMEM; + else + error = shmem_getpage(inode, index, &page, SGP_FALLOC, + NULL); + if (error) { + /* Remove the !PageUptodate pages we added */ + shmem_undo_range(inode, + (loff_t)start << PAGE_CACHE_SHIFT, + (loff_t)index << PAGE_CACHE_SHIFT, true); + goto undone; + } + + /* + * Inform shmem_writepage() how far we have reached. + * No need for lock or barrier: we have the page lock. + */ + shmem_falloc.next++; + if (!PageUptodate(page)) + shmem_falloc.nr_falloced++; + + /* + * If !PageUptodate, leave it that way so that freeable pages + * can be recognized if we need to rollback on error later. + * But set_page_dirty so that memory pressure will swap rather + * than free the pages we are allocating (and SGP_CACHE pages + * might still be clean: we now need to mark those dirty too). + */ + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + cond_resched(); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); + inode->i_ctime = CURRENT_TIME; +undone: + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); +out: + mutex_unlock(&inode->i_mutex); + return error; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -1494,30 +1852,49 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - } -#ifdef CONFIG_TMPFS_POSIX_ACL - error = generic_acl_init(inode, dir); - if (error) { - iput(inode); - return error; - } -#else + if (error && error != -EOPNOTSUPP) + goto out_iput; + error = 0; -#endif dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ } return error; +out_iput: + iput(inode); + return error; +} + +static int +shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + int error = -ENOSPC; + + inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); + if (inode) { + error = security_inode_init_security(inode, dir, + NULL, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) + goto out_iput; + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + d_tmpfile(dentry, inode); + } + return error; +out_iput: + iput(inode); + return error; } static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -1531,7 +1908,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { return shmem_mknod(dir, dentry, mode | S_IFREG, 0); } @@ -1665,6 +2042,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s kaddr = kmap_atomic(page); memcpy(kaddr, symname, len); kunmap_atomic(kaddr); + SetPageUptodate(page); set_page_dirty(page); 
unlock_page(page); page_cache_release(page); @@ -1711,28 +2089,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co */ /* - * Allocate new xattr and copy in the value; but leave the name to callers. - */ -static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size) -{ - struct shmem_xattr *new_xattr; - size_t len; - - /* wrap around? */ - len = sizeof(*new_xattr) + size; - if (len <= sizeof(*new_xattr)) - return NULL; - - new_xattr = kmalloc(len, GFP_KERNEL); - if (!new_xattr) - return NULL; - - new_xattr->size = size; - memcpy(new_xattr->value, value, size); - return new_xattr; -} - -/* * Callback for security_inode_init_security() for acquiring xattrs. */ static int shmem_initxattrs(struct inode *inode, @@ -1741,11 +2097,11 @@ static int shmem_initxattrs(struct inode *inode, { struct shmem_inode_info *info = SHMEM_I(inode); const struct xattr *xattr; - struct shmem_xattr *new_xattr; + struct simple_xattr *new_xattr; size_t len; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); + new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) return -ENOMEM; @@ -1762,95 +2118,16 @@ static int shmem_initxattrs(struct inode *inode, memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); - spin_lock(&info->lock); - list_add(&new_xattr->list, &info->xattr_list); - spin_unlock(&info->lock); + simple_xattr_list_add(&info->xattrs, new_xattr); } return 0; } -static int shmem_xattr_get(struct dentry *dentry, const char *name, - void *buffer, size_t size) -{ - struct shmem_inode_info *info; - struct shmem_xattr *xattr; - int ret = -ENODATA; - - info = SHMEM_I(dentry->d_inode); - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - if (strcmp(name, xattr->name)) - continue; - - ret = xattr->size; - if (buffer) { - if (size < xattr->size) - ret = -ERANGE; - else - memcpy(buffer, xattr->value, xattr->size); - } - break; - } - spin_unlock(&info->lock); - return ret; -} - -static int shmem_xattr_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_xattr *xattr; - struct shmem_xattr *new_xattr = NULL; - int err = 0; - - /* value == NULL means remove */ - if (value) { - new_xattr = shmem_xattr_alloc(value, size); - if (!new_xattr) - return -ENOMEM; - - new_xattr->name = kstrdup(name, GFP_KERNEL); - if (!new_xattr->name) { - kfree(new_xattr); - return -ENOMEM; - } - } - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - if (!strcmp(name, xattr->name)) { - if (flags & XATTR_CREATE) { - xattr = new_xattr; - err = -EEXIST; - } else if (new_xattr) { - list_replace(&xattr->list, &new_xattr->list); - } else { - list_del(&xattr->list); - } - goto out; - } - } - if (flags & XATTR_REPLACE) { - xattr = new_xattr; - err = -ENODATA; - } else { - list_add(&new_xattr->list, &info->xattr_list); - xattr = NULL; - } -out: - spin_unlock(&info->lock); - if (xattr) - kfree(xattr->name); - kfree(xattr); - return err; -} - static const struct xattr_handler *shmem_xattr_handlers[] = { #ifdef CONFIG_TMPFS_POSIX_ACL - &generic_acl_access_handler, - &generic_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; @@ -1877,6 +2154,7 @@ static int shmem_xattr_validate(const char *name) static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, void *buffer, 
size_t size) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -1891,12 +2169,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, if (err) return err; - return shmem_xattr_get(dentry, name, buffer, size); + return simple_xattr_get(&info->xattrs, name, buffer, size); } static int shmem_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -1911,15 +2190,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, if (err) return err; - if (size == 0) - value = ""; /* empty EA, do not remove */ - - return shmem_xattr_set(dentry->d_inode, name, value, size, flags); - + return simple_xattr_set(&info->xattrs, name, value, size, flags); } static int shmem_removexattr(struct dentry *dentry, const char *name) { + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); int err; /* @@ -1934,45 +2210,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) if (err) return err; - return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); -} - -static bool xattr_is_trusted(const char *name) -{ - return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + return simple_xattr_remove(&info->xattrs, name); } static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { - bool trusted = capable(CAP_SYS_ADMIN); - struct shmem_xattr *xattr; - struct shmem_inode_info *info; - size_t used = 0; - - info = SHMEM_I(dentry->d_inode); - - spin_lock(&info->lock); - list_for_each_entry(xattr, &info->xattr_list, list) { - size_t len; - - /* skip "trusted." attributes for unprivileged callers */ - if (!trusted && xattr_is_trusted(xattr->name)) - continue; - - len = strlen(xattr->name) + 1; - used += len; - if (buffer) { - if (size < used) { - used = -ERANGE; - break; - } - memcpy(buffer, xattr->name, len); - buffer += len; - } - } - spin_unlock(&info->lock); - - return used; + struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); + return simple_xattr_list(&info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ @@ -2017,12 +2261,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, { struct inode *inode; struct dentry *dentry = NULL; - u64 inum = fid->raw[2]; - inum = (inum << 32) | fid->raw[1]; + u64 inum; if (fh_len < 3) return NULL; + inum = fid->raw[2]; + inum = (inum << 32) | fid->raw[1]; + inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { @@ -2033,14 +2279,12 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, return dentry; } -static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, - int connectable) +static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, + struct inode *parent) { - struct inode *inode = dentry->d_inode; - if (*len < 3) { *len = 3; - return 255; + return FILEID_INVALID; } if (inode_unhashed(inode)) { @@ -2075,6 +2319,9 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, bool remount) { char *this_char, *value, *rest; + struct mempolicy *mpol = NULL; + uid_t uid; + gid_t gid; while (options != NULL) { this_char = options; @@ -2101,7 +2348,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, printk(KERN_ERR "tmpfs: No value for mount option '%s'\n", this_char); - return 1; + goto error; } if (!strcmp(this_char,"size")) { @@ -2134,29 +2381,40 @@ static int shmem_parse_options(char 
*options, struct shmem_sb_info *sbinfo, } else if (!strcmp(this_char,"uid")) { if (remount) continue; - sbinfo->uid = simple_strtoul(value, &rest, 0); + uid = simple_strtoul(value, &rest, 0); if (*rest) goto bad_val; + sbinfo->uid = make_kuid(current_user_ns(), uid); + if (!uid_valid(sbinfo->uid)) + goto bad_val; } else if (!strcmp(this_char,"gid")) { if (remount) continue; - sbinfo->gid = simple_strtoul(value, &rest, 0); + gid = simple_strtoul(value, &rest, 0); if (*rest) goto bad_val; + sbinfo->gid = make_kgid(current_user_ns(), gid); + if (!gid_valid(sbinfo->gid)) + goto bad_val; } else if (!strcmp(this_char,"mpol")) { - if (mpol_parse_str(value, &sbinfo->mpol, 1)) + mpol_put(mpol); + mpol = NULL; + if (mpol_parse_str(value, &mpol)) goto bad_val; } else { printk(KERN_ERR "tmpfs: Bad mount option %s\n", this_char); - return 1; + goto error; } } + sbinfo->mpol = mpol; return 0; bad_val: printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", value, this_char); +error: + mpol_put(mpol); return 1; } @@ -2168,6 +2426,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) unsigned long inodes; int error = -EINVAL; + config.mpol = NULL; if (shmem_parse_options(data, &config, true)) return error; @@ -2192,8 +2451,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; - mpol_put(sbinfo->mpol); - sbinfo->mpol = config.mpol; /* transfers initial ref */ + /* + * Preserve previous mempolicy unless mpol remount option was specified. + */ + if (config.mpol) { + mpol_put(sbinfo->mpol); + sbinfo->mpol = config.mpol; /* transfers initial ref */ + } out: spin_unlock(&sbinfo->stat_lock); return error; @@ -2210,10 +2474,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) seq_printf(seq, ",mode=%03ho", sbinfo->mode); - if (sbinfo->uid != 0) - seq_printf(seq, ",uid=%u", sbinfo->uid); - if (sbinfo->gid != 0) - seq_printf(seq, ",gid=%u", sbinfo->gid); + if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",uid=%u", + from_kuid_munged(&init_user_ns, sbinfo->uid)); + if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", + from_kgid_munged(&init_user_ns, sbinfo->gid)); shmem_show_mpol(seq, sbinfo->mpol); return 0; } @@ -2224,6 +2490,7 @@ static void shmem_put_super(struct super_block *sb) struct shmem_sb_info *sbinfo = SHMEM_SB(sb); percpu_counter_destroy(&sbinfo->used_blocks); + mpol_put(sbinfo->mpol); kfree(sbinfo); sb->s_fs_info = NULL; } @@ -2251,15 +2518,18 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. 
*/ - if (!(sb->s_flags & MS_NOUSER)) { + if (!(sb->s_flags & MS_KERNMOUNT)) { sbinfo->max_blocks = shmem_default_max_blocks(); sbinfo->max_inodes = shmem_default_max_inodes(); if (shmem_parse_options(data, sbinfo, false)) { err = -EINVAL; goto failed; } + } else { + sb->s_flags |= MS_NOUSER; } sb->s_export_op = &shmem_export_ops; + sb->s_flags |= MS_NOSEC; #else sb->s_flags |= MS_NOUSER; #endif @@ -2354,7 +2624,7 @@ static const struct address_space_operations shmem_aops = { static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS - .llseek = generic_file_llseek, + .llseek = shmem_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = shmem_file_aio_read, @@ -2362,17 +2632,18 @@ static const struct file_operations shmem_file_operations = { .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = shmem_fallocate, #endif }; static const struct inode_operations shmem_inode_operations = { .setattr = shmem_setattr, - .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, .listxattr = shmem_listxattr, .removexattr = shmem_removexattr, + .set_acl = simple_set_acl, #endif }; @@ -2387,6 +2658,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename, + .tmpfile = shmem_tmpfile, #endif #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, @@ -2396,6 +2668,7 @@ static const struct inode_operations shmem_dir_inode_operations = { #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, + .set_acl = simple_set_acl, #endif }; @@ -2408,6 +2681,7 @@ static const struct inode_operations shmem_special_inode_operations = { #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, + .set_acl = simple_set_acl, #endif }; @@ -2426,10 +2700,12 @@ static const struct super_operations shmem_ops = { static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, + .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif + .remap_pages = generic_file_remap_pages, }; static struct dentry *shmem_mount(struct file_system_type *fs_type, @@ -2443,12 +2719,17 @@ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = shmem_mount, .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, }; int __init shmem_init(void) { int error; + /* If rootfs called this, don't re-init */ + if (shmem_inode_cachep) + return 0; + error = bdi_init(&shmem_backing_dev_info); if (error) goto out4; @@ -2463,8 +2744,7 @@ int __init shmem_init(void) goto out2; } - shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, - shmem_fs_type.name, NULL); + shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); printk(KERN_ERR "Could not kern_mount tmpfs\n"); @@ -2494,12 +2774,11 @@ out4: * effectively equivalent, but much lighter weight. 
*/ -#include <linux/ramfs.h> - static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = ramfs_mount, .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, }; int __init shmem_init(void) @@ -2542,23 +2821,21 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ -/** - * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc/<pid>/maps - * @size: size to be set for the file - * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size - */ -struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +static struct dentry_operations anon_ops = { + .d_dname = simple_dname +}; + +static struct file *__shmem_file_setup(const char *name, loff_t size, + unsigned long flags, unsigned int i_flags) { - int error; - struct file *file; + struct file *res; struct inode *inode; struct path path; - struct dentry *root; + struct super_block *sb; struct qstr this; if (IS_ERR(shm_mnt)) - return (void *)shm_mnt; + return ERR_CAST(shm_mnt); if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); @@ -2566,43 +2843,68 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); - error = -ENOMEM; + res = ERR_PTR(-ENOMEM); this.name = name; this.len = strlen(name); this.hash = 0; /* will go */ - root = shm_mnt->mnt_root; - path.dentry = d_alloc(root, &this); + sb = shm_mnt->mnt_sb; + path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; + d_set_d_op(path.dentry, &anon_ops); path.mnt = mntget(shm_mnt); - error = -ENOSPC; - inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); + res = ERR_PTR(-ENOSPC); + inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) goto put_dentry; + inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); inode->i_size = size; clear_nlink(inode); /* It is unlinked */ -#ifndef CONFIG_MMU - error = ramfs_nommu_expand_for_mapping(inode, size); - if (error) + res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); + if (IS_ERR(res)) goto put_dentry; -#endif - error = -ENFILE; - file = alloc_file(&path, FMODE_WRITE | FMODE_READ, + res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); - if (!file) + if (IS_ERR(res)) goto put_dentry; - return file; + return res; put_dentry: path_put(&path); put_memory: shmem_unacct_size(flags, size); - return ERR_PTR(error); + return res; +} + +/** + * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be + * kernel internal. There will be NO LSM permission checks against the + * underlying inode. So users of this interface must do LSM checks at a + * higher layer. The one user is the big_key implementation. LSM checks + * are provided at the key level rather than the inode level. 
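+ *
+ * A minimal usage sketch (illustrative only: the name and size below are
+ * made up, and all error handling beyond the IS_ERR() check is elided):
+ *
+ *	file = shmem_kernel_file_setup("big_key payload", datalen, 0);
+ *	if (IS_ERR(file))
+ *		return PTR_ERR(file);
+ *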
+ * @name: name for dentry (to be seen in /proc/<pid>/maps) + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(name, size, flags, S_PRIVATE); +} + +/** + * shmem_file_setup - get an unlinked file living in tmpfs + * @name: name for dentry (to be seen in /proc/<pid>/maps) + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup); @@ -2623,7 +2925,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/mm/slab.c b/mm/slab.c index e901a36e2520..19d92181ce24 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -68,7 +68,7 @@ * Further notes from the original documentation: * * 11 April '97. Started multi-threading - markhe - * The global cache-chain is protected by the mutex 'cache_chain_mutex'. + * The global cache-chain is protected by the mutex 'slab_mutex'. * The sem is only needed when accessing/extending the cache-chain, which * can never happen inside an interrupt (kmem_cache_create(), * kmem_cache_shrink() and kmem_cache_reap()). @@ -117,12 +117,18 @@ #include <linux/memory.h> #include <linux/prefetch.h> +#include <net/sock.h> + #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/page.h> #include <trace/events/kmem.h> +#include "internal.h" + +#include "slab.h" + /* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). @@ -151,88 +157,22 @@ #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN #endif -/* Legal flag mask for kmem_cache_create(). */ -#if DEBUG -# define CREATE_MASK (SLAB_RED_ZONE | \ - SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | \ - SLAB_STORE_USER | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) +#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ + <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) + +#if FREELIST_BYTE_INDEX +typedef unsigned char freelist_idx_t; #else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | \ - SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ - SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) +typedef unsigned short freelist_idx_t; #endif -/* - * kmem_bufctl_t: - * - * Bufctl's are used for linking objs within a slab - * linked offsets. - * - * This implementation relies on "struct page" for locating the cache & - * slab an object belongs to. - * This allows the bufctl structure to be small (one int), but limits - * the number of objects a slab (not a cache) can contain when off-slab - * bufctls are used. The limit is the size of the largest general cache - * that does not use off-slab slabs. - * For 32bit archs with 4 kB pages, is this 56. - * This is not serious, as it is only for large objects, when it is unwise - * to have too many per slab. - * Note: This limit can be raised by introducing a general cache whose size - * is less than 512 (PAGE_SIZE<<3), but greater than 256.
- */ - -typedef unsigned int kmem_bufctl_t; -#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) -#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) -#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) -#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) - -/* - * struct slab_rcu - * - * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to - * arrange for kmem_freepages to be called via RCU. This is useful if - * we need to approach a kernel structure obliquely, from its address - * obtained without the usual locking. We can lock the structure to - * stabilize it and check it's still at the given address, only if we - * can be sure that the memory has not been meanwhile reused for some - * other kind of object (which our subsystem's lock might corrupt). - * - * rcu_read_lock before reading the address, then rcu_read_unlock after - * taking the spinlock within the structure expected at that address. - */ -struct slab_rcu { - struct rcu_head head; - struct kmem_cache *cachep; - void *addr; -}; +#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) /* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. + * true if a page was allocated from pfmemalloc reserves for network-based + * swap */ -struct slab { - union { - struct { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; - }; - struct slab_rcu __slab_cover_slab_rcu; - }; -}; +static bool pfmemalloc_active __read_mostly; /* * struct array_cache @@ -256,9 +196,30 @@ struct array_cache { * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing * the entries. + * + * Entries should not be directly dereferenced as + * entries belonging to slabs marked pfmemalloc will + * have the lower bits set SLAB_OBJ_PFMEMALLOC */ }; +#define SLAB_OBJ_PFMEMALLOC 1 +static inline bool is_obj_pfmemalloc(void *objp) +{ + return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; +} + +static inline void set_obj_pfmemalloc(void **objp) +{ + *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); + return; +} + +static inline void clear_obj_pfmemalloc(void **objp) +{ + *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); +} + /* * bootstrap: The caches do not work without cpuarrays anymore, but the * cpuarrays are allocated from the generic caches... @@ -270,68 +231,27 @@ struct arraycache_init { }; /* - * The slab lists for all objects. - */ -struct kmem_list3 { - struct list_head slabs_partial; /* partial list first, better asm code */ - struct list_head slabs_full; - struct list_head slabs_free; - unsigned long free_objects; - unsigned int free_limit; - unsigned int colour_next; /* Per-node cache coloring */ - spinlock_t list_lock; - struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ - unsigned long next_reap; /* updated without locking */ - int free_touched; /* updated without locking */ -}; - -/* * Need this for bootstrapping a per node allocator. 
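 * (Three static sets are needed while bootstrapping: one for the
 * kmem_cache cache itself, one for the array-cache kmalloc cache and one
 * for the kmem_cache_node kmalloc cache, hence the factor of 3 in
 * NUM_INIT_LISTS below.)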
*/ #define NUM_INIT_LISTS (3 * MAX_NUMNODES) -static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; +static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; #define CACHE_CACHE 0 #define SIZE_AC MAX_NUMNODES -#define SIZE_L3 (2 * MAX_NUMNODES) +#define SIZE_NODE (2 * MAX_NUMNODES) static int drain_freelist(struct kmem_cache *cache, - struct kmem_list3 *l3, int tofree); + struct kmem_cache_node *n, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); -/* - * This function must be completely optimized away if a constant is passed to - * it. Mostly the same as what is in linux/slab.h except it returns an index. - */ -static __always_inline int index_of(const size_t size) -{ - extern void __bad_size(void); - - if (__builtin_constant_p(size)) { - int i = 0; - -#define CACHE(x) \ - if (size <=x) \ - return i; \ - else \ - i++; -#include <linux/kmalloc_sizes.h> -#undef CACHE - __bad_size(); - } else - __bad_size(); - return 0; -} - static int slab_early_init = 1; -#define INDEX_AC index_of(sizeof(struct arraycache_init)) -#define INDEX_L3 index_of(sizeof(struct kmem_list3)) +#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) +#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) -static void kmem_list3_init(struct kmem_list3 *parent) +static void kmem_cache_node_init(struct kmem_cache_node *parent) { INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); @@ -347,7 +267,7 @@ static void kmem_list3_init(struct kmem_list3 *parent) #define MAKE_LIST(cachep, listp, slab, nodeid) \ do { \ INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ + list_splice(&(cachep->node[nodeid]->slab), listp); \ } while (0) #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ @@ -368,8 +288,8 @@ static void kmem_list3_init(struct kmem_list3 *parent) * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. */ -#define REAPTIMEOUT_CPUC (2*HZ) -#define REAPTIMEOUT_LIST3 (4*HZ) +#define REAPTIMEOUT_AC (2*HZ) +#define REAPTIMEOUT_NODE (4*HZ) #if STATS #define STATS_INC_ACTIVE(x) ((x)->num_active++) @@ -424,8 +344,8 @@ static void kmem_list3_init(struct kmem_list3 *parent) * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: * redzone word. * cachep->obj_offset: The real object. 
- * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address + * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] + * cachep->size - 1* BYTES_PER_WORD: last caller address * [BYTES_PER_WORD long] */ static int obj_offset(struct kmem_cache *cachep) @@ -433,11 +353,6 @@ static int obj_offset(struct kmem_cache *cachep) return cachep->obj_offset; } -static int obj_size(struct kmem_cache *cachep) -{ - return cachep->obj_size; -} - static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); @@ -449,37 +364,28 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); if (cachep->flags & SLAB_STORE_USER) - return (unsigned long long *)(objp + cachep->buffer_size - + return (unsigned long long *)(objp + cachep->size - sizeof(unsigned long long) - REDZONE_ALIGN); - return (unsigned long long *) (objp + cachep->buffer_size - + return (unsigned long long *) (objp + cachep->size - sizeof(unsigned long long)); } static void **dbg_userword(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_STORE_USER)); - return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); + return (void **)(objp + cachep->size - BYTES_PER_WORD); } #else #define obj_offset(x) 0 -#define obj_size(cachep) (cachep->buffer_size) #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) #endif -#ifdef CONFIG_TRACING -size_t slab_buffer_size(struct kmem_cache *cachep) -{ - return cachep->buffer_size; -} -EXPORT_SYMBOL(slab_buffer_size); -#endif - /* * Do not go above this order unless 0 objects fit into the slab or * overridden on the command line. @@ -489,128 +395,45 @@ EXPORT_SYMBOL(slab_buffer_size); static int slab_max_order = SLAB_MAX_ORDER_LO; static bool slab_max_order_set __initdata; -/* - * Functions for storing/retrieving the cachep and or slab from the page - * allocator. These are used to find the slab an obj belongs to. With kfree(), - * these are used to find the cache which an obj belongs to. 
- */ -static inline void page_set_cache(struct page *page, struct kmem_cache *cache) -{ - page->lru.next = (struct list_head *)cache; -} - -static inline struct kmem_cache *page_get_cache(struct page *page) -{ - page = compound_head(page); - BUG_ON(!PageSlab(page)); - return (struct kmem_cache *)page->lru.next; -} - -static inline void page_set_slab(struct page *page, struct slab *slab) -{ - page->lru.prev = (struct list_head *)slab; -} - -static inline struct slab *page_get_slab(struct page *page) -{ - BUG_ON(!PageSlab(page)); - return (struct slab *)page->lru.prev; -} - static inline struct kmem_cache *virt_to_cache(const void *obj) { struct page *page = virt_to_head_page(obj); - return page_get_cache(page); + return page->slab_cache; } -static inline struct slab *virt_to_slab(const void *obj) -{ - struct page *page = virt_to_head_page(obj); - return page_get_slab(page); -} - -static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, +static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, unsigned int idx) { - return slab->s_mem + cache->buffer_size * idx; + return page->s_mem + cache->size * idx; } /* - * We want to avoid an expensive divide : (offset / cache->buffer_size) - * Using the fact that buffer_size is a constant for a particular cache, - * we can replace (offset / cache->buffer_size) by + * We want to avoid an expensive divide : (offset / cache->size) + * Using the fact that size is a constant for a particular cache, + * we can replace (offset / cache->size) by * reciprocal_divide(offset, cache->reciprocal_buffer_size) */ static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) + const struct page *page, void *obj) { - u32 offset = (obj - slab->s_mem); + u32 offset = (obj - page->s_mem); return reciprocal_divide(offset, cache->reciprocal_buffer_size); } -/* - * These are the default caches for kmalloc. Custom caches can have other sizes. - */ -struct cache_sizes malloc_sizes[] = { -#define CACHE(x) { .cs_size = (x) }, -#include <linux/kmalloc_sizes.h> - CACHE(ULONG_MAX) -#undef CACHE -}; -EXPORT_SYMBOL(malloc_sizes); - -/* Must match cache_sizes above. Out of line to keep cache footprint low. */ -struct cache_names { - char *name; - char *name_dma; -}; - -static struct cache_names __initdata cache_names[] = { -#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, -#include <linux/kmalloc_sizes.h> - {NULL,} -#undef CACHE -}; - -static struct arraycache_init initarray_cache __initdata = - { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ -static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; -static struct kmem_cache cache_cache = { - .nodelists = cache_cache_nodelists, +static struct kmem_cache kmem_cache_boot = { .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, - .buffer_size = sizeof(struct kmem_cache), + .size = sizeof(struct kmem_cache), .name = "kmem_cache", }; #define BAD_ALIEN_MAGIC 0x01020304ul -/* - * chicken and egg problem: delay the per-cpu array allocation - * until the general caches are up. 
- */ -static enum { - NONE, - PARTIAL_AC, - PARTIAL_L3, - EARLY, - LATE, - FULL -} g_cpucache_up; - -/* - * used by boot code to determine if it can use slab based allocator - */ -int slab_is_available(void) -{ - return g_cpucache_up >= EARLY; -} - #ifdef CONFIG_LOCKDEP /* @@ -635,15 +458,15 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, int q) { struct array_cache **alc; - struct kmem_list3 *l3; + struct kmem_cache_node *n; int r; - l3 = cachep->nodelists[q]; - if (!l3) + n = cachep->node[q]; + if (!n) return; - lockdep_set_class(&l3->list_lock, l3_key); - alc = l3->alien; + lockdep_set_class(&n->list_lock, l3_key); + alc = n->alien; /* * FIXME: This check for BAD_ALIEN_MAGIC * should go away when common slab code is taught to @@ -674,23 +497,45 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) static void init_node_lock_keys(int q) { - struct cache_sizes *s = malloc_sizes; + int i; - if (g_cpucache_up < LATE) + if (slab_state < UP) return; - for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { - struct kmem_list3 *l3; + for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { + struct kmem_cache_node *n; + struct kmem_cache *cache = kmalloc_caches[i]; + + if (!cache) + continue; - l3 = s->cs_cachep->nodelists[q]; - if (!l3 || OFF_SLAB(s->cs_cachep)) + n = cache->node[q]; + if (!n || OFF_SLAB(cache)) continue; - slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key, + slab_set_lock_classes(cache, &on_slab_l3_key, &on_slab_alc_key, q); } } +static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) +{ + if (!cachep->node[q]) + return; + + slab_set_lock_classes(cachep, &on_slab_l3_key, + &on_slab_alc_key, q); +} + +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ + int node; + + VM_BUG_ON(OFF_SLAB(cachep)); + for_each_node(node) + on_slab_lock_classes_node(cachep, node); +} + static inline void init_lock_keys(void) { int node; @@ -707,6 +552,14 @@ static inline void init_lock_keys(void) { } +static inline void on_slab_lock_classes(struct kmem_cache *cachep) +{ +} + +static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) { } @@ -716,12 +569,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) } #endif -/* - * Guard access to the cache-chain. - */ -static DEFINE_MUTEX(cache_chain_mutex); -static struct list_head cache_chain; - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -729,44 +576,31 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } -static inline struct kmem_cache *__find_general_cachep(size_t size, - gfp_t gfpflags) +static int calculate_nr_objs(size_t slab_size, size_t buffer_size, + size_t idx_size, size_t align) { - struct cache_sizes *csizep = malloc_sizes; + int nr_objs; + size_t freelist_size; -#if DEBUG - /* This happens if someone tries to call - * kmem_cache_create(), or __kmalloc(), before - * the generic caches are initialized. + /* + * Ignore padding for the initial guess. The padding + * is at most @align-1 bytes, and @buffer_size is at + * least @align. In the worst case, this result will + * be one greater than the number of objects that fit + * into the memory allocation when taking the padding + * into account. 
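+ * As a worked example with illustrative numbers: a 4096-byte slab,
+ * 120-byte objects and a two-byte freelist_idx_t per object give an
+ * initial guess of 4096 / (120 + 2) = 33 objects.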
*/ - BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); -#endif - if (!size) - return ZERO_SIZE_PTR; - - while (size > csizep->cs_size) - csizep++; + nr_objs = slab_size / (buffer_size + idx_size); /* - * Really subtle: The last entry with cs->cs_size==ULONG_MAX - * has cs_{dma,}cachep==NULL. Thus no special case - * for large kmalloc calls required. + * This calculated number will be either the right + * amount, or one greater than what we want. */ -#ifdef CONFIG_ZONE_DMA - if (unlikely(gfpflags & GFP_DMA)) - return csizep->cs_dmacachep; -#endif - return csizep->cs_cachep; -} - -static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) -{ - return __find_general_cachep(size, gfpflags); -} + freelist_size = slab_size - nr_objs * buffer_size; + if (freelist_size < ALIGN(nr_objs * idx_size, align)) + nr_objs--; -static size_t slab_mgmt_size(size_t nr_objs, size_t align) -{ - return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); + return nr_objs; } /* @@ -785,8 +619,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - The struct slab - * - One kmem_bufctl_t for each object + * - One unsigned int for each object * - Padding to respect alignment of @align * - @buffer_size bytes for each object * @@ -799,37 +632,16 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, mgmt_size = 0; nr_objs = slab_size / buffer_size; - if (nr_objs > SLAB_LIMIT) - nr_objs = SLAB_LIMIT; } else { - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = (slab_size - sizeof(struct slab)) / - (buffer_size + sizeof(kmem_bufctl_t)); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size - > slab_size) - nr_objs--; - - if (nr_objs > SLAB_LIMIT) - nr_objs = SLAB_LIMIT; - - mgmt_size = slab_mgmt_size(nr_objs, align); + nr_objs = calculate_nr_objs(slab_size, buffer_size, + sizeof(freelist_idx_t), align); + mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align); } *num = nr_objs; *left_over = slab_size - nr_objs*buffer_size - mgmt_size; } +#if DEBUG #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) static void __slab_error(const char *function, struct kmem_cache *cachep, @@ -838,7 +650,9 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +#endif /* * By default on NUMA we use alien caches to stage the freeing of @@ -909,7 +723,7 @@ static void next_reap_node(void) * the CPUs getting into lockstep and contending for the global cache chain * lock. 
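 * For example, the per-cpu reap timers below are started with
 * __round_jiffies_relative(HZ, cpu), which rounds each expiry to a
 * whole second plus a small per-cpu offset, keeping the timers out of
 * phase across CPUs.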
*/ -static void __cpuinit start_cpu_timer(int cpu) +static void start_cpu_timer(int cpu) { struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); @@ -920,7 +734,7 @@ static void __cpuinit start_cpu_timer(int cpu) */ if (keventd_up() && reap_work->work.func == NULL) { init_reap_node(cpu); - INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); + INIT_DEFERRABLE_WORK(reap_work, cache_reap); schedule_delayed_work_on(cpu, reap_work, __round_jiffies_relative(HZ, cpu)); } @@ -951,6 +765,122 @@ static struct array_cache *alloc_arraycache(int node, int entries, return nc; } +static inline bool is_slab_pfmemalloc(struct page *page) +{ + return PageSlabPfmemalloc(page); +} + +/* Clears pfmemalloc_active if no slabs have pfmalloc set */ +static void recheck_pfmemalloc_active(struct kmem_cache *cachep, + struct array_cache *ac) +{ + struct kmem_cache_node *n = cachep->node[numa_mem_id()]; + struct page *page; + unsigned long flags; + + if (!pfmemalloc_active) + return; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->slabs_full, lru) + if (is_slab_pfmemalloc(page)) + goto out; + + list_for_each_entry(page, &n->slabs_partial, lru) + if (is_slab_pfmemalloc(page)) + goto out; + + list_for_each_entry(page, &n->slabs_free, lru) + if (is_slab_pfmemalloc(page)) + goto out; + + pfmemalloc_active = false; +out: + spin_unlock_irqrestore(&n->list_lock, flags); +} + +static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, + gfp_t flags, bool force_refill) +{ + int i; + void *objp = ac->entry[--ac->avail]; + + /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ + if (unlikely(is_obj_pfmemalloc(objp))) { + struct kmem_cache_node *n; + + if (gfp_pfmemalloc_allowed(flags)) { + clear_obj_pfmemalloc(&objp); + return objp; + } + + /* The caller cannot use PFMEMALLOC objects, find another one */ + for (i = 0; i < ac->avail; i++) { + /* If a !PFMEMALLOC object is found, swap them */ + if (!is_obj_pfmemalloc(ac->entry[i])) { + objp = ac->entry[i]; + ac->entry[i] = ac->entry[ac->avail]; + ac->entry[ac->avail] = objp; + return objp; + } + } + + /* + * If there are empty slabs on the slabs_free list and we are + * being forced to refill the cache, mark this one !pfmemalloc. + */ + n = cachep->node[numa_mem_id()]; + if (!list_empty(&n->slabs_free) && force_refill) { + struct page *page = virt_to_head_page(objp); + ClearPageSlabPfmemalloc(page); + clear_obj_pfmemalloc(&objp); + recheck_pfmemalloc_active(cachep, ac); + return objp; + } + + /* No !PFMEMALLOC objects available */ + ac->avail++; + objp = NULL; + } + + return objp; +} + +static inline void *ac_get_obj(struct kmem_cache *cachep, + struct array_cache *ac, gfp_t flags, bool force_refill) +{ + void *objp; + + if (unlikely(sk_memalloc_socks())) + objp = __ac_get_obj(cachep, ac, flags, force_refill); + else + objp = ac->entry[--ac->avail]; + + return objp; +} + +static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, + void *objp) +{ + if (unlikely(pfmemalloc_active)) { + /* Some pfmemalloc slabs exist, check if this is one */ + struct page *page = virt_to_head_page(objp); + if (PageSlabPfmemalloc(page)) + set_obj_pfmemalloc(&objp); + } + + return objp; +} + +static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, + void *objp) +{ + if (unlikely(sk_memalloc_socks())) + objp = __ac_put_obj(cachep, ac, objp); + + ac->entry[ac->avail++] = objp; +} + /* * Transfer objects in one arraycache to another. * Locking must be handled by the caller. 
@@ -977,7 +907,7 @@ static int transfer_objects(struct array_cache *to, #ifndef CONFIG_NUMA #define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3) do { } while (0) +#define reap_alien(cachep, n) do { } while (0) static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { @@ -1049,33 +979,33 @@ static void free_alien_cache(struct array_cache **ac_ptr) static void __drain_alien_cache(struct kmem_cache *cachep, struct array_cache *ac, int node) { - struct kmem_list3 *rl3 = cachep->nodelists[node]; + struct kmem_cache_node *n = cachep->node[node]; if (ac->avail) { - spin_lock(&rl3->list_lock); + spin_lock(&n->list_lock); /* * Stuff objects into the remote nodes shared array first. * That way we could avoid the overhead of putting the objects * into the free lists and getting them back later. */ - if (rl3->shared) - transfer_objects(rl3->shared, ac, ac->limit); + if (n->shared) + transfer_objects(n->shared, ac, ac->limit); free_block(cachep, ac->entry, ac->avail, node); ac->avail = 0; - spin_unlock(&rl3->list_lock); + spin_unlock(&n->list_lock); } } /* * Called from cache_reap() to regularly drain alien caches round robin. */ -static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) { int node = __this_cpu_read(slab_reap_node); - if (l3->alien) { - struct array_cache *ac = l3->alien[node]; + if (n->alien) { + struct array_cache *ac = n->alien[node]; if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { __drain_alien_cache(cachep, ac, node); @@ -1103,9 +1033,8 @@ static void drain_alien_cache(struct kmem_cache *cachep, static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { - struct slab *slabp = virt_to_slab(objp); - int nodeid = slabp->nodeid; - struct kmem_list3 *l3; + int nodeid = page_to_nid(virt_to_page(objp)); + struct kmem_cache_node *n; struct array_cache *alien = NULL; int node; @@ -1115,83 +1044,89 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) * Make sure we are not freeing a object from another node to the array * cache on this cpu. */ - if (likely(slabp->nodeid == node)) + if (likely(nodeid == node)) return 0; - l3 = cachep->nodelists[node]; + n = cachep->node[node]; STATS_INC_NODEFREES(cachep); - if (l3->alien && l3->alien[nodeid]) { - alien = l3->alien[nodeid]; + if (n->alien && n->alien[nodeid]) { + alien = n->alien[nodeid]; spin_lock(&alien->lock); if (unlikely(alien->avail == alien->limit)) { STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, alien, nodeid); } - alien->entry[alien->avail++] = objp; + ac_put_obj(cachep, alien, objp); spin_unlock(&alien->lock); } else { - spin_lock(&(cachep->nodelists[nodeid])->list_lock); + spin_lock(&(cachep->node[nodeid])->list_lock); free_block(cachep, &objp, 1, nodeid); - spin_unlock(&(cachep->nodelists[nodeid])->list_lock); + spin_unlock(&(cachep->node[nodeid])->list_lock); } return 1; } #endif /* - * Allocates and initializes nodelists for a node on each slab cache, used for - * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3 + * Allocates and initializes node for a node on each slab cache, used for + * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node * will be allocated off-node since memory is not yet online for the new node. 
- * When hotplugging memory or a cpu, existing nodelists are not replaced if + * When hotplugging memory or a cpu, existing node are not replaced if * already in use. * - * Must hold cache_chain_mutex. + * Must hold slab_mutex. */ -static int init_cache_nodelists_node(int node) +static int init_cache_node_node(int node) { struct kmem_cache *cachep; - struct kmem_list3 *l3; - const int memsize = sizeof(struct kmem_list3); + struct kmem_cache_node *n; + const int memsize = sizeof(struct kmem_cache_node); - list_for_each_entry(cachep, &cache_chain, next) { + list_for_each_entry(cachep, &slab_caches, list) { /* - * Set up the size64 kmemlist for cpu before we can + * Set up the kmem_cache_node for cpu before we can * begin anything. Make sure some other cpu on this * node has not already allocated this */ - if (!cachep->nodelists[node]) { - l3 = kmalloc_node(memsize, GFP_KERNEL, node); - if (!l3) + if (!cachep->node[node]) { + n = kmalloc_node(memsize, GFP_KERNEL, node); + if (!n) return -ENOMEM; - kmem_list3_init(l3); - l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + kmem_cache_node_init(n); + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; /* - * The l3s don't come and go as CPUs come and - * go. cache_chain_mutex is sufficient + * The kmem_cache_nodes don't come and go as CPUs + * come and go. slab_mutex is sufficient * protection here. */ - cachep->nodelists[node] = l3; + cachep->node[node] = n; } - spin_lock_irq(&cachep->nodelists[node]->list_lock); - cachep->nodelists[node]->free_limit = + spin_lock_irq(&cachep->node[node]->list_lock); + cachep->node[node]->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); + spin_unlock_irq(&cachep->node[node]->list_lock); } return 0; } -static void __cpuinit cpuup_canceled(long cpu) +static inline int slabs_tofree(struct kmem_cache *cachep, + struct kmem_cache_node *n) +{ + return (n->free_objects + cachep->num - 1) / cachep->num; +} + +static void cpuup_canceled(long cpu) { struct kmem_cache *cachep; - struct kmem_list3 *l3 = NULL; + struct kmem_cache_node *n = NULL; int node = cpu_to_mem(cpu); const struct cpumask *mask = cpumask_of_node(node); - list_for_each_entry(cachep, &cache_chain, next) { + list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; @@ -1199,34 +1134,34 @@ static void __cpuinit cpuup_canceled(long cpu) /* cpu is dead; no one can alloc from it. 
*/ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; - l3 = cachep->nodelists[node]; + n = cachep->node[node]; - if (!l3) + if (!n) goto free_array_cache; - spin_lock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); - /* Free limit for this kmem_list3 */ - l3->free_limit -= cachep->batchcount; + /* Free limit for this kmem_cache_node */ + n->free_limit -= cachep->batchcount; if (nc) free_block(cachep, nc->entry, nc->avail, node); if (!cpumask_empty(mask)) { - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); goto free_array_cache; } - shared = l3->shared; + shared = n->shared; if (shared) { free_block(cachep, shared->entry, shared->avail, node); - l3->shared = NULL; + n->shared = NULL; } - alien = l3->alien; - l3->alien = NULL; + alien = n->alien; + n->alien = NULL; - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); kfree(shared); if (alien) { @@ -1241,18 +1176,18 @@ free_array_cache: * the respective cache's slabs, now we can go ahead and * shrink each nodelist to its limit. */ - list_for_each_entry(cachep, &cache_chain, next) { - l3 = cachep->nodelists[node]; - if (!l3) + list_for_each_entry(cachep, &slab_caches, list) { + n = cachep->node[node]; + if (!n) continue; - drain_freelist(cachep, l3, l3->free_objects); + drain_freelist(cachep, n, slabs_tofree(cachep, n)); } } -static int __cpuinit cpuup_prepare(long cpu) +static int cpuup_prepare(long cpu) { struct kmem_cache *cachep; - struct kmem_list3 *l3 = NULL; + struct kmem_cache_node *n = NULL; int node = cpu_to_mem(cpu); int err; @@ -1260,9 +1195,9 @@ static int __cpuinit cpuup_prepare(long cpu) * We need to do this right in the beginning since * alloc_arraycache's are going to use this list. * kmalloc_node allows us to add the slab to the right - * kmem_list3 and not this cpu's kmem_list3 + * kmem_cache_node and not this cpu's kmem_cache_node */ - err = init_cache_nodelists_node(node); + err = init_cache_node_node(node); if (err < 0) goto bad; @@ -1270,7 +1205,7 @@ static int __cpuinit cpuup_prepare(long cpu) * Now we can go ahead with allocating the shared arrays and * array caches */ - list_for_each_entry(cachep, &cache_chain, next) { + list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared = NULL; struct array_cache **alien = NULL; @@ -1297,29 +1232,32 @@ static int __cpuinit cpuup_prepare(long cpu) } } cachep->array[cpu] = nc; - l3 = cachep->nodelists[node]; - BUG_ON(!l3); + n = cachep->node[node]; + BUG_ON(!n); - spin_lock_irq(&l3->list_lock); - if (!l3->shared) { + spin_lock_irq(&n->list_lock); + if (!n->shared) { /* * We are serialised from CPU_DEAD or * CPU_UP_CANCELLED by the cpucontrol lock */ - l3->shared = shared; + n->shared = shared; shared = NULL; } #ifdef CONFIG_NUMA - if (!l3->alien) { - l3->alien = alien; + if (!n->alien) { + n->alien = alien; alien = NULL; } #endif - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); kfree(shared); free_alien_cache(alien); if (cachep->flags & SLAB_DEBUG_OBJECTS) slab_set_debugobj_lock_classes_node(cachep, node); + else if (!OFF_SLAB(cachep) && + !(cachep->flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes_node(cachep, node); } init_node_lock_keys(node); @@ -1329,7 +1267,7 @@ bad: return -ENOMEM; } -static int __cpuinit cpuup_callback(struct notifier_block *nfb, +static int cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1338,9 +1276,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, switch (action) { case 
CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); err = cpuup_prepare(cpu); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -1350,7 +1288,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* - * Shutdown cache reaper. Note that the cache_chain_mutex is + * Shutdown cache reaper. Note that the slab_mutex is * held so that if cache_reap() is invoked it cannot do * anything expensive but will only modify reap_work * and reschedule the timer. @@ -1367,9 +1305,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, case CPU_DEAD_FROZEN: /* * Even if all the cpus of a node are down, we don't free the - * kmem_list3 of any cache. This to avoid a race between + * kmem_cache_node of any cache. This to avoid a race between * cpu_down, and a kmalloc allocation from another cpu for - * memory from the node of the cpu going down. The list3 + * memory from the node of the cpu going down. The node * structure is usually allocated from kmem_cache_create() and * gets destroyed at kmem_cache_destroy(). */ @@ -1377,15 +1315,15 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, #endif case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); cpuup_canceled(cpu); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); break; } return notifier_from_errno(err); } -static struct notifier_block __cpuinitdata cpucache_notifier = { +static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; @@ -1395,24 +1333,24 @@ static struct notifier_block __cpuinitdata cpucache_notifier = { * Returns -EBUSY if all objects cannot be drained so that the node is not * removed. * - * Must hold cache_chain_mutex. + * Must hold slab_mutex. 
*/ -static int __meminit drain_cache_nodelists_node(int node) +static int __meminit drain_cache_node_node(int node) { struct kmem_cache *cachep; int ret = 0; - list_for_each_entry(cachep, &cache_chain, next) { - struct kmem_list3 *l3; + list_for_each_entry(cachep, &slab_caches, list) { + struct kmem_cache_node *n; - l3 = cachep->nodelists[node]; - if (!l3) + n = cachep->node[node]; + if (!n) continue; - drain_freelist(cachep, l3, l3->free_objects); + drain_freelist(cachep, n, slabs_tofree(cachep, n)); - if (!list_empty(&l3->slabs_full) || - !list_empty(&l3->slabs_partial)) { + if (!list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial)) { ret = -EBUSY; break; } @@ -1433,14 +1371,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self, switch (action) { case MEM_GOING_ONLINE: - mutex_lock(&cache_chain_mutex); - ret = init_cache_nodelists_node(nid); - mutex_unlock(&cache_chain_mutex); + mutex_lock(&slab_mutex); + ret = init_cache_node_node(nid); + mutex_unlock(&slab_mutex); break; case MEM_GOING_OFFLINE: - mutex_lock(&cache_chain_mutex); - ret = drain_cache_nodelists_node(nid); - mutex_unlock(&cache_chain_mutex); + mutex_lock(&slab_mutex); + ret = drain_cache_node_node(nid); + mutex_unlock(&slab_mutex); break; case MEM_ONLINE: case MEM_OFFLINE: @@ -1454,64 +1392,71 @@ out: #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ /* - * swap the static kmem_list3 with kmalloced memory + * swap the static kmem_cache_node with kmalloced memory */ -static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list, +static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, int nodeid) { - struct kmem_list3 *ptr; + struct kmem_cache_node *ptr; - ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid); + ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); BUG_ON(!ptr); - memcpy(ptr, list, sizeof(struct kmem_list3)); + memcpy(ptr, list, sizeof(struct kmem_cache_node)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->list_lock); MAKE_ALL_LISTS(cachep, ptr, nodeid); - cachep->nodelists[nodeid] = ptr; + cachep->node[nodeid] = ptr; } /* - * For setting up all the kmem_list3s for cache whose buffer_size is same as - * size of kmem_list3. + * For setting up all the kmem_cache_node for caches whose buffer_size is the + * same as the size of kmem_cache_node. */ -static void __init set_up_list3s(struct kmem_cache *cachep, int index) +static void __init set_up_node(struct kmem_cache *cachep, int index) { int node; for_each_online_node(node) { - cachep->nodelists[node] = &initkmem_list3[index + node]; - cachep->nodelists[node]->next_reap = jiffies + - REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + cachep->node[node] = &init_kmem_cache_node[index + node]; + cachep->node[node]->next_reap = jiffies + + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; } } /* + * The memory after the last cpu cache pointer is used for + * the node pointer. + */ +static void setup_node_pointer(struct kmem_cache *cachep) +{ + cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; +} + +/* * Initialisation. Called after the page allocator has been initialised and * before smp_init().
*/ void __init kmem_cache_init(void) { - size_t left_over; - struct cache_sizes *sizes; - struct cache_names *names; int i; - int order; - int node; + + BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < + sizeof(struct rcu_head)); + kmem_cache = &kmem_cache_boot; + setup_node_pointer(kmem_cache); if (num_possible_nodes() == 1) use_alien_caches = 0; - for (i = 0; i < NUM_INIT_LISTS; i++) { - kmem_list3_init(&initkmem_list3[i]); - if (i < MAX_NUMNODES) - cache_cache.nodelists[i] = NULL; - } - set_up_list3s(&cache_cache, CACHE_CACHE); + for (i = 0; i < NUM_INIT_LISTS; i++) + kmem_cache_node_init(&init_kmem_cache_node[i]); + + set_up_node(kmem_cache, CACHE_CACHE); /* * Fragmentation resistance on low memory - only use bigger @@ -1523,180 +1468,119 @@ void __init kmem_cache_init(void) /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: - * 1) initialize the cache_cache cache: it contains the struct - * kmem_cache structures of all caches, except cache_cache itself: - * cache_cache is statically allocated. + * 1) initialize the kmem_cache cache: it contains the struct + * kmem_cache structures of all caches, except kmem_cache itself: + * kmem_cache is statically allocated. * Initially an __init data area is used for the head array and the - * kmem_list3 structures, it's replaced with a kmalloc allocated + * kmem_cache_node structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. * 2) Create the first kmalloc cache. * The struct kmem_cache for the new cache is allocated normally. * An __init data area is used for the head array. * 3) Create the remaining kmalloc caches, with minimally sized * head arrays. - * 4) Replace the __init data head arrays for cache_cache and the first + * 4) Replace the __init data head arrays for kmem_cache and the first * kmalloc cache with kmalloc allocated arrays. - * 5) Replace the __init data for kmem_list3 for cache_cache and + * 5) Replace the __init data for kmem_cache_node for kmem_cache and * the other cache's with kmalloc allocated memory. * 6) Resize the head arrays of the kmalloc caches to their final sizes. 
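 * (Steps 1-5 happen here in kmem_cache_init(); step 6 is finished
 * later in kmem_cache_init_late().)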
*/ - node = numa_mem_id(); - - /* 1) create the cache_cache */ - INIT_LIST_HEAD(&cache_chain); - list_add(&cache_cache.next, &cache_chain); - cache_cache.colour_off = cache_line_size(); - cache_cache.array[smp_processor_id()] = &initarray_cache.cache; - cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; + /* 1) create the kmem_cache */ /* * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + - nr_node_ids * sizeof(struct kmem_list3 *); -#if DEBUG - cache_cache.obj_size = cache_cache.buffer_size; -#endif - cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, - cache_line_size()); - cache_cache.reciprocal_buffer_size = - reciprocal_value(cache_cache.buffer_size); - - for (order = 0; order < MAX_ORDER; order++) { - cache_estimate(order, cache_cache.buffer_size, - cache_line_size(), 0, &left_over, &cache_cache.num); - if (cache_cache.num) - break; - } - BUG_ON(!cache_cache.num); - cache_cache.gfporder = order; - cache_cache.colour = left_over / cache_cache.colour_off; - cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + - sizeof(struct slab), cache_line_size()); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, array[nr_cpu_ids]) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN); + list_add(&kmem_cache->list, &slab_caches); /* 2+3) create the kmalloc caches */ - sizes = malloc_sizes; - names = cache_names; /* * Initialize the caches that provide memory for the array cache and the - * kmem_list3 structures first. Without this, further allocations will + * kmem_cache_node structures first. Without this, further allocations will * bug. */ - sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); - - if (INDEX_AC != INDEX_L3) { - sizes[INDEX_L3].cs_cachep = - kmem_cache_create(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); - } + kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", + kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); + + if (INDEX_AC != INDEX_NODE) + kmalloc_caches[INDEX_NODE] = + create_kmalloc_cache("kmalloc-node", + kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_early_init = 0; - while (sizes->cs_size != ULONG_MAX) { - /* - * For performance, all the general caches are L1 aligned. - * This should be particularly beneficial on SMP boxes, as it - * eliminates "false sharing". - * Note for systems short on memory removing the alignment will - * allow tighter packing of the smaller caches. 
- */ - if (!sizes->cs_cachep) { - sizes->cs_cachep = kmem_cache_create(names->name, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_PANIC, - NULL); - } -#ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = kmem_cache_create( - names->name_dma, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| - SLAB_PANIC, - NULL); -#endif - sizes++; - names++; - } /* 4) Replace the bootstrap head arrays */ { struct array_cache *ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, cpu_cache_get(&cache_cache), + memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - cache_cache.array[smp_processor_id()] = ptr; + kmem_cache->array[smp_processor_id()] = ptr; ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) + BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) != &initarray_generic.cache); - memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), + memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = - ptr; + kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; } - /* 5) Replace the bootstrap kmem_list3's */ + /* 5) Replace the bootstrap kmem_cache_node */ { int nid; for_each_online_node(nid) { - init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); + init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); - init_list(malloc_sizes[INDEX_AC].cs_cachep, - &initkmem_list3[SIZE_AC + nid], nid); + init_list(kmalloc_caches[INDEX_AC], + &init_kmem_cache_node[SIZE_AC + nid], nid); - if (INDEX_AC != INDEX_L3) { - init_list(malloc_sizes[INDEX_L3].cs_cachep, - &initkmem_list3[SIZE_L3 + nid], nid); + if (INDEX_AC != INDEX_NODE) { + init_list(kmalloc_caches[INDEX_NODE], + &init_kmem_cache_node[SIZE_NODE + nid], nid); } } } - g_cpucache_up = EARLY; + create_kmalloc_caches(ARCH_KMALLOC_FLAGS); } void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; - g_cpucache_up = LATE; - - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); + slab_state = UP; /* 6) resize the head arrays to their final sizes */ - mutex_lock(&cache_chain_mutex); - list_for_each_entry(cachep, &cache_chain, next) + mutex_lock(&slab_mutex); + list_for_each_entry(cachep, &slab_caches, list) if (enable_cpucache(cachep, GFP_NOWAIT)) BUG(); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); + + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); /* Done! */ - g_cpucache_up = FULL; + slab_state = FULL; /* * Register a cpu startup notifier callback that initializes @@ -1707,7 +1591,7 @@ void __init kmem_cache_init_late(void) #ifdef CONFIG_NUMA /* * Register a memory hotplug callback that initializes and frees - * nodelists. + * node. */ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); #endif @@ -1727,6 +1611,9 @@ static int __init cpucache_init(void) */ for_each_online_cpu(cpu) start_cpu_timer(cpu); + + /* Done! 
*/ + slab_state = FULL; return 0; } __initcall(cpucache_init); @@ -1734,8 +1621,8 @@ __initcall(cpucache_init); static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { - struct kmem_list3 *l3; - struct slab *slabp; + struct kmem_cache_node *n; + struct page *page; unsigned long flags; int node; @@ -1743,30 +1630,30 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", nodeid, gfpflags); printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", - cachep->name, cachep->buffer_size, cachep->gfporder); + cachep->name, cachep->size, cachep->gfporder); for_each_online_node(node) { unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; - l3 = cachep->nodelists[node]; - if (!l3) + n = cachep->node[node]; + if (!n) continue; - spin_lock_irqsave(&l3->list_lock, flags); - list_for_each_entry(slabp, &l3->slabs_full, list) { + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->slabs_full, lru) { active_objs += cachep->num; active_slabs++; } - list_for_each_entry(slabp, &l3->slabs_partial, list) { - active_objs += slabp->inuse; + list_for_each_entry(page, &n->slabs_partial, lru) { + active_objs += page->active; active_slabs++; } - list_for_each_entry(slabp, &l3->slabs_free, list) + list_for_each_entry(page, &n->slabs_free, lru) num_slabs++; - free_objects += l3->free_objects; - spin_unlock_irqrestore(&l3->list_lock, flags); + free_objects += n->free_objects; + spin_unlock_irqrestore(&n->list_lock, flags); num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -1784,21 +1671,13 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) * did not request dmaable memory, we might get it, but that * would be relatively rare and ignorable. */ -static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, + int nodeid) { struct page *page; int nr_pages; - int i; - -#ifndef CONFIG_MMU - /* - * Nommu uses slab's for process anonymous memory allocations, and thus - * requires __GFP_COMP to properly refcount higher order allocations - */ - flags |= __GFP_COMP; -#endif - flags |= cachep->gfpflags; + flags |= cachep->allocflags; if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; @@ -1809,6 +1688,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) return NULL; } + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ + if (unlikely(page->pfmemalloc)) + pfmemalloc_active = true; + nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) add_zone_page_state(page_zone(page), @@ -1816,8 +1699,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); - for (i = 0; i < nr_pages; i++) - __SetPageSlab(page + i); + __SetPageSlab(page); + if (page->pfmemalloc) + SetPageSlabPfmemalloc(page); + memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1828,17 +1713,15 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) kmemcheck_mark_unallocated_pages(page, nr_pages); } - return page_address(page); + return page; } /* * Interface to system's page release. 
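 * (The inverse of kmem_getpages() above: the slab's 2^gfporder pages go
 * back to the page allocator and the NR_SLAB_RECLAIMABLE /
 * NR_SLAB_UNRECLAIMABLE counters are decremented by the same amount.)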
*/ -static void kmem_freepages(struct kmem_cache *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - unsigned long i = (1 << cachep->gfporder); - struct page *page = virt_to_page(addr); - const unsigned long nr_freed = i; + const unsigned long nr_freed = (1 << cachep->gfporder); kmemcheck_free_shadow(page, cachep->gfporder); @@ -1848,24 +1731,28 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) else sub_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_freed); - while (i--) { - BUG_ON(!PageSlab(page)); - __ClearPageSlab(page); - page++; - } + + BUG_ON(!PageSlab(page)); + __ClearPageSlabPfmemalloc(page); + __ClearPageSlab(page); + page_mapcount_reset(page); + page->mapping = NULL; + + memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_pages((unsigned long)addr, cachep->gfporder); + __free_memcg_kmem_pages(page, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) { - struct slab_rcu *slab_rcu = (struct slab_rcu *)head; - struct kmem_cache *cachep = slab_rcu->cachep; + struct kmem_cache *cachep; + struct page *page; - kmem_freepages(cachep, slab_rcu->addr); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slab_rcu); + page = container_of(head, struct page, rcu_head); + cachep = page->slab_cache; + + kmem_freepages(cachep, page); } #if DEBUG @@ -1874,7 +1761,7 @@ static void kmem_rcu_free(struct rcu_head *head) static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, unsigned long caller) { - int size = obj_size(cachep); + int size = cachep->object_size; addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; @@ -1906,7 +1793,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) { - int size = obj_size(cachep); + int size = cachep->object_size; addr = &((char *)addr)[obj_offset(cachep)]; memset(addr, val, size); @@ -1959,14 +1846,12 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) } if (cachep->flags & SLAB_STORE_USER) { - printk(KERN_ERR "Last user: [<%p>]", - *dbg_userword(cachep, objp)); - print_symbol("(%s)", - (unsigned long)*dbg_userword(cachep, objp)); - printk("\n"); + printk(KERN_ERR "Last user: [<%p>](%pSR)\n", + *dbg_userword(cachep, objp), + *dbg_userword(cachep, objp)); } realobj = (char *)objp + obj_offset(cachep); - size = obj_size(cachep); + size = cachep->object_size; for (i = 0; i < size && lines; i += 16, lines--) { int limit; limit = 16; @@ -1983,7 +1868,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) int lines = 0; realobj = (char *)objp + obj_offset(cachep); - size = obj_size(cachep); + size = cachep->object_size; for (i = 0; i < size; i++) { char exp = POISON_FREE; @@ -2016,19 +1901,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print some data about the neighboring objects, if they * exist: */ - struct slab *slabp = virt_to_slab(objp); + struct page *page = virt_to_head_page(objp); unsigned int objnr; - objnr = obj_to_index(cachep, slabp, objp); + objnr = obj_to_index(cachep, page, objp); if (objnr) { - objp = index_to_obj(cachep, slabp, objnr - 1); + objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - 
objp = index_to_obj(cachep, slabp, objnr + 1); + objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); @@ -2039,18 +1924,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #endif #if DEBUG -static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct page *page) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slabp, i); + void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if (cachep->buffer_size % PAGE_SIZE == 0 && + if (cachep->size % PAGE_SIZE == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, 1); + cachep->size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -2068,7 +1954,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab } } #else -static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct page *page) { } #endif @@ -2076,52 +1963,42 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab /** * slab_destroy - destroy and release all objects in a slab * @cachep: cache pointer being destroyed - * @slabp: slab pointer being destroyed + * @page: page pointer being destroyed * * Destroy all the objs in a slab, and release the mem back to the system. * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. */ -static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy(struct kmem_cache *cachep, struct page *page) { - void *addr = slabp->s_mem - slabp->colouroff; + void *freelist; - slab_destroy_debugcheck(cachep, slabp); + freelist = page->freelist; + slab_destroy_debugcheck(cachep, page); if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { - struct slab_rcu *slab_rcu; + struct rcu_head *head; + + /* + * RCU free overloads the RCU head over the LRU. + * slab_page has been overloeaded over the LRU, + * however it is not used from now on so that + * we can use it safely. 
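/*
 * The slab_destroy()/kmem_rcu_free() pair in this hunk parks the RCU
 * callback head inside struct page itself (the lru space is unused by
 * then) and recovers the page with container_of(). A minimal userspace
 * rendering of that recovery step; page_model, rcu_head_model and
 * do_free() are illustrative stand-ins, not kernel types.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head_model { void (*func)(struct rcu_head_model *); };

struct page_model {
	struct rcu_head_model rcu_head;	/* overlays the idle lru links */
	void *slab_cache;		/* which cache owns this slab */
};

static void do_free(struct rcu_head_model *head)
{
	/* the same recovery kmem_rcu_free() performs */
	struct page_model *page =
		container_of(head, struct page_model, rcu_head);
	printf("freeing slab of cache %p\n", page->slab_cache);
}

int main(void)
{
	struct page_model page = { .rcu_head = { .func = do_free },
				   .slab_cache = (void *)0x1234 };
	/* a grace period later, RCU invokes the callback: */
	page.rcu_head.func(&page.rcu_head);
	return 0;
}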
+ */ + head = (void *)&page->rcu_head; + call_rcu(head, kmem_rcu_free); - slab_rcu = (struct slab_rcu *)slabp; - slab_rcu->cachep = cachep; - slab_rcu->addr = addr; - call_rcu(&slab_rcu->head, kmem_rcu_free); } else { - kmem_freepages(cachep, addr); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slabp); + kmem_freepages(cachep, page); } -} - -static void __kmem_cache_destroy(struct kmem_cache *cachep) -{ - int i; - struct kmem_list3 *l3; - - for_each_online_cpu(i) - kfree(cachep->array[i]); - /* NUMA: free the list3 structures */ - for_each_online_node(i) { - l3 = cachep->nodelists[i]; - if (l3) { - kfree(l3->shared); - free_alien_cache(l3->alien); - kfree(l3); - } - } - kmem_cache_free(&cache_cache, cachep); + /* + * From now on, we don't use freelist + * although actual page can be freed in rcu context + */ + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->freelist_cache, freelist); } - /** * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created @@ -2150,14 +2027,18 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, if (!num) continue; + /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ + if (num > SLAB_OBJ_MAX_NUM) + break; + if (flags & CFLGS_OFF_SLAB) { /* * Max number of objs-per-slab for caches which * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ - offslab_limit = size - sizeof(struct slab); - offslab_limit /= sizeof(kmem_bufctl_t); + offslab_limit = size; + offslab_limit /= sizeof(freelist_idx_t); if (num > offslab_limit) break; @@ -2194,48 +2075,57 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) { - if (g_cpucache_up == FULL) + if (slab_state >= FULL) return enable_cpucache(cachep, gfp); - if (g_cpucache_up == NONE) { + if (slab_state == DOWN) { + /* + * Note: Creation of first cache (kmem_cache). + * The setup_node is taken care + * of by the caller of __kmem_cache_create + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + slab_state = PARTIAL; + } else if (slab_state == PARTIAL) { /* - * Note: the first kmem_cache_create must create the cache + * Note: the second kmem_cache_create must create the cache * that's used by kmalloc(24), otherwise the creation of * further caches will BUG(). */ cachep->array[smp_processor_id()] = &initarray_generic.cache; /* - * If the cache that's used by kmalloc(sizeof(kmem_list3)) is - * the first cache, then we need to set up all its list3s, + * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is + * the second cache, then we need to set up all its node/, * otherwise the creation of further caches will BUG(). 
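/*
 * A compressed model of the bootstrap ordering that the setup_cpu_cache()
 * hunk above encodes with slab_state: each boot-time cache may only use
 * allocations that the previous state already made possible, which is why
 * the creation order is fixed and a wrong order would BUG(). The enum
 * names follow the patch; everything else below is an illustrative
 * stand-in, not kernel code.
 */
#include <assert.h>

enum slab_state {
	DOWN,			/* nothing yet; kmem_cache itself comes first */
	PARTIAL,		/* kmem_cache works; arraycache kmalloc next */
	PARTIAL_ARRAYCACHE,	/* can kmalloc struct arraycache_init */
	PARTIAL_NODE,		/* can kmalloc struct kmem_cache_node */
	UP,			/* create_kmalloc_caches() finished */
	FULL,			/* kmem_cache_init_late() finished */
};

int main(void)
{
	enum slab_state s = DOWN;

	s = PARTIAL;		/* __kmem_cache_create(kmem_cache, ...) */
	s = PARTIAL_ARRAYCACHE;	/* kmalloc cache sized for array_cache */
	s = PARTIAL_NODE;	/* kmalloc cache sized for kmem_cache_node */
	s = UP;			/* remaining kmalloc caches, in size order */
	s = FULL;		/* cpu arrays resized, notifiers registered */
	assert(s == FULL);
	return 0;
}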
*/ - set_up_list3s(cachep, SIZE_AC); - if (INDEX_AC == INDEX_L3) - g_cpucache_up = PARTIAL_L3; + set_up_node(cachep, SIZE_AC); + if (INDEX_AC == INDEX_NODE) + slab_state = PARTIAL_NODE; else - g_cpucache_up = PARTIAL_AC; + slab_state = PARTIAL_ARRAYCACHE; } else { + /* Remaining boot caches */ cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init), gfp); - if (g_cpucache_up == PARTIAL_AC) { - set_up_list3s(cachep, SIZE_L3); - g_cpucache_up = PARTIAL_L3; + if (slab_state == PARTIAL_ARRAYCACHE) { + set_up_node(cachep, SIZE_NODE); + slab_state = PARTIAL_NODE; } else { int node; for_each_online_node(node) { - cachep->nodelists[node] = - kmalloc_node(sizeof(struct kmem_list3), + cachep->node[node] = + kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); - BUG_ON(!cachep->nodelists[node]); - kmem_list3_init(cachep->nodelists[node]); + BUG_ON(!cachep->node[node]); + kmem_cache_node_init(cachep->node[node]); } } } - cachep->nodelists[numa_mem_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + cachep->node[numa_mem_id()]->next_reap = + jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; cpu_cache_get(cachep)->avail = 0; cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; @@ -2247,20 +2137,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } /** - * kmem_cache_create - Create a cache. - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. + * __kmem_cache_create - Create a cache. + * @cachep: cache management descriptor * @flags: SLAB flags - * @ctor: A constructor for the objects. * * Returns a ptr to the cache on success, NULL on failure. * Cannot be called within a int, but can be interrupted. * The @ctor is run when new pages are allocated by the cache. * - * @name must be valid until the cache is destroyed. This implies that - * the module calling this has to destroy the cache before getting unloaded. - * * The flags are * * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) @@ -2273,60 +2157,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ -struct kmem_cache * -kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +int +__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, slab_size, ralign; - struct kmem_cache *cachep = NULL, *pc; + size_t left_over, freelist_size, ralign; gfp_t gfp; - - /* - * Sanity checks... these are all serious usage bugs. - */ - if (!name || in_interrupt() || (size < BYTES_PER_WORD) || - size > KMALLOC_MAX_SIZE) { - printk(KERN_ERR "%s: Early error in slab %s\n", __func__, - name); - BUG(); - } - - /* - * We use cache_chain_mutex to ensure a consistent view of - * cpu_online_mask as well. Please see cpuup_callback - */ - if (slab_is_available()) { - get_online_cpus(); - mutex_lock(&cache_chain_mutex); - } - - list_for_each_entry(pc, &cache_chain, next) { - char tmp; - int res; - - /* - * This happens when the module gets unloaded and doesn't - * destroy its slab cache and no-one else reuses the vmalloc - * area of the module. Print a warning. 
- */ - res = probe_kernel_address(pc->name, tmp); - if (res) { - printk(KERN_ERR - "SLAB: cache with size %d has lost its name\n", - pc->buffer_size); - continue; - } - - if (!strcmp(pc->name, name)) { - printk(KERN_ERR - "kmem_cache_create: duplicate cache %s\n", name); - dump_stack(); - goto oops; - } - } + int err; + size_t size = cachep->size; #if DEBUG - WARN_ON(strchr(name, ' ')); /* It confuses parsers */ #if FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with @@ -2343,11 +2182,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (flags & SLAB_DESTROY_BY_RCU) BUG_ON(flags & SLAB_POISON); #endif - /* - * Always checks flags, a caller might be expecting debug support which - * isn't available. - */ - BUG_ON(flags & ~CREATE_MASK); /* * Check that size is in terms of words. This is needed to avoid @@ -2359,22 +2193,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, size &= ~(BYTES_PER_WORD - 1); } - /* calculate the final buffer alignment: */ - - /* 1) arch recommendation: can be overridden for debug */ - if (flags & SLAB_HWCACHE_ALIGN) { - /* - * Default alignment: as specified by the arch code. Except if - * an object is really small, then squeeze multiple objects into - * one cacheline. - */ - ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - } else { - ralign = BYTES_PER_WORD; - } - /* * Redzoning and user store require word alignment or possibly larger. * Note this will be overridden by architecture or caller mandated @@ -2391,13 +2209,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, size &= ~(REDZONE_ALIGN - 1); } - /* 2) arch mandated alignment */ - if (ralign < ARCH_SLAB_MINALIGN) { - ralign = ARCH_SLAB_MINALIGN; - } /* 3) caller mandated alignment */ - if (ralign < align) { - ralign = align; + if (ralign < cachep->align) { + ralign = cachep->align; } /* disable debug if necessary */ if (ralign > __alignof__(unsigned long long)) @@ -2405,21 +2219,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* * 4) Store it. */ - align = ralign; + cachep->align = ralign; if (slab_is_available()) gfp = GFP_KERNEL; else gfp = GFP_NOWAIT; - /* Get cache's description obj. */ - cachep = kmem_cache_zalloc(&cache_cache, gfp); - if (!cachep) - goto oops; - - cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; + setup_node_pointer(cachep); #if DEBUG - cachep->obj_size = size; /* * Both debugging options require word-alignment which is calculated @@ -2441,9 +2249,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); + if (size >= kmalloc_size(INDEX_NODE + 1) + && cachep->object_size > cache_line_size() + && ALIGN(size, cachep->align) < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); size = PAGE_SIZE; } #endif @@ -2455,7 +2264,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * it too early on. 
Always use on-slab management when * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && + if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj @@ -2463,33 +2272,34 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ flags |= CFLGS_OFF_SLAB; - size = ALIGN(size, align); + size = ALIGN(size, cachep->align); + /* + * We should restrict the number of objects in a slab to implement + * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. + */ + if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) + size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); - left_over = calculate_slab_order(cachep, size, align, flags); + left_over = calculate_slab_order(cachep, size, cachep->align, flags); - if (!cachep->num) { - printk(KERN_ERR - "kmem_cache_create: couldn't create cache %s.\n", name); - kmem_cache_free(&cache_cache, cachep); - cachep = NULL; - goto oops; - } - slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) - + sizeof(struct slab), align); + if (!cachep->num) + return -E2BIG; + + freelist_size = + ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align); /* * If the slab has been placed off-slab, and we have enough space then * move it on-slab. This is at the expense of any extra colouring. */ - if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { + if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { flags &= ~CFLGS_OFF_SLAB; - left_over -= slab_size; + left_over -= freelist_size; } if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - slab_size = - cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); + freelist_size = cachep->num * sizeof(freelist_idx_t); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2503,35 +2313,33 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ - if (cachep->colour_off < align) - cachep->colour_off = align; + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; cachep->colour = left_over / cachep->colour_off; - cachep->slab_size = slab_size; + cachep->freelist_size = freelist_size; cachep->flags = flags; - cachep->gfpflags = 0; + cachep->allocflags = __GFP_COMP; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) - cachep->gfpflags |= GFP_DMA; - cachep->buffer_size = size; + cachep->allocflags |= GFP_DMA; + cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { - cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); + cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); /* - * This is a possibility for one of the malloc_sizes caches. + * This is a possibility for one of the kmalloc_{dma,}_caches. * But since we go off slab only for object size greater than - * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, - * this should not happen at all. + * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created + * in ascending order,this should not happen at all. * But leave a BUG_ON for some lucky dude. 
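/*
 * Rough userspace model of the sizing arithmetic behind the
 * calculate_slab_order()/freelist_size hunks above, once the freelist
 * becomes a plain array of freelist_idx_t: every on-slab object costs
 * its size plus one index entry, the index array is aligned up, and the
 * remainder feeds cache colouring. A sketch of the idea only, not the
 * kernel's exact cache_estimate(); note a byte-sized index is also why
 * the patch caps objects per slab at SLAB_OBJ_MAX_NUM.
 */
#include <stddef.h>
#include <stdio.h>

typedef unsigned char freelist_idx_t;

static size_t aligned(size_t n, size_t align)
{
	return (n + align - 1) & ~(align - 1);
}

static size_t objs_per_slab(size_t slab_bytes, size_t obj_size,
			    size_t align, size_t *left_over)
{
	/* first guess, then shrink until objects + index array fit */
	size_t num = slab_bytes / (obj_size + sizeof(freelist_idx_t));

	while (num && num * obj_size +
	       aligned(num * sizeof(freelist_idx_t), align) > slab_bytes)
		num--;
	*left_over = slab_bytes - num * obj_size
		     - aligned(num * sizeof(freelist_idx_t), align);
	return num;
}

int main(void)
{
	size_t left;
	size_t num = objs_per_slab(4096, 192, 64, &left);

	printf("%zu objects per 4K slab, %zu bytes left for colour\n",
	       num, left);
	return 0;
}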
*/ - BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); + BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); } - cachep->ctor = ctor; - cachep->name = name; - if (setup_cpu_cache(cachep, gfp)) { - __kmem_cache_destroy(cachep); - cachep = NULL; - goto oops; + err = setup_cpu_cache(cachep, gfp); + if (err) { + __kmem_cache_shutdown(cachep); + return err; } if (flags & SLAB_DEBUG_OBJECTS) { @@ -2542,21 +2350,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); slab_set_debugobj_lock_classes(cachep); - } + } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) + on_slab_lock_classes(cachep); - /* cache setup completed, link it into the list */ - list_add(&cachep->next, &cache_chain); -oops: - if (!cachep && (flags & SLAB_PANIC)) - panic("kmem_cache_create(): failed to create slab `%s'\n", - name); - if (slab_is_available()) { - mutex_unlock(&cache_chain_mutex); - put_online_cpus(); - } - return cachep; + return 0; } -EXPORT_SYMBOL(kmem_cache_create); #if DEBUG static void check_irq_off(void) @@ -2573,7 +2371,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock); + assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); #endif } @@ -2581,7 +2379,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->nodelists[node]->list_lock); + assert_spin_locked(&cachep->node[node]->list_lock); #endif } @@ -2592,7 +2390,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, struct array_cache *ac, int force, int node); @@ -2604,29 +2402,29 @@ static void do_drain(void *arg) check_irq_off(); ac = cpu_cache_get(cachep); - spin_lock(&cachep->nodelists[node]->list_lock); + spin_lock(&cachep->node[node]->list_lock); free_block(cachep, ac->entry, ac->avail, node); - spin_unlock(&cachep->nodelists[node]->list_lock); + spin_unlock(&cachep->node[node]->list_lock); ac->avail = 0; } static void drain_cpu_caches(struct kmem_cache *cachep) { - struct kmem_list3 *l3; + struct kmem_cache_node *n; int node; on_each_cpu(do_drain, cachep, 1); check_irq_on(); for_each_online_node(node) { - l3 = cachep->nodelists[node]; - if (l3 && l3->alien) - drain_alien_cache(cachep, l3->alien); + n = cachep->node[node]; + if (n && n->alien) + drain_alien_cache(cachep, n->alien); } for_each_online_node(node) { - l3 = cachep->nodelists[node]; - if (l3) - drain_array(cachep, l3, l3->shared, 1, node); + n = cachep->node[node]; + if (n) + drain_array(cachep, n, n->shared, 1, node); } } @@ -2637,58 +2435,58 @@ static void drain_cpu_caches(struct kmem_cache *cachep) * Returns the actual number of slabs released. 
*/ static int drain_freelist(struct kmem_cache *cache, - struct kmem_list3 *l3, int tofree) + struct kmem_cache_node *n, int tofree) { struct list_head *p; int nr_freed; - struct slab *slabp; + struct page *page; nr_freed = 0; - while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { + while (nr_freed < tofree && !list_empty(&n->slabs_free)) { - spin_lock_irq(&l3->list_lock); - p = l3->slabs_free.prev; - if (p == &l3->slabs_free) { - spin_unlock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); + p = n->slabs_free.prev; + if (p == &n->slabs_free) { + spin_unlock_irq(&n->list_lock); goto out; } - slabp = list_entry(p, struct slab, list); + page = list_entry(p, struct page, lru); #if DEBUG - BUG_ON(slabp->inuse); + BUG_ON(page->active); #endif - list_del(&slabp->list); + list_del(&page->lru); /* * Safe to drop the lock. The slab is no longer linked * to the cache. */ - l3->free_objects -= cache->num; - spin_unlock_irq(&l3->list_lock); - slab_destroy(cache, slabp); + n->free_objects -= cache->num; + spin_unlock_irq(&n->list_lock); + slab_destroy(cache, page); nr_freed++; } out: return nr_freed; } -/* Called with cache_chain_mutex held to protect against cpu hotplug */ +/* Called with slab_mutex held to protect against cpu hotplug */ static int __cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; - struct kmem_list3 *l3; + struct kmem_cache_node *n; drain_cpu_caches(cachep); check_irq_on(); for_each_online_node(i) { - l3 = cachep->nodelists[i]; - if (!l3) + n = cachep->node[i]; + if (!n) continue; - drain_freelist(cachep, l3, l3->free_objects); + drain_freelist(cachep, n, slabs_tofree(cachep, n)); - ret += !list_empty(&l3->slabs_full) || - !list_empty(&l3->slabs_partial); + ret += !list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial); } return (ret ? 1 : 0); } @@ -2706,113 +2504,92 @@ int kmem_cache_shrink(struct kmem_cache *cachep) BUG_ON(!cachep || in_interrupt()); get_online_cpus(); - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); ret = __cache_shrink(cachep); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); put_online_cpus(); return ret; } EXPORT_SYMBOL(kmem_cache_shrink); -/** - * kmem_cache_destroy - delete a cache - * @cachep: the cache to destroy - * - * Remove a &struct kmem_cache object from the slab cache. - * - * It is expected this function will be called by a module when it is - * unloaded. This will remove the cache completely, and avoid a duplicate - * cache being allocated each time a module is loaded and unloaded, if the - * module doesn't have persistent in-kernel storage across loads and unloads. - * - * The cache must be empty before calling this function. - * - * The caller must guarantee that no one will allocate memory from the cache - * during the kmem_cache_destroy(). - */ -void kmem_cache_destroy(struct kmem_cache *cachep) +int __kmem_cache_shutdown(struct kmem_cache *cachep) { - BUG_ON(!cachep || in_interrupt()); + int i; + struct kmem_cache_node *n; + int rc = __cache_shrink(cachep); - /* Find the cache in the chain of caches. 
*/ - get_online_cpus(); - mutex_lock(&cache_chain_mutex); - /* - * the chain is never empty, cache_cache is never destroyed - */ - list_del(&cachep->next); - if (__cache_shrink(cachep)) { - slab_error(cachep, "Can't free all objects"); - list_add(&cachep->next, &cache_chain); - mutex_unlock(&cache_chain_mutex); - put_online_cpus(); - return; - } + if (rc) + return rc; - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) - rcu_barrier(); + for_each_online_cpu(i) + kfree(cachep->array[i]); - __kmem_cache_destroy(cachep); - mutex_unlock(&cache_chain_mutex); - put_online_cpus(); + /* NUMA: free the node structures */ + for_each_online_node(i) { + n = cachep->node[i]; + if (n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + } + } + return 0; } -EXPORT_SYMBOL(kmem_cache_destroy); /* * Get the memory for a slab management obj. - * For a slab cache when the slab descriptor is off-slab, slab descriptors - * always come from malloc_sizes caches. The slab descriptor cannot - * come from the same cache which is getting created because, - * when we are searching for an appropriate cache for these - * descriptors in kmem_cache_create, we search through the malloc_sizes array. - * If we are creating a malloc_sizes cache here it would not be visible to - * kmem_find_general_cachep till the initialization is complete. - * Hence we cannot have slabp_cache same as the original cache. + * + * For a slab cache when the slab descriptor is off-slab, the + * slab descriptor can't come from the same cache which is being created, + * Because if it is the case, that means we defer the creation of + * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. + * And we eventually call down to __kmem_cache_create(), which + * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one. + * This is a "chicken-and-egg" problem. + * + * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, + * which are all initialized during kmem_cache_init(). */ -static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, - int colour_off, gfp_t local_flags, - int nodeid) +static void *alloc_slabmgmt(struct kmem_cache *cachep, + struct page *page, int colour_off, + gfp_t local_flags, int nodeid) { - struct slab *slabp; + void *freelist; + void *addr = page_address(page); if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ - slabp = kmem_cache_alloc_node(cachep->slabp_cache, + freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); - /* - * If the first object in the slab is leaked (it's allocated - * but no one has a reference to it), we want to make sure - * kmemleak does not treat the ->s_mem pointer as a reference - * to the object. Otherwise we will not report the leak. 
- */ - kmemleak_scan_area(&slabp->list, sizeof(struct list_head), - local_flags); - if (!slabp) + if (!freelist) return NULL; } else { - slabp = objp + colour_off; - colour_off += cachep->slab_size; + freelist = addr + colour_off; + colour_off += cachep->freelist_size; } - slabp->inuse = 0; - slabp->colouroff = colour_off; - slabp->s_mem = objp + colour_off; - slabp->nodeid = nodeid; - slabp->free = 0; - return slabp; + page->active = 0; + page->s_mem = addr + colour_off; + return freelist; } -static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) { - return (kmem_bufctl_t *) (slabp + 1); + return ((freelist_idx_t *)page->freelist)[idx]; +} + +static inline void set_free_obj(struct page *page, + unsigned int idx, freelist_idx_t val) +{ + ((freelist_idx_t *)(page->freelist))[idx] = val; } static void cache_init_objs(struct kmem_cache *cachep, - struct slab *slabp) + struct page *page) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slabp, i); + void *objp = index_to_obj(cachep, page, i); #if DEBUG /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) @@ -2840,64 +2617,63 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->buffer_size % PAGE_SIZE) == 0 && + if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, 0); + cachep->size / PAGE_SIZE, 0); #else if (cachep->ctor) cachep->ctor(objp); #endif - slab_bufctl(slabp)[i] = i + 1; + set_free_obj(page, i, i); } - slab_bufctl(slabp)[i - 1] = BUFCTL_END; } static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { if (CONFIG_ZONE_DMA_FLAG) { if (flags & GFP_DMA) - BUG_ON(!(cachep->gfpflags & GFP_DMA)); + BUG_ON(!(cachep->allocflags & GFP_DMA)); else - BUG_ON(cachep->gfpflags & GFP_DMA); + BUG_ON(cachep->allocflags & GFP_DMA); } } -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, int nodeid) { - void *objp = index_to_obj(cachep, slabp, slabp->free); - kmem_bufctl_t next; + void *objp; - slabp->inuse++; - next = slab_bufctl(slabp)[slabp->free]; + objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); + page->active++; #if DEBUG - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; - WARN_ON(slabp->nodeid != nodeid); + WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); #endif - slabp->free = next; return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, +static void slab_put_obj(struct kmem_cache *cachep, struct page *page, void *objp, int nodeid) { - unsigned int objnr = obj_to_index(cachep, slabp, objp); - + unsigned int objnr = obj_to_index(cachep, page, objp); #if DEBUG + unsigned int i; + /* Verify that the slab belongs to the intended node */ - WARN_ON(slabp->nodeid != nodeid); + WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); - if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); - BUG(); + /* Verify double free bug */ + for (i = page->active; i < cachep->num; i++) { + if (get_free_obj(page, i) == objnr) { + printk(KERN_ERR "slab: double free detected in cache " + "'%s', objp %p\n", cachep->name, objp); + BUG(); + } } #endif - slab_bufctl(slabp)[objnr] = slabp->free; - 
slabp->free = objnr; - slabp->inuse--; + page->active--; + set_free_obj(page, page->active, objnr); } /* @@ -2905,23 +2681,11 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, * for the slab allocator to be able to lookup the cache and slab of a * virtual address for kfree, ksize, and slab debugging. */ -static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, - void *addr) +static void slab_map_pages(struct kmem_cache *cache, struct page *page, + void *freelist) { - int nr_pages; - struct page *page; - - page = virt_to_page(addr); - - nr_pages = 1; - if (likely(!PageCompound(page))) - nr_pages <<= cache->gfporder; - - do { - page_set_cache(page, cache); - page_set_slab(page, slab); - page++; - } while (--nr_pages); + page->slab_cache = cache; + page->freelist = freelist; } /* @@ -2929,12 +2693,12 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, * kmem_cache_alloc() when there are no active objs left in a cache. */ static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, void *objp) + gfp_t flags, int nodeid, struct page *page) { - struct slab *slabp; + void *freelist; size_t offset; gfp_t local_flags; - struct kmem_list3 *l3; + struct kmem_cache_node *n; /* * Be lazy and only check for valid flags here, keeping it out of the @@ -2943,17 +2707,17 @@ static int cache_grow(struct kmem_cache *cachep, BUG_ON(flags & GFP_SLAB_BUG_MASK); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - /* Take the l3 list lock to change the colour_next on this node */ + /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - l3 = cachep->nodelists[nodeid]; - spin_lock(&l3->list_lock); + n = cachep->node[nodeid]; + spin_lock(&n->list_lock); /* Get colour for the slab, and cal the next value. */ - offset = l3->colour_next; - l3->colour_next++; - if (l3->colour_next >= cachep->colour) - l3->colour_next = 0; - spin_unlock(&l3->list_lock); + offset = n->colour_next; + n->colour_next++; + if (n->colour_next >= cachep->colour) + n->colour_next = 0; + spin_unlock(&n->list_lock); offset *= cachep->colour_off; @@ -2972,34 +2736,34 @@ static int cache_grow(struct kmem_cache *cachep, * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - if (!objp) - objp = kmem_getpages(cachep, local_flags, nodeid); - if (!objp) + if (!page) + page = kmem_getpages(cachep, local_flags, nodeid); + if (!page) goto failed; /* Get slab management. */ - slabp = alloc_slabmgmt(cachep, objp, offset, + freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!slabp) + if (!freelist) goto opps1; - slab_map_pages(cachep, slabp, objp); + slab_map_pages(cachep, page, freelist); - cache_init_objs(cachep, slabp); + cache_init_objs(cachep, page); if (local_flags & __GFP_WAIT) local_irq_disable(); check_irq_off(); - spin_lock(&l3->list_lock); + spin_lock(&n->list_lock); /* Make slab active. 
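/*
 * Minimal userspace model of the array freelist that slab_get_obj() and
 * slab_put_obj() above now manipulate in place of kmem_bufctl_t chains:
 * page->freelist is an array of object indices, entries below 'active'
 * are handed out, entries at and above it are free, and a free pushes
 * the index back just under the watermark (the DEBUG double-free check
 * scans that free region for the index). Illustrative only; the struct
 * and helpers stand in for struct page and index_to_obj().
 */
#include <assert.h>
#include <stdio.h>

typedef unsigned char freelist_idx_t;

struct slab_model {
	freelist_idx_t freelist[8];	/* one index slot per object */
	unsigned int active;		/* objects currently handed out */
	char mem[8][32];		/* 8 fake 32-byte objects */
};

static void *slab_get_obj(struct slab_model *s)
{
	/* take the first free index, then advance the watermark */
	freelist_idx_t idx = s->freelist[s->active++];
	return s->mem[idx];
}

static void slab_put_obj(struct slab_model *s, void *obj)
{
	freelist_idx_t idx = (freelist_idx_t)((char (*)[32])obj - s->mem);

	/* retreat the watermark and record the freed index there */
	s->freelist[--s->active] = idx;
}

int main(void)
{
	struct slab_model s = { .active = 0 };
	unsigned int i;

	for (i = 0; i < 8; i++)		/* cache_init_objs() equivalent */
		s.freelist[i] = (freelist_idx_t)i;

	void *a = slab_get_obj(&s);
	void *b = slab_get_obj(&s);

	slab_put_obj(&s, a);		/* LIFO: 'a' is handed out next */
	assert(slab_get_obj(&s) == a);
	slab_put_obj(&s, b);
	printf("active=%u\n", s.active);
	return 0;
}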
*/ - list_add_tail(&slabp->list, &(l3->slabs_free)); + list_add_tail(&page->lru, &(n->slabs_free)); STATS_INC_GROWN(cachep); - l3->free_objects += cachep->num; - spin_unlock(&l3->list_lock); + n->free_objects += cachep->num; + spin_unlock(&n->list_lock); return 1; opps1: - kmem_freepages(cachep, objp); + kmem_freepages(cachep, page); failed: if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -3045,11 +2809,10 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) } static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, - void *caller) + unsigned long caller) { - struct page *page; unsigned int objnr; - struct slab *slabp; + struct page *page; BUG_ON(virt_to_cache(objp) != cachep); @@ -3057,30 +2820,25 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, kfree_debugcheck(objp); page = virt_to_head_page(objp); - slabp = page_get_slab(page); - if (cachep->flags & SLAB_RED_ZONE) { verify_redzone_free(cachep, objp); *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; } if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = caller; + *dbg_userword(cachep, objp) = (void *)caller; - objnr = obj_to_index(cachep, slabp, objp); + objnr = obj_to_index(cachep, page, objp); BUG_ON(objnr >= cachep->num); - BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); + BUG_ON(objp != index_to_obj(cachep, page, objnr)); -#ifdef CONFIG_DEBUG_SLAB_LEAK - slab_bufctl(slabp)[objnr] = BUFCTL_FREE; -#endif if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { - store_stackinfo(cachep, objp, (unsigned long)caller); + if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { + store_stackinfo(cachep, objp, caller); kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, 0); + cachep->size / PAGE_SIZE, 0); } else { poison_obj(cachep, objp, POISON_FREE); } @@ -3091,45 +2849,24 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, return objp; } -static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) -{ - kmem_bufctl_t i; - int entries = 0; - - /* Check slab's freelist to see if this obj is there. */ - for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { - entries++; - if (entries > cachep->num || i >= cachep->num) - goto bad; - } - if (entries != cachep->num - slabp->inuse) { -bad: - printk(KERN_ERR "slab: Internal list corruption detected in " - "cache '%s'(%d), slabp %p(%d). Tainted(%s). 
Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse, - print_tainted()); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, - sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), - 1); - BUG(); - } -} #else #define kfree_debugcheck(x) do { } while(0) #define cache_free_debugcheck(x,objp,z) (objp) -#define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, + bool force_refill) { int batchcount; - struct kmem_list3 *l3; + struct kmem_cache_node *n; struct array_cache *ac; int node; -retry: check_irq_off(); node = numa_mem_id(); + if (unlikely(force_refill)) + goto force_grow; +retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -3140,31 +2877,30 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - l3 = cachep->nodelists[node]; + n = cachep->node[node]; - BUG_ON(ac->avail > 0 || !l3); - spin_lock(&l3->list_lock); + BUG_ON(ac->avail > 0 || !n); + spin_lock(&n->list_lock); /* See if we can refill from the shared array */ - if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { - l3->shared->touched = 1; + if (n->shared && transfer_objects(ac, n->shared, batchcount)) { + n->shared->touched = 1; goto alloc_done; } while (batchcount > 0) { struct list_head *entry; - struct slab *slabp; + struct page *page; /* Get slab alloc is to come from. */ - entry = l3->slabs_partial.next; - if (entry == &l3->slabs_partial) { - l3->free_touched = 1; - entry = l3->slabs_free.next; - if (entry == &l3->slabs_free) + entry = n->slabs_partial.next; + if (entry == &n->slabs_partial) { + n->free_touched = 1; + entry = n->slabs_free.next; + if (entry == &n->slabs_free) goto must_grow; } - slabp = list_entry(entry, struct slab, list); - check_slabp(cachep, slabp); + page = list_entry(entry, struct page, lru); check_spinlock_acquired(cachep); /* @@ -3172,45 +2908,49 @@ retry: * there must be at least one object available for * allocation. */ - BUG_ON(slabp->inuse >= cachep->num); + BUG_ON(page->active >= cachep->num); - while (slabp->inuse < cachep->num && batchcount--) { + while (page->active < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, - node); + ac_put_obj(cachep, ac, slab_get_obj(cachep, page, + node)); } - check_slabp(cachep, slabp); /* move slabp to correct slabp list: */ - list_del(&slabp->list); - if (slabp->free == BUFCTL_END) - list_add(&slabp->list, &l3->slabs_full); + list_del(&page->lru); + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); else - list_add(&slabp->list, &l3->slabs_partial); + list_add(&page->lru, &n->slabs_partial); } must_grow: - l3->free_objects -= ac->avail; + n->free_objects -= ac->avail; alloc_done: - spin_unlock(&l3->list_lock); + spin_unlock(&n->list_lock); if (unlikely(!ac->avail)) { int x; +force_grow: x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); - if (!x && ac->avail == 0) /* no objects in sight? abort */ + node = numa_mem_id(); + + /* no objects in sight? abort */ + if (!x && (ac->avail == 0 || force_refill)) return NULL; if (!ac->avail) /* objects refilled by interrupt? 
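/*
 * Shape of the per-CPU fast path that cache_alloc_refill() above backs:
 * allocation pops from a small LIFO stack of object pointers, and only
 * on a miss is a batch pulled in from the shared array or node lists.
 * Purely illustrative; refill() stands in for cache_alloc_refill() and
 * the pfmemalloc/force_refill handling is omitted.
 */
#include <stdio.h>

#define AC_LIMIT 4

struct array_cache_model {
	unsigned int avail;		/* objects ready for this cpu */
	unsigned int batchcount;	/* how many to pull per refill */
	void *entry[AC_LIMIT];		/* LIFO stack of objects */
};

static int fake_obj[64];
static unsigned int next_obj;

/* pull batchcount objects from the (modelled) node free lists */
static void refill(struct array_cache_model *ac)
{
	while (ac->avail < ac->batchcount)
		ac->entry[ac->avail++] = &fake_obj[next_obj++];
}

static void *cache_alloc(struct array_cache_model *ac)
{
	if (!ac->avail)			/* miss: like cache_alloc_refill() */
		refill(ac);
	return ac->entry[--ac->avail];	/* hit: pop the hottest object */
}

int main(void)
{
	struct array_cache_model ac = { .avail = 0, .batchcount = 3 };

	printf("%p\n", cache_alloc(&ac));
	printf("%p\n", cache_alloc(&ac));
	return 0;
}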
*/ goto retry; } ac->touched = 1; - return ac->entry[--ac->avail]; + + return ac_get_obj(cachep, ac, flags, force_refill); } static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, @@ -3224,15 +2964,15 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, #if DEBUG static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, - gfp_t flags, void *objp, void *caller) + gfp_t flags, void *objp, unsigned long caller) { if (!objp) return objp; if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, 1); + cachep->size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -3241,7 +2981,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) - *dbg_userword(cachep, objp) = caller; + *dbg_userword(cachep, objp) = (void *)caller; if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || @@ -3256,16 +2996,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } -#ifdef CONFIG_DEBUG_SLAB_LEAK - { - struct slab *slabp; - unsigned objnr; - - slabp = page_get_slab(virt_to_head_page(objp)); - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; - slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; - } -#endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -3282,33 +3012,45 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) { - if (cachep == &cache_cache) + if (cachep == kmem_cache) return false; - return should_failslab(obj_size(cachep), flags, cachep->flags); + return should_failslab(cachep->object_size, flags, cachep->flags); } static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; + bool force_refill = false; check_irq_off(); ac = cpu_cache_get(cachep); if (likely(ac->avail)) { - STATS_INC_ALLOCHIT(cachep); ac->touched = 1; - objp = ac->entry[--ac->avail]; - } else { - STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); + objp = ac_get_obj(cachep, ac, flags, false); + /* - * the 'ac' may be updated by cache_alloc_refill(), - * and kmemleak_erase() requires its correct value. + * Allow for the possibility all avail objects are not allowed + * by the current flags */ - ac = cpu_cache_get(cachep); + if (objp) { + STATS_INC_ALLOCHIT(cachep); + goto out; + } + force_refill = true; } + + STATS_INC_ALLOCMISS(cachep); + objp = cache_alloc_refill(cachep, flags, force_refill); + /* + * the 'ac' may be updated by cache_alloc_refill(), + * and kmemleak_erase() requires its correct value. + */ + ac = cpu_cache_get(cachep); + +out: /* * To avoid a false negative, if an object that is in one of the * per-CPU caches is leaked, we need to make sure kmemleak doesn't @@ -3321,7 +3063,7 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) #ifdef CONFIG_NUMA /* - * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. + * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set. 
* * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. @@ -3336,7 +3078,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) - nid_alloc = slab_node(current->mempolicy); + nid_alloc = mempolicy_slab_node(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3345,7 +3087,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) /* * Fallback function if there was no memory available and no objects on a * certain node and fall back is permitted. First we scan all the - * available nodelists for available objects. If that fails then we + * available node for available objects. If that fails then we * perform an allocation without specifying a node. This allows the page * allocator to do its reclaim / fallback magic. We then insert the * slab into the proper nodelist and then allocate from it. @@ -3367,8 +3109,8 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); retry: /* @@ -3379,8 +3121,8 @@ retry: nid = zone_to_nid(zone); if (cpuset_zone_allowed_hardwall(zone, flags) && - cache->nodelists[nid] && - cache->nodelists[nid]->free_objects) { + cache->node[nid] && + cache->node[nid]->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (obj) @@ -3395,18 +3137,20 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. 
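/*
 * Shape of fallback_alloc() above, modelled in userspace: scan the
 * allowed nodes for an already-cached free object first, and only then
 * grow a fresh slab on whichever node the page allocator hands back and
 * retry there. Node state is faked with an array; the zonelist walk,
 * GFP_THISNODE details and cpuset retry loop are all omitted.
 */
#include <stdio.h>

#define NR_NODES 4

static int free_objects[NR_NODES] = { 0, 0, 2, 0 };

static int alloc_on_node(int nid)
{
	if (free_objects[nid] > 0) {
		free_objects[nid]--;
		return nid;		/* "object" = node it came from */
	}
	return -1;
}

static int fallback_alloc_model(void)
{
	/* 1) ____cache_alloc_node() on every allowed node */
	for (int nid = 0; nid < NR_NODES; nid++)
		if (alloc_on_node(nid) >= 0)
			return nid;

	/* 2) nothing cached anywhere: let the page allocator pick a node
	 *    (faked as node 1), grow a slab there, then allocate from it */
	free_objects[1] += 8;		/* cache_grow() equivalent */
	return alloc_on_node(1);
}

int main(void)
{
	printf("allocated on node %d\n", fallback_alloc_model());
	return 0;
}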
*/ + struct page *page; + if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, numa_mem_id()); + page = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); - if (obj) { + if (page) { /* * Insert into the appropriate per node queues */ - nid = page_to_nid(virt_to_page(obj)); - if (cache_grow(cache, flags, nid, obj)) { + nid = page_to_nid(page); + if (cache_grow(cache, flags, nid, page)) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (!obj) @@ -3423,7 +3167,7 @@ retry: } } - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return obj; } @@ -3435,51 +3179,50 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct list_head *entry; - struct slab *slabp; - struct kmem_list3 *l3; + struct page *page; + struct kmem_cache_node *n; void *obj; int x; - l3 = cachep->nodelists[nodeid]; - BUG_ON(!l3); + VM_BUG_ON(nodeid > num_online_nodes()); + n = cachep->node[nodeid]; + BUG_ON(!n); retry: check_irq_off(); - spin_lock(&l3->list_lock); - entry = l3->slabs_partial.next; - if (entry == &l3->slabs_partial) { - l3->free_touched = 1; - entry = l3->slabs_free.next; - if (entry == &l3->slabs_free) + spin_lock(&n->list_lock); + entry = n->slabs_partial.next; + if (entry == &n->slabs_partial) { + n->free_touched = 1; + entry = n->slabs_free.next; + if (entry == &n->slabs_free) goto must_grow; } - slabp = list_entry(entry, struct slab, list); + page = list_entry(entry, struct page, lru); check_spinlock_acquired_node(cachep, nodeid); - check_slabp(cachep, slabp); STATS_INC_NODEALLOCS(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - BUG_ON(slabp->inuse == cachep->num); + BUG_ON(page->active == cachep->num); - obj = slab_get_obj(cachep, slabp, nodeid); - check_slabp(cachep, slabp); - l3->free_objects--; + obj = slab_get_obj(cachep, page, nodeid); + n->free_objects--; /* move slabp to correct slabp list: */ - list_del(&slabp->list); + list_del(&page->lru); - if (slabp->free == BUFCTL_END) - list_add(&slabp->list, &l3->slabs_full); + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); else - list_add(&slabp->list, &l3->slabs_partial); + list_add(&page->lru, &n->slabs_partial); - spin_unlock(&l3->list_lock); + spin_unlock(&n->list_lock); goto done; must_grow: - spin_unlock(&l3->list_lock); + spin_unlock(&n->list_lock); x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); if (x) goto retry; @@ -3490,21 +3233,9 @@ done: return obj; } -/** - * kmem_cache_alloc_node - Allocate an object on the specified node - * @cachep: The cache to allocate from. - * @flags: See kmalloc(). - * @nodeid: node number of the target node. - * @caller: return address of caller, used for debug information - * - * Identical to kmem_cache_alloc but it will allocate memory on the given - * node, which can improve the performance for cpu bound structures. - * - * Fallback to other node is possible if __GFP_THISNODE is not set. 
- */ static __always_inline void * -__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, - void *caller) +slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, + unsigned long caller) { unsigned long save_flags; void *ptr; @@ -3517,13 +3248,15 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); if (nodeid == NUMA_NO_NODE) nodeid = slab_node; - if (unlikely(!cachep->nodelists[nodeid])) { + if (unlikely(!cachep->node[nodeid])) { /* Node not bootstrapped yet */ ptr = fallback_alloc(cachep, flags); goto out; @@ -3545,14 +3278,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, + kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, flags); - if (likely(ptr)) - kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); - - if (unlikely((flags & __GFP_ZERO) && ptr)) - memset(ptr, 0, obj_size(cachep)); + if (likely(ptr)) { + kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(ptr, 0, cachep->object_size); + } return ptr; } @@ -3562,7 +3295,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) { void *objp; - if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { objp = alternate_node_alloc(cache, flags); if (objp) goto out; @@ -3590,7 +3323,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3602,67 +3335,70 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) if (slab_should_failslab(cachep, flags)) return NULL; + cachep = memcg_kmem_get_cache(cachep, flags); + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); - kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, + kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, flags); prefetchw(objp); - if (likely(objp)) - kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); - - if (unlikely((flags & __GFP_ZERO) && objp)) - memset(objp, 0, obj_size(cachep)); + if (likely(objp)) { + kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(objp, 0, cachep->object_size); + } return objp; } /* - * Caller needs to acquire correct kmem_list's list_lock + * Caller needs to acquire correct kmem_cache_node's list_lock */ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node) { int i; - struct kmem_list3 *l3; + struct kmem_cache_node *n; for (i = 0; i < nr_objects; i++) { - void *objp = objpp[i]; - struct slab *slabp; + void *objp; + struct page *page; - slabp = virt_to_slab(objp); - l3 = cachep->nodelists[node]; - list_del(&slabp->list); + clear_obj_pfmemalloc(&objpp[i]); + objp = objpp[i]; + + page = virt_to_head_page(objp); + n = 
cachep->node[node]; + list_del(&page->lru); check_spinlock_acquired_node(cachep, node); - check_slabp(cachep, slabp); - slab_put_obj(cachep, slabp, objp, node); + slab_put_obj(cachep, page, objp, node); STATS_DEC_ACTIVE(cachep); - l3->free_objects++; - check_slabp(cachep, slabp); + n->free_objects++; /* fixup slab chains */ - if (slabp->inuse == 0) { - if (l3->free_objects > l3->free_limit) { - l3->free_objects -= cachep->num; + if (page->active == 0) { + if (n->free_objects > n->free_limit) { + n->free_objects -= cachep->num; /* No need to drop any previously held * lock here, even if we have a off-slab slab * descriptor it is guaranteed to come from * a different cache, refer to comments before * alloc_slabmgmt. */ - slab_destroy(cachep, slabp); + slab_destroy(cachep, page); } else { - list_add(&slabp->list, &l3->slabs_free); + list_add(&page->lru, &n->slabs_free); } } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. */ - list_add_tail(&slabp->list, &l3->slabs_partial); + list_add_tail(&page->lru, &n->slabs_partial); } } } @@ -3670,7 +3406,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) { int batchcount; - struct kmem_list3 *l3; + struct kmem_cache_node *n; int node = numa_mem_id(); batchcount = ac->batchcount; @@ -3678,10 +3414,10 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - l3 = cachep->nodelists[node]; - spin_lock(&l3->list_lock); - if (l3->shared) { - struct array_cache *shared_array = l3->shared; + n = cachep->node[node]; + spin_lock(&n->list_lock); + if (n->shared) { + struct array_cache *shared_array = n->shared; int max = shared_array->limit - shared_array->avail; if (max) { if (batchcount > max) @@ -3700,12 +3436,12 @@ free_done: int i = 0; struct list_head *p; - p = l3->slabs_free.next; - while (p != &(l3->slabs_free)) { - struct slab *slabp; + p = n->slabs_free.next; + while (p != &(n->slabs_free)) { + struct page *page; - slabp = list_entry(p, struct slab, list); - BUG_ON(slabp->inuse); + page = list_entry(p, struct page, lru); + BUG_ON(page->active); i++; p = p->next; @@ -3713,7 +3449,7 @@ free_done: STATS_SET_FREEABLE(cachep, i); } #endif - spin_unlock(&l3->list_lock); + spin_unlock(&n->list_lock); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } @@ -3723,7 +3459,7 @@ free_done: * be in this state _before_ it is released. Called with disabled ints. */ static inline void __cache_free(struct kmem_cache *cachep, void *objp, - void *caller) + unsigned long caller) { struct array_cache *ac = cpu_cache_get(cachep); @@ -3731,7 +3467,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); - kmemcheck_slab_free(cachep, objp, obj_size(cachep)); + kmemcheck_slab_free(cachep, objp, cachep->object_size); /* * Skip calling cache_free_alien() when the platform is not numa. 
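/*
 * Userspace model of cache_flusharray() above: when the per-CPU array
 * overflows, only the batchcount oldest entries at the bottom of the
 * stack go back to the node lists, and the survivors are slid down with
 * memmove() so the most recently freed (cache-hot) objects stay on top.
 * Sketch only; free_block() is modelled as a plain loop.
 */
#include <stdio.h>
#include <string.h>

static void free_block_model(void **objpp, int nr)
{
	for (int i = 0; i < nr; i++)	/* would return to n->slabs_* */
		printf("freeing %p\n", objpp[i]);
}

static void flusharray(void **entry, unsigned int *avail,
		       unsigned int batchcount)
{
	free_block_model(entry, batchcount);
	*avail -= batchcount;
	/* keep the hottest objects at the top of the stack */
	memmove(entry, &entry[batchcount], sizeof(void *) * *avail);
}

int main(void)
{
	int o[5];
	void *entry[5] = { &o[0], &o[1], &o[2], &o[3], &o[4] };
	unsigned int avail = 5;

	flusharray(entry, &avail, 3);
	printf("left: %u, top: %p (== %p)\n",
	       avail, entry[avail - 1], (void *)&o[4]);
	return 0;
}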
@@ -3750,7 +3486,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, cache_flusharray(cachep, ac); } - ac->entry[ac->avail++] = objp; + ac_put_obj(cachep, ac, objp); } /** @@ -3763,10 +3499,10 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret = slab_alloc(cachep, flags, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, - obj_size(cachep), cachep->buffer_size, flags); + cachep->object_size, cachep->size, flags); return ret; } @@ -3774,27 +3510,37 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void * -kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) +kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) { void *ret; - ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + ret = slab_alloc(cachep, flags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, - size, slab_buffer_size(cachep), flags); + size, cachep->size, flags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); #endif #ifdef CONFIG_NUMA +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * @nodeid: node number of the target node. + * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. + * + * Fallback to other node is possible if __GFP_THISNODE is not set. + */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, - obj_size(cachep), cachep->buffer_size, + cachep->object_size, cachep->size, flags, nodeid); return ret; @@ -3802,17 +3548,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(size_t size, - struct kmem_cache *cachep, +void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, gfp_t flags, - int nodeid) + int nodeid, + size_t size) { void *ret; - ret = __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + trace_kmalloc_node(_RET_IP_, ret, - size, slab_buffer_size(cachep), + size, cachep->size, flags, nodeid); return ret; } @@ -3820,34 +3566,33 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif static __always_inline void * -__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) +__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; - cachep = kmem_find_general_cachep(size, flags); + cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(size, cachep, flags, node); + return kmem_cache_alloc_node_trace(cachep, flags, node, size); } #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc_node(size_t size, gfp_t flags, int node) { - return __do_kmalloc_node(size, flags, node, - __builtin_return_address(0)); + return __do_kmalloc_node(size, flags, node, _RET_IP_); } EXPORT_SYMBOL(__kmalloc_node); void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, unsigned long caller) { - return 
__do_kmalloc_node(size, flags, node, (void *)caller); + return __do_kmalloc_node(size, flags, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); #else void *__kmalloc_node(size_t size, gfp_t flags, int node) { - return __do_kmalloc_node(size, flags, node, NULL); + return __do_kmalloc_node(size, flags, node, 0); } EXPORT_SYMBOL(__kmalloc_node); #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ @@ -3860,23 +3605,18 @@ EXPORT_SYMBOL(__kmalloc_node); * @caller: function caller for debug tracking of the caller */ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, - void *caller) + unsigned long caller) { struct kmem_cache *cachep; void *ret; - /* If you want to save a few bytes .text space: replace - * __ with kmem_. - * Then kmalloc uses the uninlined functions instead of the inline - * functions. - */ - cachep = __find_general_cachep(size, flags); + cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = __cache_alloc(cachep, flags, caller); + ret = slab_alloc(cachep, flags, caller); - trace_kmalloc((unsigned long) caller, ret, - size, cachep->buffer_size, flags); + trace_kmalloc(caller, ret, + size, cachep->size, flags); return ret; } @@ -3885,20 +3625,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc(size_t size, gfp_t flags) { - return __do_kmalloc(size, flags, __builtin_return_address(0)); + return __do_kmalloc(size, flags, _RET_IP_); } EXPORT_SYMBOL(__kmalloc); void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) { - return __do_kmalloc(size, flags, (void *)caller); + return __do_kmalloc(size, flags, caller); } EXPORT_SYMBOL(__kmalloc_track_caller); #else void *__kmalloc(size_t size, gfp_t flags) { - return __do_kmalloc(size, flags, NULL); + return __do_kmalloc(size, flags, 0); } EXPORT_SYMBOL(__kmalloc); #endif @@ -3914,12 +3654,15 @@ EXPORT_SYMBOL(__kmalloc); void kmem_cache_free(struct kmem_cache *cachep, void *objp) { unsigned long flags; + cachep = cache_from_obj(cachep, objp); + if (!cachep) + return; local_irq_save(flags); - debug_check_no_locks_freed(objp, obj_size(cachep)); + debug_check_no_locks_freed(objp, cachep->object_size); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, obj_size(cachep)); - __cache_free(cachep, objp, __builtin_return_address(0)); + debug_check_no_obj_freed(objp, cachep->object_size); + __cache_free(cachep, objp, _RET_IP_); local_irq_restore(flags); trace_kmem_cache_free(_RET_IP_, objp); @@ -3947,26 +3690,21 @@ void kfree(const void *objp) local_irq_save(flags); kfree_debugcheck(objp); c = virt_to_cache(objp); - debug_check_no_locks_freed(objp, obj_size(c)); - debug_check_no_obj_freed(objp, obj_size(c)); - __cache_free(c, (void *)objp, __builtin_return_address(0)); + debug_check_no_locks_freed(objp, c->object_size); + + debug_check_no_obj_freed(objp, c->object_size); + __cache_free(c, (void *)objp, _RET_IP_); local_irq_restore(flags); } EXPORT_SYMBOL(kfree); -unsigned int kmem_cache_size(struct kmem_cache *cachep) -{ - return obj_size(cachep); -} -EXPORT_SYMBOL(kmem_cache_size); - /* - * This initializes kmem_list3 or resizes various caches for all nodes. + * This initializes kmem_cache_node or resizes various caches for all nodes. 
*/ -static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) +static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) { int node; - struct kmem_list3 *l3; + struct kmem_cache_node *n; struct array_cache *new_shared; struct array_cache **new_alien = NULL; @@ -3989,58 +3727,58 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } } - l3 = cachep->nodelists[node]; - if (l3) { - struct array_cache *shared = l3->shared; + n = cachep->node[node]; + if (n) { + struct array_cache *shared = n->shared; - spin_lock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); if (shared) free_block(cachep, shared->entry, shared->avail, node); - l3->shared = new_shared; - if (!l3->alien) { - l3->alien = new_alien; + n->shared = new_shared; + if (!n->alien) { + n->alien = new_alien; new_alien = NULL; } - l3->free_limit = (1 + nr_cpus_node(node)) * + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); kfree(shared); free_alien_cache(new_alien); continue; } - l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); - if (!l3) { + n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); + if (!n) { free_alien_cache(new_alien); kfree(new_shared); goto fail; } - kmem_list3_init(l3); - l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - l3->shared = new_shared; - l3->alien = new_alien; - l3->free_limit = (1 + nr_cpus_node(node)) * + kmem_cache_node_init(n); + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; + n->shared = new_shared; + n->alien = new_alien; + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - cachep->nodelists[node] = l3; + cachep->node[node] = n; } return 0; fail: - if (!cachep->next.next) { + if (!cachep->list.next) { /* Cache is not active yet. 
Roll back what we did */ node--; while (node >= 0) { - if (cachep->nodelists[node]) { - l3 = cachep->nodelists[node]; + if (cachep->node[node]) { + n = cachep->node[node]; - kfree(l3->shared); - free_alien_cache(l3->alien); - kfree(l3); - cachep->nodelists[node] = NULL; + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[node] = NULL; } node--; } @@ -4065,8 +3803,8 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } -/* Always called with the cache_chain_mutex held */ -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, +/* Always called with the slab_mutex held */ +static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { struct ccupdate_struct *new; @@ -4100,21 +3838,58 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); - return alloc_kmemlist(cachep, gfp); + return alloc_kmem_cache_node(cachep, gfp); +} + +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared, gfp_t gfp) +{ + int ret; + struct kmem_cache *c = NULL; + int i = 0; + + ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); + + if (slab_state < FULL) + return ret; + + if ((ret < 0) || !is_root_cache(cachep)) + return ret; + + VM_BUG_ON(!mutex_is_locked(&slab_mutex)); + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(cachep, i); + if (c) + /* return value determined by the parent cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); + } + + return ret; } -/* Called with cache_chain_mutex held always */ +/* Called with slab_mutex held always */ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) { int err; - int limit, shared; + int limit = 0; + int shared = 0; + int batchcount = 0; + + if (!is_root_cache(cachep)) { + struct kmem_cache *root = memcg_root_cache(cachep); + limit = root->limit; + shared = root->shared; + batchcount = root->batchcount; + } + if (limit && shared && batchcount) + goto skip_setup; /* * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm @@ -4124,13 +3899,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) * The numbers are guessed, we should auto-tune as described by * Bonwick. */ - if (cachep->buffer_size > 131072) + if (cachep->size > 131072) limit = 1; - else if (cachep->buffer_size > PAGE_SIZE) + else if (cachep->size > PAGE_SIZE) limit = 8; - else if (cachep->buffer_size > 1024) + else if (cachep->size > 1024) limit = 24; - else if (cachep->buffer_size > 256) + else if (cachep->size > 256) limit = 54; else limit = 120; @@ -4145,7 +3920,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) * to a larger limit. Thus disabled by default. 
*/ shared = 0; - if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) + if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) shared = 8; #if DEBUG @@ -4156,7 +3931,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) if (limit > 32) limit = 32; #endif - err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); + batchcount = (limit + 1) / 2; +skip_setup: + err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); @@ -4164,11 +3941,11 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) } /* - * Drain an array if it contains any elements taking the l3 lock only if - * necessary. Note that the l3 listlock also protects the array_cache + * Drain an array if it contains any elements taking the node lock only if + * necessary. Note that the node listlock also protects the array_cache * if drain_array() is used on the shared array. */ -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, struct array_cache *ac, int force, int node) { int tofree; @@ -4178,7 +3955,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, if (ac->touched && !force) { ac->touched = 0; } else { - spin_lock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) @@ -4188,7 +3965,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); } } @@ -4207,45 +3984,45 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, static void cache_reap(struct work_struct *w) { struct kmem_cache *searchp; - struct kmem_list3 *l3; + struct kmem_cache_node *n; int node = numa_mem_id(); struct delayed_work *work = to_delayed_work(w); - if (!mutex_trylock(&cache_chain_mutex)) + if (!mutex_trylock(&slab_mutex)) /* Give up. Setup the next iteration. */ goto out; - list_for_each_entry(searchp, &cache_chain, next) { + list_for_each_entry(searchp, &slab_caches, list) { check_irq_on(); /* - * We only take the l3 lock if absolutely necessary and we + * We only take the node lock if absolutely necessary and we * have established with reasonable certainty that * we can do some work if the lock was obtained. */ - l3 = searchp->nodelists[node]; + n = searchp->node[node]; - reap_alien(searchp, l3); + reap_alien(searchp, n); - drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + drain_array(searchp, n, cpu_cache_get(searchp), 0, node); /* * These are racy checks but it does not matter * if we skip one check or scan twice. 
*/ - if (time_after(l3->next_reap, jiffies)) + if (time_after(n->next_reap, jiffies)) goto next; - l3->next_reap = jiffies + REAPTIMEOUT_LIST3; + n->next_reap = jiffies + REAPTIMEOUT_NODE; - drain_array(searchp, l3, l3->shared, 0, node); + drain_array(searchp, n, n->shared, 0, node); - if (l3->free_touched) - l3->free_touched = 0; + if (n->free_touched) + n->free_touched = 0; else { int freed; - freed = drain_freelist(searchp, l3, (l3->free_limit + + freed = drain_freelist(searchp, n, (n->free_limit + 5 * searchp->num - 1) / (5 * searchp->num)); STATS_ADD_REAPED(searchp, freed); } @@ -4253,63 +4030,17 @@ next: cond_resched(); } check_irq_on(); - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); } #ifdef CONFIG_SLABINFO - -static void print_slabinfo_header(struct seq_file *m) -{ - /* - * Output format version, so at least we can change it - * without _too_ many complaints. - */ -#if STATS - seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); -#else - seq_puts(m, "slabinfo - version: 2.1\n"); -#endif - seq_puts(m, "# name <active_objs> <num_objs> <objsize> " - "<objperslab> <pagesperslab>"); - seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); - seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); -#if STATS - seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " - "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); - seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); -#endif - seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - mutex_lock(&cache_chain_mutex); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&cache_chain, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &cache_chain, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&cache_chain_mutex); -} - -static int s_show(struct seq_file *m, void *p) +void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); - struct slab *slabp; + struct page *page; unsigned long active_objs; unsigned long num_objs; unsigned long active_slabs = 0; @@ -4317,42 +4048,42 @@ static int s_show(struct seq_file *m, void *p) const char *name; char *error = NULL; int node; - struct kmem_list3 *l3; + struct kmem_cache_node *n; active_objs = 0; num_slabs = 0; for_each_online_node(node) { - l3 = cachep->nodelists[node]; - if (!l3) + n = cachep->node[node]; + if (!n) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); - list_for_each_entry(slabp, &l3->slabs_full, list) { - if (slabp->inuse != cachep->num && !error) + list_for_each_entry(page, &n->slabs_full, lru) { + if (page->active != cachep->num && !error) error = "slabs_full accounting error"; active_objs += cachep->num; active_slabs++; } - list_for_each_entry(slabp, &l3->slabs_partial, list) { - if (slabp->inuse == cachep->num && !error) - error = "slabs_partial inuse accounting error"; - if (!slabp->inuse && !error) - error = "slabs_partial/inuse accounting error"; - active_objs += slabp->inuse; + list_for_each_entry(page, &n->slabs_partial, lru) { + if (page->active == cachep->num && !error) + error = "slabs_partial accounting error"; + if 
(!page->active && !error) + error = "slabs_partial accounting error"; + active_objs += page->active; active_slabs++; } - list_for_each_entry(slabp, &l3->slabs_free, list) { - if (slabp->inuse && !error) - error = "slabs_free/inuse accounting error"; + list_for_each_entry(page, &n->slabs_free, lru) { + if (page->active && !error) + error = "slabs_free accounting error"; num_slabs++; } - free_objects += l3->free_objects; - if (l3->shared) - shared_avail += l3->shared->avail; + free_objects += n->free_objects; + if (n->shared) + shared_avail += n->shared->avail; - spin_unlock_irq(&l3->list_lock); + spin_unlock_irq(&n->list_lock); } num_slabs += active_slabs; num_objs = num_slabs * cachep->num; @@ -4363,15 +4094,22 @@ static int s_show(struct seq_file *m, void *p) if (error) printk(KERN_ERR "slab: cache %s error: %s\n", name, error); - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", - name, active_objs, num_objs, cachep->buffer_size, - cachep->num, (1 << cachep->gfporder)); - seq_printf(m, " : tunables %4u %4u %4u", - cachep->limit, cachep->batchcount, cachep->shared); - seq_printf(m, " : slabdata %6lu %6lu %6lu", - active_slabs, num_slabs, shared_avail); + sinfo->active_objs = active_objs; + sinfo->num_objs = num_objs; + sinfo->active_slabs = active_slabs; + sinfo->num_slabs = num_slabs; + sinfo->shared_avail = shared_avail; + sinfo->limit = cachep->limit; + sinfo->batchcount = cachep->batchcount; + sinfo->shared = cachep->shared; + sinfo->objects_per_slab = cachep->num; + sinfo->cache_order = cachep->gfporder; +} + +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) +{ #if STATS - { /* list3 stats */ + { /* node stats */ unsigned long high = cachep->high_mark; unsigned long allocs = cachep->num_allocations; unsigned long grown = cachep->grown; @@ -4399,31 +4137,8 @@ static int s_show(struct seq_file *m, void *p) allochit, allocmiss, freehit, freemiss); } #endif - seq_putc(m, '\n'); - return 0; } -/* - * slabinfo_op - iterator that generates /proc/slabinfo - * - * Output layout: - * cache-name - * num-active-objs - * total-objs - * object size - * num-active-slabs - * total-slabs - * num-pages-per-slab - * + further values on SMP and with statistics enabled - */ - -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - #define MAX_SLABINFO_WRITE 128 /** * slabinfo_write - Tuning for the slab allocator @@ -4432,7 +4147,7 @@ static const struct seq_operations slabinfo_op = { * @count: data length * @ppos: unused */ -static ssize_t slabinfo_write(struct file *file, const char __user *buffer, +ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; @@ -4454,9 +4169,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, return -EINVAL; /* Find the cache in the chain of caches. 
*/ - mutex_lock(&cache_chain_mutex); + mutex_lock(&slab_mutex); res = -EINVAL; - list_for_each_entry(cachep, &cache_chain, next) { + list_for_each_entry(cachep, &slab_caches, list) { if (!strcmp(cachep->name, kbuf)) { if (limit < 1 || batchcount < 1 || batchcount > limit || shared < 0) { @@ -4469,31 +4184,18 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, break; } } - mutex_unlock(&cache_chain_mutex); + mutex_unlock(&slab_mutex); if (res >= 0) res = count; return res; } -static int slabinfo_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slabinfo_op); -} - -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .write = slabinfo_write, - .llseek = seq_lseek, - .release = seq_release, -}; - #ifdef CONFIG_DEBUG_SLAB_LEAK static void *leaks_start(struct seq_file *m, loff_t *pos) { - mutex_lock(&cache_chain_mutex); - return seq_list_start(&cache_chain, *pos); + mutex_lock(&slab_mutex); + return seq_list_start(&slab_caches, *pos); } static inline int add_caller(unsigned long *n, unsigned long v) @@ -4526,15 +4228,27 @@ static inline int add_caller(unsigned long *n, unsigned long v) return 1; } -static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) +static void handle_slab(unsigned long *n, struct kmem_cache *c, + struct page *page) { void *p; - int i; + int i, j; + if (n[0] == n[1]) return; - for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { - if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) + for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { + bool active = true; + + for (j = page->active; j < c->num; j++) { + /* Skip freed item */ + if (get_free_obj(page, j) == i) { + active = false; + break; + } + } + if (!active) continue; + if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) return; } @@ -4558,11 +4272,11 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); - struct slab *slabp; - struct kmem_list3 *l3; + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); + struct page *page; + struct kmem_cache_node *n; const char *name; - unsigned long *n = m->private; + unsigned long *x = m->private; int node; int i; @@ -4573,43 +4287,43 @@ static int leaks_show(struct seq_file *m, void *p) /* OK, we can do it */ - n[1] = 0; + x[1] = 0; for_each_online_node(node) { - l3 = cachep->nodelists[node]; - if (!l3) + n = cachep->node[node]; + if (!n) continue; check_irq_on(); - spin_lock_irq(&l3->list_lock); + spin_lock_irq(&n->list_lock); - list_for_each_entry(slabp, &l3->slabs_full, list) - handle_slab(n, cachep, slabp); - list_for_each_entry(slabp, &l3->slabs_partial, list) - handle_slab(n, cachep, slabp); - spin_unlock_irq(&l3->list_lock); + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); + spin_unlock_irq(&n->list_lock); } name = cachep->name; - if (n[0] == n[1]) { + if (x[0] == x[1]) { /* Increase the buffer size */ - mutex_unlock(&cache_chain_mutex); - m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); + mutex_unlock(&slab_mutex); + m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL); if (!m->private) { /* Too bad, we are really out */ - m->private = n; - mutex_lock(&cache_chain_mutex); + m->private = x; + mutex_lock(&slab_mutex); return -ENOMEM; 
} - *(unsigned long *)m->private = n[0] * 2; - kfree(n); - mutex_lock(&cache_chain_mutex); + *(unsigned long *)m->private = x[0] * 2; + kfree(x); + mutex_lock(&slab_mutex); /* Now make sure this entry will be retried */ m->count = m->size; return 0; } - for (i = 0; i < n[1]; i++) { - seq_printf(m, "%s: %lu ", name, n[2*i+3]); - show_symbol(m, n[2*i+2]); + for (i = 0; i < x[1]; i++) { + seq_printf(m, "%s: %lu ", name, x[2*i+3]); + show_symbol(m, x[2*i+2]); seq_putc(m, '\n'); } @@ -4618,8 +4332,8 @@ static int leaks_show(struct seq_file *m, void *p) static const struct seq_operations slabstats_op = { .start = leaks_start, - .next = s_next, - .stop = s_stop, + .next = slab_next, + .stop = slab_stop, .show = leaks_show, }; @@ -4650,7 +4364,6 @@ static const struct file_operations proc_slabstats_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif @@ -4677,6 +4390,6 @@ size_t ksize(const void *objp) if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - return obj_size(virt_to_cache(objp)); + return virt_to_cache(objp)->object_size; } EXPORT_SYMBOL(ksize); diff --git a/mm/slab.h b/mm/slab.h new file mode 100644 index 000000000000..6bd4c353704f --- /dev/null +++ b/mm/slab.h @@ -0,0 +1,292 @@ +#ifndef MM_SLAB_H +#define MM_SLAB_H +/* + * Internal slab definitions + */ + +/* + * State of the slab allocator. + * + * This is used to describe the states of the allocator during bootup. + * Allocators use this to gradually bootstrap themselves. Most allocators + * have the problem that the structures used for managing slab caches are + * allocated from slab caches themselves. + */ +enum slab_state { + DOWN, /* No slab functionality yet */ + PARTIAL, /* SLUB: kmem_cache_node available */ + PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ + PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ + UP, /* Slab caches usable but not all extras yet */ + FULL /* Everything is working */ +}; + +extern enum slab_state slab_state; + +/* The slab cache mutex protects the management structures during changes */ +extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ +extern struct list_head slab_caches; + +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size); + +#ifndef CONFIG_SLOB +/* Kmalloc array related functions */ +void create_kmalloc_caches(unsigned long); + +/* Find the kmalloc slab corresponding to a certain size */ +struct kmem_cache *kmalloc_slab(size_t, gfp_t); +#endif + + +/* Functions provided by the slab allocators */ +extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); + +extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, + unsigned long flags); +extern void create_boot_cache(struct kmem_cache *, const char *name, + size_t size, unsigned long flags); + +struct mem_cgroup; +#ifdef CONFIG_SLUB +struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)); +#else +static inline struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ return NULL; } +#endif + + +/* Legal flag mask for kmem_cache_create(), for various configurations */
+#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_NOTRACK) +#else +#define SLAB_CACHE_FLAGS (0) +#endif + +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + +int __kmem_cache_shutdown(struct kmem_cache *); +void slab_kmem_cache_release(struct kmem_cache *); + +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +#ifdef CONFIG_MEMCG_KMEM +static inline bool is_root_cache(struct kmem_cache *s) +{ + return !s->memcg_params || s->memcg_params->is_root_cache; +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ + if (!is_root_cache(s)) + atomic_add(1 << order, &s->memcg_params->nr_pages); +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ + if (is_root_cache(s)) + return; + + if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) + mem_cgroup_destroy_cache(s); +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return (p == s) || + (s->memcg_params && (p == s->memcg_params->root_cache)); +} + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, it is better to refer to them by the base name. + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + return s->memcg_params->root_cache->name; + return s->name; +} + +/* + * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. + * That said, the caller must ensure the memcg's cache won't go away. Since, + * once created, a memcg's cache is destroyed only along with the root cache, + * this is true if we are going to allocate from the cache or hold a reference + * to the root cache by other means. Otherwise, we should hold either the slab_mutex + * or the memcg's slab_caches_mutex while calling this function and accessing + * the returned value. + */ +static inline struct kmem_cache * +cache_from_memcg_idx(struct kmem_cache *s, int idx) +{ + struct kmem_cache *cachep; + struct memcg_cache_params *params; + + if (!s->memcg_params) + return NULL; + + rcu_read_lock(); + params = rcu_dereference(s->memcg_params); + cachep = params->memcg_caches[idx]; + rcu_read_unlock(); + + /* + * Make sure we will access the up-to-date value.
The code updating + * memcg_caches issues a write barrier to match this (see + * memcg_register_cache()). + */ + smp_read_barrier_depends(); + return cachep; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params->root_cache; +} +#else +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +static inline void memcg_bind_pages(struct kmem_cache *s, int order) +{ +} + +static inline void memcg_release_pages(struct kmem_cache *s, int order) +{ +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return true; +} + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache * +cache_from_memcg_idx(struct kmem_cache *s, int idx) +{ + return NULL; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} +#endif + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + struct page *page; + + /* + * When kmemcg is not being used, both assignments should return the + * same value, but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + return s; + + page = virt_to_head_page(x); + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. %s but object is from %s\n", + __FUNCTION__, cachep->name, s->name); + WARN_ON_ONCE(1); + return s; +} +#endif + + +/* + * The slab lists for all objects.
+ */ +struct kmem_cache_node { + spinlock_t list_lock; + +#ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct array_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; + +void *slab_next(struct seq_file *m, void *p, loff_t *pos); +void slab_stop(struct seq_file *m, void *p); diff --git a/mm/slab_common.c b/mm/slab_common.c new file mode 100644 index 000000000000..102cc6fca3d3 --- /dev/null +++ b/mm/slab_common.c @@ -0,0 +1,748 @@ +/* + * Slab allocator functions that are independent of the allocator strategy + * + * (C) 2012 Christoph Lameter <cl@linux.com> + */ +#include <linux/slab.h> + +#include <linux/mm.h> +#include <linux/poison.h> +#include <linux/interrupt.h> +#include <linux/memory.h> +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/page.h> +#include <linux/memcontrol.h> +#include <trace/events/kmem.h> + +#include "slab.h" + +enum slab_state slab_state; +LIST_HEAD(slab_caches); +DEFINE_MUTEX(slab_mutex); +struct kmem_cache *kmem_cache; + +#ifdef CONFIG_DEBUG_VM +static int kmem_cache_sanity_check(const char *name, size_t size) +{ + struct kmem_cache *s = NULL; + + if (!name || in_interrupt() || size < sizeof(void *) || + size > KMALLOC_MAX_SIZE) { + pr_err("kmem_cache_create(%s) integrity check failed\n", name); + return -EINVAL; + } + + list_for_each_entry(s, &slab_caches, list) { + char tmp; + int res; + + /* + * This happens when the module gets unloaded and doesn't + * destroy its slab cache and no-one else reuses the vmalloc + * area of the module. Print a warning. + */ + res = probe_kernel_address(s->name, tmp); + if (res) { + pr_err("Slab cache with size %d has lost its name\n", + s->object_size); + continue; + } + +#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) + if (!strcmp(s->name, name)) { + pr_err("%s (%s): Cache name already exists.\n", + __func__, name); + dump_stack(); + s = NULL; + return -EINVAL; + } +#endif + } + + WARN_ON(strchr(name, ' ')); /* It confuses parsers */ + return 0; +} +#else +static inline int kmem_cache_sanity_check(const char *name, size_t size) +{ + return 0; +} +#endif + +#ifdef CONFIG_MEMCG_KMEM +int memcg_update_all_caches(int num_memcgs) +{ + struct kmem_cache *s; + int ret = 0; + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + ret = memcg_update_cache_size(s, num_memcgs); + /* + * See comment in memcontrol.c, memcg_update_cache_size: + * Instead of freeing the memory, we'll just leave the caches + * up to this point in an updated state. 
+ */ + if (ret) + goto out; + } + + memcg_update_array_size(num_memcgs); +out: + mutex_unlock(&slab_mutex); + return ret; +} +#endif + +/* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater, then use it. + */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + +static struct kmem_cache * +do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *), + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct kmem_cache *s; + int err; + + err = -ENOMEM; + s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); + if (!s) + goto out; + + s->name = name; + s->object_size = object_size; + s->size = size; + s->align = align; + s->ctor = ctor; + + err = memcg_alloc_cache_params(memcg, s, root_cache); + if (err) + goto out_free_cache; + + err = __kmem_cache_create(s, flags); + if (err) + goto out_free_cache; + + s->refcount = 1; + list_add(&s->list, &slab_caches); + memcg_register_cache(s); +out: + if (err) + return ERR_PTR(err); + return s; + +out_free_cache: + memcg_free_cache_params(s); + kfree(s); + goto out; +} + +/* + * kmem_cache_create - Create a cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @ctor: A constructor for the objects. + * + * Returns a ptr to the cache on success, NULL on failure. + * Cannot be called within an interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + */ +struct kmem_cache * +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + struct kmem_cache *s; + char *cache_name; + int err; + + get_online_cpus(); + mutex_lock(&slab_mutex); + + err = kmem_cache_sanity_check(name, size); + if (err) + goto out_unlock; + + /* + * Some allocators will constrain the set of valid flags to a subset + * of all flags. We expect them to define CACHE_CREATE_MASK in this + * case, and we'll just provide them with a sanitized version of the + * passed flags.
+ */ + flags &= CACHE_CREATE_MASK; + + s = __kmem_cache_alias(name, size, align, flags, ctor); + if (s) + goto out_unlock; + + cache_name = kstrdup(name, GFP_KERNEL); + if (!cache_name) { + err = -ENOMEM; + goto out_unlock; + } + + s = do_kmem_cache_create(cache_name, size, size, + calculate_alignment(flags, align, size), + flags, ctor, NULL, NULL); + if (IS_ERR(s)) { + err = PTR_ERR(s); + kfree(cache_name); + } + +out_unlock: + mutex_unlock(&slab_mutex); + put_online_cpus(); + + if (err) { + if (flags & SLAB_PANIC) + panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", + name, err); + else { + printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", + name, err); + dump_stack(); + } + return NULL; + } + return s; +} +EXPORT_SYMBOL(kmem_cache_create); + +#ifdef CONFIG_MEMCG_KMEM +/* + * kmem_cache_create_memcg - Create a cache for a memory cgroup. + * @memcg: The memory cgroup the new cache is for. + * @root_cache: The parent of the new cache. + * + * This function attempts to create a kmem cache that will serve allocation + * requests going from @memcg to @root_cache. The new cache inherits properties + * from its parent. + */ +void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct kmem_cache *s; + char *cache_name; + + get_online_cpus(); + mutex_lock(&slab_mutex); + + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) + goto out_unlock; + + cache_name = memcg_create_cache_name(memcg, root_cache); + if (!cache_name) + goto out_unlock; + + s = do_kmem_cache_create(cache_name, root_cache->object_size, + root_cache->size, root_cache->align, + root_cache->flags, root_cache->ctor, + memcg, root_cache); + if (IS_ERR(s)) { + kfree(cache_name); + goto out_unlock; + } + + s->allocflags |= __GFP_KMEMCG; + +out_unlock: + mutex_unlock(&slab_mutex); + put_online_cpus(); +} + +static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + int rc; + + if (!s->memcg_params || + !s->memcg_params->is_root_cache) + return 0; + + mutex_unlock(&slab_mutex); + rc = __kmem_cache_destroy_memcg_children(s); + mutex_lock(&slab_mutex); + + return rc; +} +#else +static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + return 0; +} +#endif /* CONFIG_MEMCG_KMEM */ + +void slab_kmem_cache_release(struct kmem_cache *s) +{ + kfree(s->name); + kmem_cache_free(kmem_cache, s); +} + +void kmem_cache_destroy(struct kmem_cache *s) +{ + get_online_cpus(); + mutex_lock(&slab_mutex); + + s->refcount--; + if (s->refcount) + goto out_unlock; + + if (kmem_cache_destroy_memcg_children(s) != 0) + goto out_unlock; + + list_del(&s->list); + memcg_unregister_cache(s); + + if (__kmem_cache_shutdown(s) != 0) { + list_add(&s->list, &slab_caches); + memcg_register_cache(s); + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + goto out_unlock; + } + + mutex_unlock(&slab_mutex); + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); + + memcg_free_cache_params(s); +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + slab_kmem_cache_release(s); +#endif + goto out_put_cpus; + +out_unlock: + mutex_unlock(&slab_mutex); +out_put_cpus: + put_online_cpus(); +} +EXPORT_SYMBOL(kmem_cache_destroy); + +int slab_is_available(void) +{ + return slab_state >= UP; +} + +#ifndef CONFIG_SLOB +/* 
Create a cache during boot when no slab services are available yet */ +void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, + unsigned long flags) +{ + int err; + + s->name = name; + s->size = s->object_size = size; + s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + err = __kmem_cache_create(s, flags); + + if (err) + panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n", + name, size, err); + + s->refcount = -1; /* Exempt from merging for now */ +} + +struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, + unsigned long flags) +{ + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + + if (!s) + panic("Out of memory when creating slab %s\n", name); + + create_boot_cache(s, name, size, flags); + list_add(&s->list, &slab_caches); + s->refcount = 1; + return s; +} + +struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +EXPORT_SYMBOL(kmalloc_caches); + +#ifdef CONFIG_ZONE_DMA +struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; +EXPORT_SYMBOL(kmalloc_dma_caches); + +#endif + +/* + * Conversion table for small slab sizes / 8 to the index in the + * kmalloc array. This is necessary for slabs < 192 since we have + * non-power-of-two cache sizes there. The size of larger slabs can be + * determined using fls. + */ +static s8 size_index[24] = { + 3, /* 8 */ + 4, /* 16 */ + 5, /* 24 */ + 5, /* 32 */ + 6, /* 40 */ + 6, /* 48 */ + 6, /* 56 */ + 6, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 7, /* 104 */ + 7, /* 112 */ + 7, /* 120 */ + 7, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; + +static inline int size_index_elem(size_t bytes) +{ + return (bytes - 1) / 8; +} + +/* + * Find the kmem_cache structure that serves a given size of + * allocation. + */ +struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) +{ + int index; + + if (unlikely(size > KMALLOC_MAX_SIZE)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + if (size <= 192) { + if (!size) + return ZERO_SIZE_PTR; + + index = size_index[size_index_elem(size)]; + } else + index = fls(size - 1); + +#ifdef CONFIG_ZONE_DMA + if (unlikely((flags & GFP_DMA))) + return kmalloc_dma_caches[index]; + +#endif + return kmalloc_caches[index]; +} + +/* + * Create the kmalloc array. Some of the regular kmalloc arrays + * may already have been created because they were needed to + * enable allocations for slab creation. + */ +void __init create_kmalloc_caches(unsigned long flags) +{ + int i; + + /* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * MIPS, it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN. + */ + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || + (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); + + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { + int elem = size_index_elem(i); + + if (elem >= ARRAY_SIZE(size_index)) + break; + size_index[elem] = KMALLOC_SHIFT_LOW; + } + + if (KMALLOC_MIN_SIZE >= 64) { + /* + * The 96 byte size cache is not used if the alignment + * is 64 bytes.
+ */ + for (i = 64 + 8; i <= 96; i += 8) + size_index[size_index_elem(i)] = 7; + + } + + if (KMALLOC_MIN_SIZE >= 128) { + /* + * The 192 byte sized cache is not used if the alignment + * is 128 bytes. Redirect kmalloc to use the 256 byte cache + * instead. + */ + for (i = 128 + 8; i <= 192; i += 8) + size_index[size_index_elem(i)] = 8; + } + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + if (!kmalloc_caches[i]) { + kmalloc_caches[i] = create_kmalloc_cache(NULL, + 1 << i, flags); + } + + /* + * Caches that are not of a power-of-two size. + * These have to be created immediately after the + * earlier power-of-two caches. + */ + if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) + kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); + + if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) + kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); + } + + /* Kmalloc array is now usable */ + slab_state = UP; + + for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { + struct kmem_cache *s = kmalloc_caches[i]; + char *n; + + if (s) { + n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); + + BUG_ON(!n); + s->name = n; + } + } + +#ifdef CONFIG_ZONE_DMA + for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { + struct kmem_cache *s = kmalloc_caches[i]; + + if (s) { + int size = kmalloc_size(i); + char *n = kasprintf(GFP_NOWAIT, + "dma-kmalloc-%d", size); + + BUG_ON(!n); + kmalloc_dma_caches[i] = create_kmalloc_cache(n, + size, SLAB_CACHE_DMA | flags); + } + } +#endif +} +#endif /* !CONFIG_SLOB */ + +#ifdef CONFIG_TRACING +void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +{ + void *ret = kmalloc_order(size, flags, order); + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order_trace); +#endif + +#ifdef CONFIG_SLABINFO + +#ifdef CONFIG_SLAB +#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) +#else +#define SLABINFO_RIGHTS S_IRUSR +#endif + +void print_slabinfo_header(struct seq_file *m) +{ + /* + * Output format version, so at least we can change it + * without _too_ many complaints.
+ */ +#ifdef CONFIG_DEBUG_SLAB + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); +#else + seq_puts(m, "slabinfo - version: 2.1\n"); +#endif + seq_puts(m, "# name <active_objs> <num_objs> <objsize> " + "<objperslab> <pagesperslab>"); + seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); + seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); +#ifdef CONFIG_DEBUG_SLAB + seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " + "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); + seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); +#endif + seq_putc(m, '\n'); +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + mutex_lock(&slab_mutex); + if (!n) + print_slabinfo_header(m); + + return seq_list_start(&slab_caches, *pos); +} + +void *slab_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +void slab_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + +static void +memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) +{ + struct kmem_cache *c; + struct slabinfo sinfo; + int i; + + if (!is_root_cache(s)) + return; + + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); + if (!c) + continue; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(c, &sinfo); + + info->active_slabs += sinfo.active_slabs; + info->num_slabs += sinfo.num_slabs; + info->shared_avail += sinfo.shared_avail; + info->active_objs += sinfo.active_objs; + info->num_objs += sinfo.num_objs; + } +} + +int cache_show(struct kmem_cache *s, struct seq_file *m) +{ + struct slabinfo sinfo; + + memset(&sinfo, 0, sizeof(sinfo)); + get_slabinfo(s, &sinfo); + + memcg_accumulate_slabinfo(s, &sinfo); + + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", + cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, + sinfo.objects_per_slab, (1 << sinfo.cache_order)); + + seq_printf(m, " : tunables %4u %4u %4u", + sinfo.limit, sinfo.batchcount, sinfo.shared); + seq_printf(m, " : slabdata %6lu %6lu %6lu", + sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); + slabinfo_show_stats(m, s); + seq_putc(m, '\n'); + return 0; +} + +static int s_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + + if (!is_root_cache(s)) + return 0; + return cache_show(s, m); +} + +/* + * slabinfo_op - iterator that generates /proc/slabinfo + * + * Output layout: + * cache-name + * num-active-objs + * total-objs + * object size + * num-active-slabs + * total-slabs + * num-pages-per-slab + * + further values on SMP and with statistics enabled + */ +static const struct seq_operations slabinfo_op = { + .start = s_start, + .next = slab_next, + .stop = slab_stop, + .show = s_show, +}; + +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .write = slabinfo_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init slab_proc_init(void) +{ + proc_create("slabinfo", SLABINFO_RIGHTS, NULL, + &proc_slabinfo_operations); + return 0; +} +module_init(slab_proc_init); +#endif /* CONFIG_SLABINFO */ diff --git a/mm/slob.c b/mm/slob.c index 8105be42cad1..730cad45d4be 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -28,9 +28,8 @@ * from kmalloc are prepended with a 4-byte header with the kmalloc size. 
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls * alloc_pages() directly, allocating compound pages so the page order - * does not have to be separately tracked, and also stores the exact - * allocation size in page->private so that it can be used to accurately - * provide ksize(). These objects are detected in kfree() because slob_page() + * does not have to be separately tracked. + * These objects are detected in kfree() because PageSlab() * is false for them. * * SLAB is emulated on top of SLOB by simply calling constructors and @@ -59,6 +58,7 @@ #include <linux/kernel.h> #include <linux/slab.h> + #include <linux/mm.h> #include <linux/swap.h> /* struct reclaim_state */ #include <linux/cache.h> @@ -72,6 +72,7 @@ #include <linux/atomic.h> +#include "slab.h" /* * slob_block has a field 'units', which indicates size of block if +ve, * or offset of next block if -ve (in SLOB_UNITs). @@ -92,36 +93,6 @@ struct slob_block { typedef struct slob_block slob_t; /* - * We use struct page fields to manage some slob allocation aspects, - * however to avoid the horrible mess in include/linux/mm_types.h, we'll - * just define our own struct page type variant here. - */ -struct slob_page { - union { - struct { - unsigned long flags; /* mandatory */ - atomic_t _count; /* mandatory */ - slobidx_t units; /* free units left in page */ - unsigned long pad[2]; - slob_t *free; /* first free slob_t in page */ - struct list_head list; /* linked list of free pages */ - }; - struct page page; - }; -}; -static inline void struct_slob_page_wrong_size(void) -{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); } - -/* - * free_slob_page: call before a slob_page is returned to the page allocator. - */ -static inline void free_slob_page(struct slob_page *sp) -{ - reset_page_mapcount(&sp->page); - sp->page.mapping = NULL; -} - -/* * All partially free slob pages go on these lists. */ #define SLOB_BREAK1 256 @@ -131,51 +102,27 @@ static LIST_HEAD(free_slob_medium); static LIST_HEAD(free_slob_large); /* - * is_slob_page: True for all slob pages (false for bigblock pages) - */ -static inline int is_slob_page(struct slob_page *sp) -{ - return PageSlab((struct page *)sp); -} - -static inline void set_slob_page(struct slob_page *sp) -{ - __SetPageSlab((struct page *)sp); -} - -static inline void clear_slob_page(struct slob_page *sp) -{ - __ClearPageSlab((struct page *)sp); -} - -static inline struct slob_page *slob_page(const void *addr) -{ - return (struct slob_page *)virt_to_page(addr); -} - -/* * slob_page_free: true for pages on free_slob_pages list. 
*/ -static inline int slob_page_free(struct slob_page *sp) +static inline int slob_page_free(struct page *sp) { - return PageSlobFree((struct page *)sp); + return PageSlobFree(sp); } -static void set_slob_page_free(struct slob_page *sp, struct list_head *list) +static void set_slob_page_free(struct page *sp, struct list_head *list) { - list_add(&sp->list, list); - __SetPageSlobFree((struct page *)sp); + list_add(&sp->lru, list); + __SetPageSlobFree(sp); } -static inline void clear_slob_page_free(struct slob_page *sp) +static inline void clear_slob_page_free(struct page *sp) { - list_del(&sp->list); - __ClearPageSlobFree((struct page *)sp); + list_del(&sp->lru); + __ClearPageSlobFree(sp); } #define SLOB_UNIT sizeof(slob_t) -#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) -#define SLOB_ALIGN L1_CACHE_BYTES +#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT) /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which @@ -245,7 +192,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) void *page; #ifdef CONFIG_NUMA - if (node != -1) + if (node != NUMA_NO_NODE) page = alloc_pages_exact_node(node, gfp, order); else #endif @@ -267,12 +214,12 @@ static void slob_free_pages(void *b, int order) /* * Allocate a slob block within a given slob_page sp. */ -static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) +static void *slob_page_alloc(struct page *sp, size_t size, int align) { slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); - for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { + for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { slobidx_t avail = slob_units(cur); if (align) { @@ -296,12 +243,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) if (prev) set_slob(prev, slob_units(prev), next); else - sp->free = next; + sp->freelist = next; } else { /* fragment */ if (prev) set_slob(prev, slob_units(prev), cur + units); else - sp->free = cur + units; + sp->freelist = cur + units; set_slob(cur + units, avail - units, next); } @@ -320,7 +267,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) */ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) { - struct slob_page *sp; + struct page *sp; struct list_head *prev; struct list_head *slob_list; slob_t *b = NULL; @@ -335,13 +282,13 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, slob_list, list) { + list_for_each_entry(sp, slob_list, lru) { #ifdef CONFIG_NUMA /* * If there's a node specification, search for a partial * page with a matching node id in the freelist. */ - if (node != -1 && page_to_nid(&sp->page) != node) + if (node != NUMA_NO_NODE && page_to_nid(sp) != node) continue; #endif /* Enough room on this page? 
*/ @@ -349,7 +296,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) continue; /* Attempt to alloc */ - prev = sp->list.prev; + prev = sp->lru.prev; b = slob_page_alloc(sp, size, align); if (!b) continue; @@ -369,13 +316,13 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); if (!b) return NULL; - sp = slob_page(b); - set_slob_page(sp); + sp = virt_to_page(b); + __SetPageSlab(sp); spin_lock_irqsave(&slob_lock, flags); sp->units = SLOB_UNITS(PAGE_SIZE); - sp->free = b; - INIT_LIST_HEAD(&sp->list); + sp->freelist = b; + INIT_LIST_HEAD(&sp->lru); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); b = slob_page_alloc(sp, size, align); @@ -392,7 +339,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) */ static void slob_free(void *block, int size) { - struct slob_page *sp; + struct page *sp; slob_t *prev, *next, *b = (slob_t *)block; slobidx_t units; unsigned long flags; @@ -402,7 +349,7 @@ static void slob_free(void *block, int size) return; BUG_ON(!size); - sp = slob_page(block); + sp = virt_to_page(block); units = SLOB_UNITS(size); spin_lock_irqsave(&slob_lock, flags); @@ -412,8 +359,8 @@ static void slob_free(void *block, int size) if (slob_page_free(sp)) clear_slob_page_free(sp); spin_unlock_irqrestore(&slob_lock, flags); - clear_slob_page(sp); - free_slob_page(sp); + __ClearPageSlab(sp); + page_mapcount_reset(sp); slob_free_pages(b, 0); return; } @@ -421,7 +368,7 @@ static void slob_free(void *block, int size) if (!slob_page_free(sp)) { /* This slob page is about to become partially free. Easy! */ sp->units = units; - sp->free = b; + sp->freelist = b; set_slob(b, units, (void *)((unsigned long)(b + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); @@ -441,15 +388,15 @@ static void slob_free(void *block, int size) */ sp->units += units; - if (b < sp->free) { - if (b + units == sp->free) { - units += slob_units(sp->free); - sp->free = slob_next(sp->free); + if (b < (slob_t *)sp->freelist) { + if (b + units == sp->freelist) { + units += slob_units(sp->freelist); + sp->freelist = slob_next(sp->freelist); } - set_slob(b, units, sp->free); - sp->free = b; + set_slob(b, units, sp->freelist); + sp->freelist = b; } else { - prev = sp->free; + prev = sp->freelist; next = slob_next(prev); while (b > next) { prev = next; @@ -476,10 +423,11 @@ out: * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. 
*/ -void *__kmalloc_node(size_t size, gfp_t gfp, int node) +static __always_inline void * +__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) { unsigned int *m; - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; gfp &= gfp_allowed_mask; @@ -497,7 +445,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) *m = size; ret = (void *)m + align; - trace_kmalloc_node(_RET_IP_, ret, + trace_kmalloc_node(caller, ret, size, size + align, gfp, node); } else { unsigned int order = get_order(size); @@ -505,24 +453,39 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) if (likely(order)) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - if (ret) { - struct page *page; - page = virt_to_page(ret); - page->private = size; - } - trace_kmalloc_node(_RET_IP_, ret, + trace_kmalloc_node(caller, ret, size, PAGE_SIZE << order, gfp, node); } kmemleak_alloc(ret, size, 1, gfp); return ret; } -EXPORT_SYMBOL(__kmalloc_node); + +void *__kmalloc(size_t size, gfp_t gfp) +{ + return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +#ifdef CONFIG_TRACING +void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) +{ + return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); +} + +#ifdef CONFIG_NUMA +void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, gfp, node, caller); +} +#endif +#endif void kfree(const void *block) { - struct slob_page *sp; + struct page *sp; trace_kfree(_RET_IP_, block); @@ -530,83 +493,48 @@ void kfree(const void *block) return; kmemleak_free(block); - sp = slob_page(block); - if (is_slob_page(sp)) { - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + sp = virt_to_page(block); + if (PageSlab(sp)) { + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); } else - put_page(&sp->page); + __free_pages(sp, compound_order(sp)); } EXPORT_SYMBOL(kfree); /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ size_t ksize(const void *block) { - struct slob_page *sp; + struct page *sp; + int align; + unsigned int *m; BUG_ON(!block); if (unlikely(block == ZERO_SIZE_PTR)) return 0; - sp = slob_page(block); - if (is_slob_page(sp)) { - int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); - unsigned int *m = (unsigned int *)(block - align); - return SLOB_UNITS(*m) * SLOB_UNIT; - } else - return sp->page.private; -} -EXPORT_SYMBOL(ksize); - -struct kmem_cache { - unsigned int size, align; - unsigned long flags; - const char *name; - void (*ctor)(void *); -}; - -struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) -{ - struct kmem_cache *c; - - c = slob_alloc(sizeof(struct kmem_cache), - GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1); + sp = virt_to_page(block); + if (unlikely(!PageSlab(sp))) + return PAGE_SIZE << compound_order(sp); - if (c) { - c->name = name; - c->size = size; - if (flags & SLAB_DESTROY_BY_RCU) { - /* leave room for rcu footer at the end of object */ - c->size += sizeof(struct slob_rcu); - } - c->flags = flags; - c->ctor = ctor; - /* ignore alignment unless it's forced */ - c->align = (flags & SLAB_HWCACHE_ALIGN) ? 
SLOB_ALIGN : 0; - if (c->align < ARCH_SLAB_MINALIGN) - c->align = ARCH_SLAB_MINALIGN; - if (c->align < align) - c->align = align; - } else if (flags & SLAB_PANIC) - panic("Cannot create slab cache %s\n", name); - - kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); - return c; + align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; } -EXPORT_SYMBOL(kmem_cache_create); +EXPORT_SYMBOL(ksize); -void kmem_cache_destroy(struct kmem_cache *c) +int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - kmemleak_free(c); - if (c->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - slob_free(c, sizeof(struct kmem_cache)); + if (flags & SLAB_DESTROY_BY_RCU) { + /* leave room for rcu footer at the end of object */ + c->size += sizeof(struct slob_rcu); + } + c->flags = flags; + return 0; } -EXPORT_SYMBOL(kmem_cache_destroy); -void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) +void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; @@ -616,23 +544,43 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, PAGE_SIZE << get_order(c->size), flags, node); } - if (c->ctor) + if (b && c->ctor) c->ctor(b); kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; } +EXPORT_SYMBOL(slob_alloc_node); + +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + return slob_alloc_node(cachep, flags, NUMA_NO_NODE); +} +EXPORT_SYMBOL(kmem_cache_alloc); + +#ifdef CONFIG_NUMA +void *__kmalloc_node(size_t size, gfp_t gfp, int node) +{ + return __do_kmalloc_node(size, gfp, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node) +{ + return slob_alloc_node(cachep, gfp, node); +} EXPORT_SYMBOL(kmem_cache_alloc_node); +#endif static void __kmem_cache_free(void *b, int size) { @@ -666,11 +614,11 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); -unsigned int kmem_cache_size(struct kmem_cache *c) +int __kmem_cache_shutdown(struct kmem_cache *c) { - return c->size; + /* No way to check for remaining objects */ + return 0; } -EXPORT_SYMBOL(kmem_cache_size); int kmem_cache_shrink(struct kmem_cache *d) { @@ -678,19 +626,20 @@ int kmem_cache_shrink(struct kmem_cache *d) } EXPORT_SYMBOL(kmem_cache_shrink); -static unsigned int slob_ready __read_mostly; - -int slab_is_available(void) -{ - return slob_ready; -} +struct kmem_cache kmem_cache_boot = { + .name = "kmem_cache", + .size = sizeof(struct kmem_cache), + .flags = SLAB_PANIC, + .align = ARCH_KMALLOC_MINALIGN, +}; void __init kmem_cache_init(void) { - slob_ready = 1; + kmem_cache = &kmem_cache_boot; + slab_state = UP; } void __init kmem_cache_init_late(void) { - /* Nothing to do */ + slab_state = FULL; } diff --git a/mm/slub.c b/mm/slub.c index 80848cd3901c..2b1ce697fc4b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,7 +16,9 @@ #include <linux/interrupt.h> #include <linux/bitops.h> #include <linux/slab.h> +#include "slab.h" #include <linux/proc_fs.h> +#include <linux/notifier.h> 
#include <linux/seq_file.h> #include <linux/kmemcheck.h> #include <linux/cpu.h> @@ -30,18 +32,21 @@ #include <linux/fault-inject.h> #include <linux/stacktrace.h> #include <linux/prefetch.h> +#include <linux/memcontrol.h> #include <trace/events/kmem.h> +#include "internal.h" + /* * Lock order: - * 1. slub_lock (Global Semaphore) + * 1. slab_mutex (Global Mutex) * 2. node->list_lock * 3. slab_lock(page) (Only on some arches and for debugging) * - * slub_lock + * slab_mutex * - * The role of the slub_lock is to protect the list of all the slabs + * The role of the slab_mutex is to protect the list of all the slabs * and to synchronize major metadata changes to slab cache structures. * * The slab_lock is only used for debugging and on arches that do not @@ -109,9 +114,6 @@ * the fast path and disables lockless freelists. */ -#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) - static inline int kmem_cache_debug(struct kmem_cache *s) { #ifdef CONFIG_SLUB_DEBUG @@ -121,6 +123,15 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + return !kmem_cache_debug(s); +#else + return false; +#endif +} + /* * Issues still to be resolved: * @@ -144,7 +155,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* * Maximum number of desirable partial slabs. * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 @@ -176,23 +187,10 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define __OBJECT_POISON 0x80000000UL /* Poison object */ #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ -static int kmem_size = sizeof(struct kmem_cache); - #ifdef CONFIG_SMP static struct notifier_block slab_notifier; #endif -static enum { - DOWN, /* No slab functionality available */ - PARTIAL, /* Kmem_cache_node works */ - UP, /* Everything works but does not show up in sysfs */ - SYSFS /* Sysfs up */ -} slab_state = DOWN; - -/* A list of all slab caches on the system */ -static DECLARE_RWSEM(slub_lock); -static LIST_HEAD(slab_caches); - /* * Tracking user of a slab. */ @@ -212,24 +210,22 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; #ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void sysfs_slab_remove(struct kmem_cache *); - +static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) -{ - kfree(s->name); - kfree(s); -} - +static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - __this_cpu_inc(s->cpu_slab->stat[si]); + /* + * The rmw is racy on a preemptible kernel but this is acceptable, so + * avoid this_cpu_add()'s irq-disable overhead. 
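The new stat() comment above spells out the trade-off: on a preemptible kernel the read-modify-write can race (a task can be preempted mid-increment and migrated to another cpu), but losing the odd statistics update is acceptable, so raw_cpu_inc() skips the irq-disabling that this_cpu_add() would pay for. Roughly the same bargain in userspace terms: per-thread counters bumped with plain stores, summed without synchronization; a reader can see a slightly stale total, which is fine for statistics. Illustrative only:

#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4
#define NITER    1000000

/* One counter per thread: increments are plain (no lock, no atomic),
 * mirroring a per-cpu counter bumped with an unguarded read-modify-write. */
static unsigned long counts[NTHREADS];

static void *worker(void *arg)
{
	unsigned long *c = arg;
	for (int i = 0; i < NITER; i++)
		(*c)++;
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];
	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, worker, &counts[i]);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);

	unsigned long sum = 0;
	for (int i = 0; i < NTHREADS; i++)
		sum += counts[i];
	/* A total taken mid-run could be stale; acceptable for statistics. */
	printf("events: %lu\n", sum);
	return 0;
}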
+ */ + raw_cpu_inc(s->cpu_slab->stat[si]); #endif } @@ -237,11 +233,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -int slab_is_available(void) -{ - return slab_state >= UP; -} - static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { return s->node[node]; @@ -311,7 +302,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * and whatever may come after it. */ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; + return s->object_size; #endif /* @@ -365,6 +356,21 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } +static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) +{ + struct page tmp; + tmp.counters = counters_new; + /* + * page->counters can cover frozen/inuse/objects as well + * as page->_count. If we assign to ->counters directly + * we run the risk of losing updates to page->_count, so + * be careful and only assign to the fields we need. + */ + page->frozen = tmp.frozen; + page->inuse = tmp.inuse; + page->objects = tmp.objects; +} + /* Interrupts must be disabled (for the fallback code to work right) */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, @@ -383,9 +389,10 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page #endif { slab_lock(page); - if (page->freelist == freelist_old && page->counters == counters_old) { + if (page->freelist == freelist_old && + page->counters == counters_old) { page->freelist = freelist_new; - page->counters = counters_new; + set_page_slub_counters(page, counters_new); slab_unlock(page); return 1; } @@ -421,9 +428,10 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, local_irq_save(flags); slab_lock(page); - if (page->freelist == freelist_old && page->counters == counters_old) { + if (page->freelist == freelist_old && + page->counters == counters_old) { page->freelist = freelist_new; - page->counters = counters_new; + set_page_slub_counters(page, counters_new); slab_unlock(page); local_irq_restore(flags); return 1; @@ -563,8 +571,9 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + printk(KERN_ERR + "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); } @@ -581,6 +590,8 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); printk(KERN_ERR "----------------------------------------" "-------------------------------------\n\n"); + + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
@@ -609,11 +620,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); - print_section("Object ", p, min_t(unsigned long, s->objsize, + print_section("Object ", p, min_t(unsigned long, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone ", p + s->objsize, - s->inuse - s->objsize); + print_section("Redzone ", p + s->object_size, + s->inuse - s->object_size); if (s->offset) off = s->offset + sizeof(void *); @@ -637,7 +648,8 @@ static void object_err(struct kmem_cache *s, struct page *page, print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) +static void slab_err(struct kmem_cache *s, struct page *page, + const char *fmt, ...) { va_list args; char buf[100]; @@ -655,12 +667,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) u8 *p = object; if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->objsize - 1); - p[s->objsize - 1] = POISON_END; + memset(p, POISON_FREE, s->object_size - 1); + p[s->object_size - 1] = POISON_END; } if (s->flags & SLAB_RED_ZONE) - memset(p + s->objsize, val, s->inuse - s->objsize); + memset(p + s->object_size, val, s->inuse - s->object_size); } static void restore_bytes(struct kmem_cache *s, char *message, u8 data, @@ -705,10 +717,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * Poisoning uses 0x6b (POISON_FREE) and the last byte is * 0xa5 (POISON_END) * - * object + s->objsize + * object + s->object_size * Padding to reach word boundary. This is also used for Redzoning. * Padding is extended by another word if Redzoning is enabled and - * objsize == inuse. + * object_size == inuse. * * We fill with 0xbb (RED_INACTIVE) for inactive objects and with * 0xcc (RED_ACTIVE) for objects in use. @@ -727,7 +739,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * object + s->size * Nothing is used beyond s->size. * - * If slabcaches are merged then the objsize and inuse boundaries are mostly + * If slabcaches are merged then the object_size and inuse boundaries are mostly * ignored. And therefore no slab options that rely on these boundaries * may be used with merged slabcaches. */ @@ -787,25 +799,26 @@ static int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { u8 *p = object; - u8 *endobject = object + s->objsize; + u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, page, object, "Redzone", - endobject, val, s->inuse - s->objsize)) + endobject, val, s->inuse - s->object_size)) return 0; } else { - if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { + if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { check_bytes_and_report(s, page, p, "Alignment padding", - endobject, POISON_INUSE, s->inuse - s->objsize); + endobject, POISON_INUSE, + s->inuse - s->object_size); } } if (s->flags & SLAB_POISON) { if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && (!check_bytes_and_report(s, page, p, "Poison", p, - POISON_FREE, s->objsize - 1) || + POISON_FREE, s->object_size - 1) || !check_bytes_and_report(s, page, p, "Poison", - p + s->objsize - 1, POISON_END, 1))) + p + s->object_size - 1, POISON_END, 1))) return 0; /* * check_pad_bytes cleans up on its own. 
@@ -881,7 +894,6 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) object_err(s, page, object, "Freechain corrupt"); set_freepointer(s, object, NULL); - break; } else { slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; @@ -926,7 +938,8 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, page->freelist); if (!alloc) - print_section("Object ", (void *)object, s->objsize); + print_section("Object ", (void *)object, + s->object_size); dump_stack(); } @@ -936,20 +949,31 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); might_sleep_if(flags & __GFP_WAIT); - return should_failslab(s->objsize, flags, s->flags); + return should_failslab(s->object_size, flags, s->flags); } -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) +static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) { flags &= gfp_allowed_mask; kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void *x) @@ -957,7 +981,7 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) kmemleak_free_recursive(x, s->flags); /* - * Trouble is that we may no longer disable interupts in the fast path + * Trouble is that we may no longer disable interrupts in the fast path * So in order to make the debug calls that expect irqs to be * disabled we need to disable interrupts temporarily. */ @@ -966,19 +990,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) unsigned long flags; local_irq_save(flags); - kmemcheck_slab_free(s, x, s->objsize); - debug_check_no_locks_freed(x, s->objsize); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); local_irq_restore(flags); } #endif if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->objsize); + debug_check_no_obj_freed(x, s->object_size); } /* * Tracking of fully allocated slabs for debugging purposes. - * - * list_lock must be held. */ static void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) @@ -986,17 +1008,16 @@ static void add_full(struct kmem_cache *s, if (!(s->flags & SLAB_STORE_USER)) return; + lockdep_assert_held(&n->list_lock); list_add(&page->lru, &n->full); } -/* - * list_lock must be held. - */ -static void remove_full(struct kmem_cache *s, struct page *page) +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { if (!(s->flags & SLAB_STORE_USER)) return; + lockdep_assert_held(&n->list_lock); list_del(&page->lru); } @@ -1023,7 +1044,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) * dilemma by deferring the increment of the count during * bootstrap (see early_kmem_cache_node_alloc). 
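One small but telling change in the hunk above: the "list_lock must be held" comments on add_full()/remove_full() become lockdep_assert_held() calls, turning a documentation convention into a runtime-checked invariant. Outside the kernel, where there is no lockdep, the same idea can be approximated with an owner-tracking wrapper (a sketch, all names hypothetical):

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct checked_mutex {
	pthread_mutex_t m;
	pthread_t owner;
	bool held;
};

static void cm_lock(struct checked_mutex *cm)
{
	pthread_mutex_lock(&cm->m);
	cm->owner = pthread_self();
	cm->held = true;
}

static void cm_unlock(struct checked_mutex *cm)
{
	cm->held = false;
	pthread_mutex_unlock(&cm->m);
}

/* The analogue of lockdep_assert_held(): blow up if the caller lied. */
static void cm_assert_held(struct checked_mutex *cm)
{
	assert(cm->held && pthread_equal(cm->owner, pthread_self()));
}

static struct checked_mutex list_lock = { .m = PTHREAD_MUTEX_INITIALIZER };

static void add_full(void)
{
	cm_assert_held(&list_lock);	/* caller must hold list_lock */
	/* ... list_add(...) ... */
}

int main(void)
{
	cm_lock(&list_lock);
	add_full();			/* fine: lock is held */
	cm_unlock(&list_lock);
	printf("ok\n");
	return 0;
}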
*/ - if (n) { + if (likely(n)) { atomic_long_inc(&n->nr_slabs); atomic_long_add(objects, &n->total_objects); } @@ -1047,7 +1068,8 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } -static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) @@ -1082,13 +1104,13 @@ bad: return 0; } -static noinline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) +static noinline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) { - unsigned long flags; - int rc = 0; + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - local_irq_save(flags); + spin_lock_irqsave(&n->list_lock, *flags); slab_lock(page); if (!check_slab(s, page)) @@ -1107,11 +1129,11 @@ static noinline int free_debug_processing(struct kmem_cache *s, if (!check_object(s, page, object, SLUB_RED_ACTIVE)) goto out; - if (unlikely(s != page->slab)) { + if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); - } else if (!page->slab) { + } else if (!page->slab_cache) { printk(KERN_ERR "SLUB <none>: no slab for object 0x%p.\n", object); @@ -1126,15 +1148,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); - rc = 1; out: slab_unlock(page); - local_irq_restore(flags); - return rc; + /* + * Keep node_lock to preserve integrity + * until the object is actually freed + */ + return n; fail: + slab_unlock(page); + spin_unlock_irqrestore(&n->list_lock, *flags); slab_fix(s, "Object at 0x%p not freed", object); - goto out; + return NULL; } static int __init setup_slub_debug(char *str) @@ -1207,15 +1233,15 @@ out: __setup("slub_debug", setup_slub_debug); -static unsigned long kmem_cache_flags(unsigned long objsize, +static unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { /* * Enable debugging if selected on the kernel commandline. 
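kmem_cache_flags(), in the next hunk, gates the boot-time debug flags on an optional cache-name prefix match; note the new guard against a NULL name. A toy parser in that spirit follows; the option grammar is deliberately simplified and is not the real slub_debug syntax:

#include <stdio.h>
#include <string.h>

static unsigned long debug_flags;     /* parsed global flags */
static char debug_slabs[64];          /* optional cache-name filter */

/* Parse "FZP,dentry": letters select flags, an optional ",name" restricts them. */
static void parse_debug_opt(const char *str)
{
	for (; *str && *str != ','; str++)
		switch (*str) {
		case 'F': debug_flags |= 1UL << 0; break;  /* sanity checks */
		case 'Z': debug_flags |= 1UL << 1; break;  /* red zoning   */
		case 'P': debug_flags |= 1UL << 2; break;  /* poisoning    */
		}
	if (*str == ',')
		snprintf(debug_slabs, sizeof(debug_slabs), "%s", str + 1);
}

static unsigned long cache_flags(const char *name, unsigned long flags)
{
	/* Apply global debug flags unless a filter names another cache;
	 * guard against a NULL name, as the patched kmem_cache_flags() does. */
	if (debug_flags && (!debug_slabs[0] ||
	    (name && !strncmp(debug_slabs, name, strlen(debug_slabs)))))
		flags |= debug_flags;
	return flags;
}

int main(void)
{
	parse_debug_opt("FZ,dentry");
	printf("dentry: %#lx  inode: %#lx\n",
	       cache_flags("dentry", 0), cache_flags("inode", 0));
	return 0;
}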
*/ - if (slub_debug && (!slub_debug_slabs || - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) + if (slub_debug && (!slub_debug_slabs || (name && + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) flags |= slub_debug; return flags; @@ -1227,8 +1253,9 @@ static inline void setup_object_debug(struct kmem_cache *s, static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } -static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) { return 0; } +static inline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) { return NULL; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -1236,8 +1263,9 @@ static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -static inline void remove_full(struct kmem_cache *s, struct page *page) {} -static inline unsigned long kmem_cache_flags(unsigned long objsize, +static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} +static inline unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -1256,13 +1284,30 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { return 0; } static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) {} + void *object) +{ + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, + flags & gfp_allowed_mask); +} -static inline void slab_free_hook(struct kmem_cache *s, void *x) {} +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); +} #endif /* CONFIG_SLUB_DEBUG */ @@ -1304,27 +1349,22 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; + alloc_gfp = flags; /* * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - page = alloc_slab_page(flags, node, oo); + page = alloc_slab_page(alloc_gfp, node, oo); if (page) stat(s, ORDER_FALLBACK); } - if (flags & __GFP_WAIT) - local_irq_disable(); - - if (!page) - return NULL; - - if (kmemcheck_enabled + if (kmemcheck_enabled && page && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); - kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); /* * Objects from caches that have a constructor don't get @@ -1336,6 +1376,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } + if (flags & __GFP_WAIT) + local_irq_disable(); + if (!page) + return NULL; + page->objects = oo_objects(oo); mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
@@ -1359,6 +1404,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *start; void *last; void *p; + int order; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1367,14 +1413,18 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) goto out; + order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); - page->slab = s; - page->flags |= 1 << PG_slab; + memcg_bind_pages(s, order); + page->slab_cache = s; + __SetPageSlab(page); + if (page->pfmemalloc) + SetPageSlabPfmemalloc(page); start = page_address(page); if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); + memset(start, POISON_INUSE, PAGE_SIZE << order); last = start; for_each_object(p, s, start, page->objects) { @@ -1413,11 +1463,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); + __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - reset_page_mapcount(page); + + memcg_release_pages(s, order); + page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); + __free_memcg_kmem_pages(page, order); } #define need_reserve_slab_rcu \ @@ -1432,7 +1485,7 @@ static void rcu_free_slab(struct rcu_head *h) else page = container_of((struct list_head *)h, struct page, lru); - __free_slab(page->slab, page); + __free_slab(page->slab_cache, page); } static void free_slab(struct kmem_cache *s, struct page *page) @@ -1466,11 +1519,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page) /* * Management of partially allocated slabs. - * - * list_lock must be held. */ -static inline void add_partial(struct kmem_cache_node *n, - struct page *page, int tail) +static inline void +__add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; if (tail == DEACTIVATE_TO_TAIL) @@ -1479,66 +1530,86 @@ static inline void add_partial(struct kmem_cache_node *n, list_add(&page->lru, &n->partial); } -/* - * list_lock must be held. - */ -static inline void remove_partial(struct kmem_cache_node *n, - struct page *page) +static inline void add_partial(struct kmem_cache_node *n, + struct page *page, int tail) +{ + lockdep_assert_held(&n->list_lock); + __add_partial(n, page, tail); +} + +static inline void +__remove_partial(struct kmem_cache_node *n, struct page *page) { list_del(&page->lru); n->nr_partial--; } +static inline void remove_partial(struct kmem_cache_node *n, + struct page *page) +{ + lockdep_assert_held(&n->list_lock); + __remove_partial(n, page); +} + /* - * Lock slab, remove from the partial list and put the object into the - * per cpu freelist. + * Remove slab from the partial list, freeze it and + * return the pointer to the freelist. * * Returns a list of objects or NULL if it fails. - * - * Must hold list_lock. */ static inline void *acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page, - int mode) + int mode, int *objects) { void *freelist; unsigned long counters; struct page new; + lockdep_assert_held(&n->list_lock); + /* * Zap the freelist and set the frozen bit. * The old freelist is the list of objects for the * per cpu allocation list. 
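acquire_slab(), whose body follows, claims a page by swapping (freelist, counters) as a single atomic unit with __cmpxchg_double_slab(); carrying the counters along is what makes the swap safe against concurrent frees changing the page underneath. The pattern can be sketched on a single 64-bit word by packing an index together with a sequence number (illustrative only; the kernel uses a true double-word cmpxchg on pointer-sized fields):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Pack a 32-bit freelist index and a 32-bit state/sequence word together
 * so one compare-and-swap covers both, like page->freelist + page->counters. */
static _Atomic uint64_t slot;

#define PACK(idx, seq)  (((uint64_t)(idx) << 32) | (seq))
#define IDX(v)          ((uint32_t)((v) >> 32))
#define SEQ(v)          ((uint32_t)(v))

/* "Freeze" the slot: zap the index and bump the sequence atomically. */
static int acquire(uint32_t *out_idx)
{
	uint64_t old = atomic_load(&slot);
	uint64_t new = PACK(0, SEQ(old) + 1);

	if (!atomic_compare_exchange_strong(&slot, &old, new))
		return 0;	/* raced with someone else; caller retries or moves on */
	*out_idx = IDX(old);
	return 1;
}

int main(void)
{
	atomic_store(&slot, PACK(42, 7));
	uint32_t idx;
	if (acquire(&idx))
		printf("claimed freelist head %u, seq now %u\n",
		       idx, SEQ(atomic_load(&slot)));
	return 0;
}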
*/ - do { - freelist = page->freelist; - counters = page->counters; - new.counters = counters; - if (mode) - new.inuse = page->objects; + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + *objects = new.objects - new.inuse; + if (mode) { + new.inuse = page->objects; + new.freelist = NULL; + } else { + new.freelist = freelist; + } - VM_BUG_ON(new.frozen); - new.frozen = 1; + VM_BUG_ON(new.frozen); + new.frozen = 1; - } while (!__cmpxchg_double_slab(s, page, + if (!__cmpxchg_double_slab(s, page, freelist, counters, - NULL, new.counters, - "lock and freeze")); + new.freelist, new.counters, + "acquire_slab")) + return NULL; remove_partial(n, page); + WARN_ON(!freelist); return freelist; } -static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. */ -static void *get_partial_node(struct kmem_cache *s, - struct kmem_cache_node *n, struct kmem_cache_cpu *c) +static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + struct kmem_cache_cpu *c, gfp_t flags) { struct page *page, *page2; void *object = NULL; + int available = 0; + int objects; /* * Racy check. If we mistakenly see no partial slabs then we @@ -1551,24 +1622,26 @@ static void *get_partial_node(struct kmem_cache *s, spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, lru) { - void *t = acquire_slab(s, n, page, object == NULL); - int available; + void *t; + + if (!pfmemalloc_match(page, flags)) + continue; + t = acquire_slab(s, n, page, object == NULL, &objects); if (!t) break; + available += objects; if (!object) { c->page = page; - c->node = page_to_nid(page); stat(s, ALLOC_FROM_PARTIAL); object = t; - available = page->objects - page->inuse; } else { - page->freelist = t; - available = put_cpu_partial(s, page, 0); + put_cpu_partial(s, page, 0); stat(s, CPU_PARTIAL_NODE); } - if (kmem_cache_debug(s) || available > s->cpu_partial / 2) + if (!kmem_cache_has_cpu_partial(s) + || available > s->cpu_partial / 2) break; } @@ -1579,7 +1652,7 @@ static void *get_partial_node(struct kmem_cache *s, /* * Get a page from somewhere. Search in increasing NUMA distances. */ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, +static void *get_any_partial(struct kmem_cache *s, gfp_t flags, struct kmem_cache_cpu *c) { #ifdef CONFIG_NUMA @@ -1613,8 +1686,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, return NULL; do { - cpuset_mems_cookie = get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1622,22 +1695,20 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c); + object = get_partial_node(s, n, c, flags); if (object) { /* - * Return the object even if - * put_mems_allowed indicated that - * the cpuset mems_allowed was - * updated in parallel. It's a - * harmless race between the alloc - * and the cpuset update. 
+ * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update */ - put_mems_allowed(cpuset_mems_cookie); return object; } } } - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); #endif return NULL; } @@ -1651,7 +1722,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, void *object; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - object = get_partial_node(s, get_node(s, searchnode), c); + object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) return object; @@ -1717,7 +1788,7 @@ static inline void note_cmpxchg_failure(const char *n, stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } -void init_kmem_cache_cpus(struct kmem_cache *s) +static void init_kmem_cache_cpus(struct kmem_cache *s) { int cpu; @@ -1728,14 +1799,13 @@ void init_kmem_cache_cpus(struct kmem_cache *s) /* * Remove the cpu slab */ -static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) +static void deactivate_slab(struct kmem_cache *s, struct page *page, + void *freelist) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; - struct page *page = c->page; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); int lock = 0; enum slab_modes l = M_NONE, m = M_NONE; - void *freelist; void *nextfree; int tail = DEACTIVATE_TO_HEAD; struct page new; @@ -1746,11 +1816,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) tail = DEACTIVATE_TO_TAIL; } - c->tid = next_tid(c->tid); - c->page = NULL; - freelist = c->freelist; - c->freelist = NULL; - /* * Stage one: Free all available per cpu objects back * to the page freelist while it is still frozen. Leave the @@ -1844,7 +1909,7 @@ redo: else if (l == M_FULL) - remove_full(s, page); + remove_full(s, n, page); if (m == M_PARTIAL) { @@ -1876,21 +1941,34 @@ redo: } } -/* Unfreeze all the cpu partial slabs */ -static void unfreeze_partials(struct kmem_cache *s) +/* + * Unfreeze all the cpu partial slabs. + * + * This function must be called with interrupts disabled + * for the cpu using c (or some other guarantee must be there + * to guarantee no concurrent accesses). 
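The rewritten unfreeze_partials() body in the next hunk walks the per-cpu partial list and takes each node's list_lock lazily, unlocking and relocking only when the walk crosses to a page on a different node, instead of bouncing a lock per page. That lock-coarsening pattern in isolation (hypothetical names):

#include <pthread.h>
#include <stdio.h>

#define NNODES 2

struct item { int node; struct item *next; };

static pthread_mutex_t node_lock[NNODES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Process a mixed-node list, re-locking only on node changes,
 * as unfreeze_partials() does with kmem_cache_node->list_lock. */
static void drain(struct item *head)
{
	int locked = -1;

	for (struct item *it = head; it; it = it->next) {
		if (it->node != locked) {
			if (locked >= 0)
				pthread_mutex_unlock(&node_lock[locked]);
			locked = it->node;
			pthread_mutex_lock(&node_lock[locked]);
		}
		printf("draining item on node %d\n", it->node);
	}
	if (locked >= 0)
		pthread_mutex_unlock(&node_lock[locked]);
}

int main(void)
{
	struct item c = { 1, NULL }, b = { 1, &c }, a = { 0, &b };
	drain(&a);	/* locks node 0 once, then node 1 once, not per item */
	return 0;
}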
+ */ +static void unfreeze_partials(struct kmem_cache *s, + struct kmem_cache_cpu *c) { - struct kmem_cache_node *n = NULL; - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct kmem_cache_node *n = NULL, *n2 = NULL; struct page *page, *discard_page = NULL; while ((page = c->partial)) { - enum slab_modes { M_PARTIAL, M_FREE }; - enum slab_modes l, m; struct page new; struct page old; c->partial = page->next; - l = M_FREE; + + n2 = get_node(s, page_to_nid(page)); + if (n != n2) { + if (n) + spin_unlock(&n->list_lock); + + n = n2; + spin_lock(&n->list_lock); + } do { @@ -1903,43 +1981,17 @@ static void unfreeze_partials(struct kmem_cache *s) new.frozen = 0; - if (!new.inuse && (!n || n->nr_partial > s->min_partial)) - m = M_FREE; - else { - struct kmem_cache_node *n2 = get_node(s, - page_to_nid(page)); - - m = M_PARTIAL; - if (n != n2) { - if (n) - spin_unlock(&n->list_lock); - - n = n2; - spin_lock(&n->list_lock); - } - } - - if (l != m) { - if (l == M_PARTIAL) { - remove_partial(n, page); - stat(s, FREE_REMOVE_PARTIAL); - } else { - add_partial(n, page, - DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } - - l = m; - } - - } while (!cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, page, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")); - if (m == M_FREE) { + if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { page->next = discard_page; discard_page = page; + } else { + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); } } @@ -1954,6 +2006,7 @@ static void unfreeze_partials(struct kmem_cache *s) discard_slab(s, page); stat(s, FREE_SLAB); } +#endif } /* @@ -1965,8 +2018,9 @@ static void unfreeze_partials(struct kmem_cache *s) * If we did not find a slot then simply move all the partials to the * per node partial list. */ -int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { +#ifdef CONFIG_SLUB_CPU_PARTIAL struct page *oldpage; int pages; int pobjects; @@ -1986,8 +2040,9 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) * set to the per node partial list. */ local_irq_save(flags); - unfreeze_partials(s); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); local_irq_restore(flags); + oldpage = NULL; pobjects = 0; pages = 0; stat(s, CPU_PARTIAL_DRAIN); @@ -2001,14 +2056,19 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) page->pobjects = pobjects; page->next = oldpage; - } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); - return pobjects; + } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) + != oldpage); +#endif } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c); + deactivate_slab(s, c->page, c->freelist); + + c->tid = next_tid(c->tid); + c->page = NULL; + c->freelist = NULL; } /* @@ -2024,7 +2084,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) if (c->page) flush_slab(s, c); - unfreeze_partials(s); + unfreeze_partials(s, c); } } @@ -2052,10 +2112,10 @@ static void flush_all(struct kmem_cache *s) * Check if the objects in a per cpu structure fit numa * locality expectations. 
*/ -static inline int node_match(struct kmem_cache_cpu *c, int node) +static inline int node_match(struct page *page, int node) { #ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && c->node != node) + if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) return 0; #endif return 1; @@ -2098,10 +2158,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", nid, gfpflags); printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " - "default order: %d, min order: %d\n", s->name, s->objsize, + "default order: %d, min order: %d\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); - if (oo_order(s->min) > get_order(s->objsize)) + if (oo_order(s->min) > get_order(s->object_size)) printk(KERN_WARNING " %s debugging increased min order, use " "slub_debug=O to disable.\n", s->name); @@ -2127,10 +2187,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu **pc) { - void *object; - struct kmem_cache_cpu *c; - struct page *page = new_slab(s, flags, node); + void *freelist; + struct kmem_cache_cpu *c = *pc; + struct page *page; + freelist = get_partial(s, flags, node, c); + + if (freelist) + return freelist; + + page = new_slab(s, flags, node); if (page) { c = __this_cpu_ptr(s->cpu_slab); if (c->page) @@ -2140,26 +2206,35 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, * No other reference to the page yet so we can * muck around with it freely without cmpxchg */ - object = page->freelist; + freelist = page->freelist; page->freelist = NULL; stat(s, ALLOC_SLAB); - c->node = page_to_nid(page); c->page = page; *pc = c; } else - object = NULL; + freelist = NULL; - return object; + return freelist; +} + +static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) +{ + if (unlikely(PageSlabPfmemalloc(page))) + return gfp_pfmemalloc_allowed(gfpflags); + + return true; } /* - * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist - * or deactivate the page. + * Check the page->freelist of a page and either transfer the freelist to the + * per cpu freelist or deactivate the page. * * The page is still frozen if the return value is not NULL. * * If this function returns NULL then the page has been unfrozen. + * + * This function must be called with interrupts disabled.
*/ static inline void *get_freelist(struct kmem_cache *s, struct page *page) { @@ -2170,13 +2245,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) do { freelist = page->freelist; counters = page->counters; + new.counters = counters; VM_BUG_ON(!new.frozen); new.inuse = page->objects; new.frozen = freelist != NULL; - } while (!cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, page, freelist, counters, NULL, new.counters, "get_freelist")); @@ -2203,7 +2279,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { - void **object; + void *freelist; + struct page *page; unsigned long flags; local_irq_save(flags); @@ -2216,25 +2293,41 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = this_cpu_ptr(s->cpu_slab); #endif - if (!c->page) + page = c->page; + if (!page) goto new_slab; redo: - if (unlikely(!node_match(c, node))) { + + if (unlikely(!node_match(page, node))) { stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, c); + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; + goto new_slab; + } + + /* + * By rights, we should be searching for a slab page that was + * PFMEMALLOC but right now, we are losing the pfmemalloc + * information when the page leaves the per-cpu allocator + */ + if (unlikely(!pfmemalloc_match(page, gfpflags))) { + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; goto new_slab; } /* must check again c->freelist in case of cpu migration or IRQ */ - object = c->freelist; - if (object) + freelist = c->freelist; + if (freelist) goto load_freelist; stat(s, ALLOC_SLOWPATH); - object = get_freelist(s, c->page); + freelist = get_freelist(s, page); - if (!object) { + if (!freelist) { c->page = NULL; stat(s, DEACTIVATE_BYPASS); goto new_slab; @@ -2243,50 +2336,51 @@ redo: stat(s, ALLOC_REFILL); load_freelist: - c->freelist = get_freepointer(s, object); + /* + * freelist is pointing to the list of objects to be used. + * page is pointing to the page from which the objects are obtained. + * That page must be frozen for per cpu allocations to work. 
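Below this, the fast path in slab_alloc_node() pops objects with this_cpu_cmpxchg_double() over the (freelist, tid) pair; tid is a transaction id bumped by every operation, so a task that was preempted, migrated, and comes back with a stale snapshot is guaranteed to fail the cmpxchg and retry. A userspace approximation is the classic tagged Treiber stack, with the tag playing the tid role; a sketch, not the kernel code:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* A tagged stack: the tag is bumped on every successful pop, so a CAS
 * prepared against a stale snapshot (say, after the thread slept and the
 * list was recycled) always fails: the same role the transaction id plays
 * in slab_alloc_node()/slab_free(). */

#define NOBJ 64
#define NIL  0xffffu

static uint16_t next_of[NOBJ];		/* freelist links by index */
static _Atomic uint64_t head;		/* [tag:48 | index:16] */

#define MK(idx, tag)   (((uint64_t)(tag) << 16) | (idx))
#define IDX(h)         ((uint16_t)(h))
#define TAG(h)         ((h) >> 16)

static int pop(uint16_t *out)
{
	uint64_t h = atomic_load(&head);
	for (;;) {
		uint16_t idx = IDX(h);
		if (idx == NIL)
			return 0;
		uint64_t nh = MK(next_of[idx], TAG(h) + 1);
		if (atomic_compare_exchange_weak(&head, &h, nh)) {
			*out = idx;
			return 1;
		}
		/* h was reloaded by the failed CAS; retry with a fresh snapshot */
	}
}

int main(void)
{
	/* chain 0 -> 1 -> 2 -> NIL */
	next_of[0] = 1; next_of[1] = 2; next_of[2] = NIL;
	atomic_store(&head, MK(0, 0));

	uint16_t o;
	while (pop(&o))
		printf("got object %u\n", o);
	return 0;
}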
+ */ + VM_BUG_ON(!c->page->frozen); + c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); local_irq_restore(flags); - return object; + return freelist; new_slab: if (c->partial) { - c->page = c->partial; - c->partial = c->page->next; - c->node = page_to_nid(c->page); + page = c->page = c->partial; + c->partial = page->next; stat(s, CPU_PARTIAL_ALLOC); c->freelist = NULL; goto redo; } - /* Then do expensive stuff like retrieving pages from the partial lists */ - object = get_partial(s, gfpflags, node, c); - - if (unlikely(!object)) { + freelist = new_slab_objects(s, gfpflags, node, &c); - object = new_slab_objects(s, gfpflags, node, &c); + if (unlikely(!freelist)) { + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); - if (unlikely(!object)) { - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - - local_irq_restore(flags); - return NULL; - } + local_irq_restore(flags); + return NULL; } - if (likely(!kmem_cache_debug(s))) + page = c->page; + if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) goto load_freelist; /* Only entered in the debug case */ - if (!alloc_debug_processing(s, c->page, object, addr)) + if (kmem_cache_debug(s) && + !alloc_debug_processing(s, page, freelist, addr)) goto new_slab; /* Slab failed checks. Next slab needed */ - c->freelist = get_freepointer(s, object); - deactivate_slab(s, c); - c->node = NUMA_NO_NODE; + deactivate_slab(s, page, get_freepointer(s, freelist)); + c->page = NULL; + c->freelist = NULL; local_irq_restore(flags); - return object; + return freelist; } /* @@ -2299,24 +2393,31 @@ new_slab: * * Otherwise we can simply pick the next object from the lockless free list. */ -static __always_inline void *slab_alloc(struct kmem_cache *s, +static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr) { void **object; struct kmem_cache_cpu *c; + struct page *page; unsigned long tid; if (slab_pre_alloc_hook(s, gfpflags)) return NULL; + s = memcg_kmem_get_cache(s, gfpflags); redo: - /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is * enabled. We may switch back and forth between cpus while * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. + * + * Preemption is disabled for the retrieval of the tid because that + * must occur from the current processor. We cannot allow rescheduling + * on a different processor between the determination of the pointer + * and the retrieval of the tid. */ + preempt_disable(); c = __this_cpu_ptr(s->cpu_slab); /* @@ -2326,11 +2427,11 @@ redo: * linked list in between. */ tid = c->tid; - barrier(); + preempt_enable(); object = c->freelist; - if (unlikely(!object || !node_match(c, node))) - + page = c->page; + if (unlikely(!object || !node_match(page, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { @@ -2340,13 +2441,15 @@ redo: * The cmpxchg will only match if there was no additional * operation and if we are on the right processor. * - * The cmpxchg does the following atomically (without lock semantics!) + * The cmpxchg does the following atomically (without lock + * semantics!) * 1. Relocate first pointer to the current per cpu area. * 2. Verify that tid and freelist have not been changed * 3. 
If they were not changed replace tid and freelist * - * Since this is without lock semantics the protection is only against - * code executing on this cpu *not* from access by other cpus. + * Since this is without lock semantics the protection is only + * against code executing on this cpu *not* from access by + * other cpus. */ if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, @@ -2361,18 +2464,25 @@ redo: } if (unlikely(gfpflags & __GFP_ZERO) && object) - memset(object, 0, s->objsize); + memset(object, 0, s->object_size); slab_post_alloc_hook(s, gfpflags, object); return object; } +static __always_inline void *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, unsigned long addr) +{ + return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); +} + void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, + s->size, gfpflags); return ret; } @@ -2381,28 +2491,20 @@ EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); - -void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) -{ - void *ret = kmalloc_order(size, flags, order); - trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); - return ret; -} -EXPORT_SYMBOL(kmalloc_order_trace); #endif #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, - s->objsize, s->size, gfpflags, node); + s->object_size, s->size, gfpflags, node); return ret; } @@ -2413,7 +2515,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { - void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); @@ -2437,7 +2539,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, void *prior; void **object = (void *)x; int was_frozen; - int inuse; struct page new; unsigned long counters; struct kmem_cache_node *n = NULL; @@ -2445,27 +2546,34 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); - if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) + if (kmem_cache_debug(s) && + !(n = free_debug_processing(s, page, x, addr, &flags))) return; do { + if (unlikely(n)) { + spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } prior = page->freelist; counters = page->counters; set_freepointer(s, object, prior); new.counters = counters; was_frozen = new.frozen; new.inuse--; - if ((!new.inuse || !prior) && !was_frozen && !n) { + if ((!new.inuse || !prior) && !was_frozen) { - if (!kmem_cache_debug(s) && !prior) + if (kmem_cache_has_cpu_partial(s) && !prior) { /* - * Slab was on no list before and will be partially empty - * We can defer the list move and instead freeze it. 
+ * Slab was on no list before and will be + * partially empty + * We can defer the list move and instead + * freeze it. */ new.frozen = 1; - else { /* Needs to be taken off a list */ + } else { /* Needs to be taken off a list */ n = get_node(s, page_to_nid(page)); /* @@ -2480,7 +2588,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } } - inuse = new.inuse; } while (!cmpxchg_double_slab(s, page, prior, counters, @@ -2506,25 +2613,18 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; } + if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) + goto slab_empty; + /* - * was_frozen may have been set after we acquired the list_lock in - * an earlier loop. So we need to check it here again. + * Objects left in the slab. If it was not on the partial list before + * then add it. */ - if (was_frozen) - stat(s, FREE_FROZEN); - else { - if (unlikely(!inuse && n->nr_partial > s->min_partial)) - goto slab_empty; - - /* - * Objects left in the slab. If it was not on the partial list before - * then add it. - */ - if (unlikely(!prior)) { - remove_full(s, page); - add_partial(n, page, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } + if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { + if (kmem_cache_debug(s)) + remove_full(s, n, page); + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); return; @@ -2536,9 +2636,10 @@ slab_empty: */ remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); - } else + } else { /* Slab must be on the full list */ - remove_full(s, page); + remove_full(s, n, page); + } spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); @@ -2572,10 +2673,11 @@ redo: * data is retrieved via this pointer. If we are on the same cpu * during the cmpxchg then the free will succeed. */ + preempt_disable(); c = __this_cpu_ptr(s->cpu_slab); tid = c->tid; - barrier(); + preempt_enable(); if (likely(page == c->page)) { set_freepointer(s, object, c->freelist); @@ -2596,12 +2698,10 @@ redo: void kmem_cache_free(struct kmem_cache *s, void *x) { - struct page *page; - - page = virt_to_head_page(x); - - slab_free(s, page, x, _RET_IP_); - + s = cache_from_obj(s, x); + if (!s) + return; + slab_free(s, virt_to_head_page(x), x, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2739,34 +2839,8 @@ static inline int calculate_order(int size, int reserved) return -ENOSYS; } -/* - * Figure out what the alignment of the objects will be. - */ -static unsigned long calculate_alignment(unsigned long flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it.
- */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - static void -init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) +init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -2781,7 +2855,7 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < - SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); + KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); /* * Must align to double word boundary for the double cmpxchg @@ -2805,8 +2879,8 @@ static struct kmem_cache *kmem_cache_node; * slab on the node for this slabcache. There are no concurrent accesses * possible. * - * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ static void early_kmem_cache_node_alloc(int node) @@ -2836,10 +2910,14 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - init_kmem_cache_node(n, kmem_cache_node); + init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); - add_partial(n, page, DEACTIVATE_TO_HEAD); + /* + * No locks need to be taken here as it has just been + * initialized and there is no concurrent access. + */ + __add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2876,7 +2954,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) } s->node[node] = n; - init_kmem_cache_node(n, s); + init_kmem_cache_node(n); } return 1; } @@ -2897,8 +2975,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) static int calculate_sizes(struct kmem_cache *s, int forced_order) { unsigned long flags = s->flags; - unsigned long size = s->objsize; - unsigned long align = s->align; + unsigned long size = s->object_size; int order; /* @@ -2926,7 +3003,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * end of the object and the free pointer. If not then add an * additional word to have some bytes to store Redzone information. */ - if ((flags & SLAB_RED_ZONE) && size == s->objsize) + if ((flags & SLAB_RED_ZONE) && size == s->object_size) size += sizeof(void *); #endif @@ -2970,19 +3047,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) #endif /* - * Determine the alignment based on various parameters that the - * user specified and the dynamic determination of cache line size - * on bootup. - */ - align = calculate_alignment(flags, align, s->objsize); - s->align = align; - - /* * SLUB stores one object immediately after another beginning from * offset 0. In order to align the objects we have to simply size * each object to conform to the alignment. 
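calculate_sizes() then simply rounds the metadata-padded size up to s->align, since SLUB lays objects back to back from offset 0 and each object must end on an aligned boundary for the next one to start on one. The rounding is the usual power-of-two ALIGN(); a quick worked check:

#include <stdio.h>

/* Round x up to a power-of-two alignment a, as the kernel's ALIGN() does. */
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	/* e.g. objects in a cache aligned to a 64-byte cacheline */
	printf("%lu\n", ALIGN_UP(52UL, 64));   /* 64  */
	printf("%lu\n", ALIGN_UP(130UL, 64));  /* 192 */
	printf("%lu\n", ALIGN_UP(64UL, 64));   /* 64  */
	return 0;
}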
*/ - size = ALIGN(size, align); + size = ALIGN(size, s->align); s->size = size; if (forced_order >= 0) order = forced_order; @@ -2997,7 +3066,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) s->allocflags |= __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) - s->allocflags |= SLUB_DMA; + s->allocflags |= GFP_DMA; if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; @@ -3011,20 +3080,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) s->max = s->oo; return !!oo_objects(s->oo); - } -static int kmem_cache_open(struct kmem_cache *s, - const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(void *)) +static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) { - memset(s, 0, kmem_size); - s->name = name; - s->ctor = ctor; - s->objsize = size; - s->align = align; - s->flags = kmem_cache_flags(size, flags, name, ctor); + s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) @@ -3037,7 +3097,7 @@ static int kmem_cache_open(struct kmem_cache *s, * Disable debugging flags that store metadata if the min slab * order increased. */ - if (get_order(s->size) > get_order(s->objsize)) { + if (get_order(s->size) > get_order(s->object_size)) { s->flags &= ~DEBUG_METADATA_FLAGS; s->offset = 0; if (!calculate_sizes(s, -1)) @@ -3072,10 +3132,10 @@ static int kmem_cache_open(struct kmem_cache *s, * A) The number of objects from per cpu partial slabs dumped to the * per node list when we reach the limit. * B) The number of objects in cpu partial slabs to extract from the - * per node list when we run out of per cpu objects. We only fetch 50% - * to keep some capacity around for frees. + * per node list when we run out of per cpu objects. We only fetch + * 50% to keep some capacity around for frees. 
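For reference, the sizing that follows picks s->cpu_partial from the object size: 0 for caches without cpu-partial support (which now includes the debug case), 2 at PAGE_SIZE and above, 6 from 1024, 13 from 256, and 30 for smaller objects. A one-function restatement of those thresholds, with PAGE_SIZE assumed 4096 for the demo:

#include <stdio.h>

#define PAGE_SIZE 4096

/* Mirrors the cpu_partial sizing below (assuming cpu-partial support). */
static unsigned int cpu_partial_for(unsigned int size)
{
	if (size >= PAGE_SIZE) return 2;
	if (size >= 1024)      return 6;
	if (size >= 256)       return 13;
	return 30;
}

int main(void)
{
	unsigned int sizes[] = { 64, 256, 1024, 4096 };
	for (int i = 0; i < 4; i++)
		printf("size %4u -> cpu_partial %u\n",
		       sizes[i], cpu_partial_for(sizes[i]));
	return 0;
}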
*/ - if (kmem_cache_debug(s)) + if (!kmem_cache_has_cpu_partial(s)) s->cpu_partial = 0; else if (s->size >= PAGE_SIZE) s->cpu_partial = 2; @@ -3086,7 +3146,6 @@ static int kmem_cache_open(struct kmem_cache *s, else s->cpu_partial = 30; - s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -3094,27 +3153,18 @@ static int kmem_cache_open(struct kmem_cache *s, goto error; if (alloc_kmem_cache_cpus(s)) - return 1; + return 0; free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, oo_order(s->oo), - s->offset, flags); - return 0; + s->name, (unsigned long)s->size, s->size, + oo_order(s->oo), s->offset, flags); + return -EINVAL; } -/* - * Determine the size of a slab object - */ -unsigned int kmem_cache_size(struct kmem_cache *s) -{ - return s->objsize; -} -EXPORT_SYMBOL(kmem_cache_size); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { @@ -3125,7 +3175,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, sizeof(long), GFP_ATOMIC); if (!map) return; - slab_err(s, page, "%s", text); + slab_err(s, page, text, s->name); slab_lock(page); get_map(s, page, map); @@ -3153,11 +3203,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - remove_partial(n, page); + __remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, - "Objects remaining on kmem_cache_close()"); + "Objects remaining in %s on kmem_cache_close()"); } } } @@ -3170,7 +3220,6 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - free_percpu(s->cpu_slab); /* Attempt to free all objects */ for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3179,47 +3228,20 @@ static inline int kmem_cache_close(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } + free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); return 0; } -/* - * Close a cache and release the kmem_cache structure - * (must be used for caches created using kmem_cache_create) - */ -void kmem_cache_destroy(struct kmem_cache *s) -{ - down_write(&slub_lock); - s->refcount--; - if (!s->refcount) { - list_del(&s->list); - up_write(&slub_lock); - if (kmem_cache_close(s)) { - printk(KERN_ERR "SLUB %s: %s called for cache that " - "still has objects.\n", s->name, __func__); - dump_stack(); - } - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - sysfs_slab_remove(s); - } else - up_write(&slub_lock); +int __kmem_cache_shutdown(struct kmem_cache *s) +{ + return kmem_cache_close(s); } -EXPORT_SYMBOL(kmem_cache_destroy); /******************************************************************** * Kmalloc subsystem *******************************************************************/ -struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; -EXPORT_SYMBOL(kmalloc_caches); - -static struct kmem_cache *kmem_cache; - -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; -#endif - static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -3256,101 +3278,20 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *__init create_kmalloc_cache(const char *name, - int size, unsigned int flags) -{ - struct kmem_cache *s; - - s = 
kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - - /* - * This function is called with IRQs disabled during early-boot on - * single CPU so there's no need to take slub_lock here. - */ - if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, - flags, NULL)) - goto panic; - - list_add(&s->list, &slab_caches); - return s; - -panic: - panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); - return NULL; -} - -/* - * Conversion table for small slabs sizes / 8 to the index in the - * kmalloc array. This is necessary for slabs < 192 since we have non power - * of two cache sizes there. The size of larger slabs can be determined using - * fls. - */ -static s8 size_index[24] = { - 3, /* 8 */ - 4, /* 16 */ - 5, /* 24 */ - 5, /* 32 */ - 6, /* 40 */ - 6, /* 48 */ - 6, /* 56 */ - 6, /* 64 */ - 1, /* 72 */ - 1, /* 80 */ - 1, /* 88 */ - 1, /* 96 */ - 7, /* 104 */ - 7, /* 112 */ - 7, /* 120 */ - 7, /* 128 */ - 2, /* 136 */ - 2, /* 144 */ - 2, /* 152 */ - 2, /* 160 */ - 2, /* 168 */ - 2, /* 176 */ - 2, /* 184 */ - 2 /* 192 */ -}; - -static inline int size_index_elem(size_t bytes) -{ - return (bytes - 1) / 8; -} - -static struct kmem_cache *get_slab(size_t size, gfp_t flags) -{ - int index; - - if (size <= 192) { - if (!size) - return ZERO_SIZE_PTR; - - index = size_index[size_index_elem(size)]; - } else - index = fls(size - 1); - -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & SLUB_DMA))) - return kmalloc_dma_caches[index]; - -#endif - return kmalloc_caches[index]; -} - void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, flags); - s = get_slab(size, flags); + s = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); + ret = slab_alloc(s, flags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -3364,12 +3305,12 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); - kmemleak_alloc(ptr, size, 1, flags); + kmalloc_large_node_hook(ptr, size, flags); return ptr; } @@ -3378,7 +3319,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, flags, node); trace_kmalloc_node(_RET_IP_, ret, @@ -3388,12 +3329,12 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return ret; } - s = get_slab(size, flags); + s = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, node, _RET_IP_); + ret = slab_alloc_node(s, flags, node, _RET_IP_); trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); @@ -3416,46 +3357,10 @@ size_t ksize(const void *object) return PAGE_SIZE << compound_order(page); } - return slab_ksize(page->slab); + return slab_ksize(page->slab_cache); } EXPORT_SYMBOL(ksize); -#ifdef CONFIG_SLUB_DEBUG -bool verify_mem_not_deleted(const void *x) -{ - struct page *page; - void *object = (void *)x; - unsigned long flags; - bool rv; - - if (unlikely(ZERO_OR_NULL_PTR(x))) - return false; - - local_irq_save(flags); - - page = virt_to_head_page(x); - if (unlikely(!PageSlab(page))) { - /* maybe it was from stack? 
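The conversion table and get_slab() removed above encode how a request size picks a kmalloc cache: sizes up to 192 bytes index the table in 8-byte steps, larger sizes use fls(size - 1). A self-contained sketch of the same lookup, with fls_u32() open-coding the kernel's fls():

#include <stdio.h>
#include <stddef.h>

static const signed char size_index[24] = {
	3, 4, 5, 5, 6, 6, 6, 6, 1, 1, 1, 1,
	7, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2
};

static int fls_u32(unsigned int x)	/* position of highest set bit */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static int kmalloc_index_of(size_t size)
{
	if (!size)
		return -1;	/* the ZERO_SIZE_PTR case */
	if (size <= 192)
		return size_index[(size - 1) / 8];
	return fls_u32((unsigned int)(size - 1));
}

int main(void)
{
	size_t sizes[] = { 8, 96, 192, 200, 4096 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("kmalloc(%zu) uses cache index %d\n",
		       sizes[i], kmalloc_index_of(sizes[i]));
	return 0;
}

For example, 96 maps to index 1 (the dedicated 96-byte cache) while 200 maps to fls(199) = 8, the 256-byte cache.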
*/ - rv = true; - goto out_unlock; - } - - slab_lock(page); - if (on_freelist(page->slab, page, object)) { - object_err(page->slab, page, object, "Object is on free-list"); - rv = false; - } else { - rv = true; - } - slab_unlock(page); - -out_unlock: - local_irq_restore(flags); - return rv; -} -EXPORT_SYMBOL(verify_mem_not_deleted); -#endif - void kfree(const void *x) { struct page *page; @@ -3469,11 +3374,11 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); - kmemleak_free(x); - put_page(page); + kfree_hook(x); + __free_memcg_kmem_pages(page, compound_order(page)); return; } - slab_free(page->slab, page, object, _RET_IP_); + slab_free(page->slab_cache, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -3545,15 +3450,14 @@ int kmem_cache_shrink(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_shrink); -#if defined(CONFIG_MEMORY_HOTPLUG) static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) kmem_cache_shrink(s); - up_read(&slub_lock); + mutex_unlock(&slab_mutex); return 0; } @@ -3565,7 +3469,7 @@ static void slab_mem_offline_callback(void *arg) struct memory_notify *marg = arg; int offline_node; - offline_node = marg->status_change_nid; + offline_node = marg->status_change_nid_normal; /* * If the node still has available memory. we need kmem_cache_node @@ -3574,7 +3478,7 @@ static void slab_mem_offline_callback(void *arg) if (offline_node < 0) return; - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { n = get_node(s, offline_node); if (n) { @@ -3590,7 +3494,7 @@ static void slab_mem_offline_callback(void *arg) kmem_cache_free(kmem_cache_node, n); } } - up_read(&slub_lock); + mutex_unlock(&slab_mutex); } static int slab_mem_going_online_callback(void *arg) @@ -3598,7 +3502,7 @@ static int slab_mem_going_online_callback(void *arg) struct kmem_cache_node *n; struct kmem_cache *s; struct memory_notify *marg = arg; - int nid = marg->status_change_nid; + int nid = marg->status_change_nid_normal; int ret = 0; /* @@ -3613,7 +3517,7 @@ static int slab_mem_going_online_callback(void *arg) * allocate a kmem_cache_node structure in order to bring the node * online. */ - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { /* * XXX: kmem_cache_alloc_node will fallback to other nodes @@ -3625,11 +3529,11 @@ static int slab_mem_going_online_callback(void *arg) ret = -ENOMEM; goto out; } - init_kmem_cache_node(n, s); + init_kmem_cache_node(n); s->node[nid] = n; } out: - up_read(&slub_lock); + mutex_unlock(&slab_mutex); return ret; } @@ -3660,7 +3564,10 @@ static int slab_memory_callback(struct notifier_block *self, return ret; } -#endif /* CONFIG_MEMORY_HOTPLUG */ +static struct notifier_block slab_memory_callback_nb = { + .notifier_call = slab_memory_callback, + .priority = SLAB_CALLBACK_PRI, +}; /******************************************************************** * Basic setup of slabs @@ -3668,190 +3575,85 @@ static int slab_memory_callback(struct notifier_block *self, /* * Used for early kmem_cache structures that were allocated using - * the page allocator + * the page allocator. Allocate them properly then fix up the pointers + * that may be pointing to the wrong kmem_cache structure. 
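The pattern this comment describes, and which bootstrap() implements just below, reduces to: copy a statically allocated descriptor into storage obtained from the very allocator it describes, then re-aim any pointers that still reference the static copy. A user-space sketch with an invented struct, not the kernel's:

#include <stdlib.h>
#include <string.h>

struct desc {
	int object_size;
	struct desc *owner;	/* back-pointer that needs fixing up */
};

static struct desc *bootstrap_desc(struct desc *static_copy)
{
	struct desc *d = malloc(sizeof(*d));

	if (!d)
		return NULL;
	memcpy(d, static_copy, sizeof(*d));
	d->owner = d;		/* was still pointing at static_copy */
	return d;
}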
*/ -static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) +static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); - list_add(&s->list, &slab_caches); - s->refcount = -1; + memcpy(s, static_cache, kmem_cache->object_size); + /* + * This runs very early, and only the boot processor is supposed to be + * up. Even if it weren't true, IRQs are not up so we couldn't fire + * IPIs around. + */ + __flush_cpu_slab(s, smp_processor_id()); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); struct page *p; if (n) { list_for_each_entry(p, &n->partial, lru) - p->slab = s; + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG list_for_each_entry(p, &n->full, lru) - p->slab = s; + p->slab_cache = s; #endif } } + list_add(&s->list, &slab_caches); + return s; } void __init kmem_cache_init(void) { - int i; - int caches = 0; - struct kmem_cache *temp_kmem_cache; - int order; - struct kmem_cache *temp_kmem_cache_node; - unsigned long kmalloc_size; + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; if (debug_guardpage_minorder()) slub_max_order = 0; - kmem_size = offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *); - - /* Allocate two kmem_caches from the page allocator */ - kmalloc_size = ALIGN(kmem_size, cache_line_size()); - order = get_order(2 * kmalloc_size); - kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); - - /* - * Must first have the slab cache available for the allocations of the - * struct kmem_cache_node's. There is special bootstrap code in - * kmem_cache_open for slab_state == DOWN. - */ - kmem_cache_node = (void *)kmem_cache + kmalloc_size; + kmem_cache_node = &boot_kmem_cache_node; + kmem_cache = &boot_kmem_cache; - kmem_cache_open(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + create_boot_cache(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); + register_hotmemory_notifier(&slab_memory_callback_nb); /* Able to allocate the per node structures */ slab_state = PARTIAL; - temp_kmem_cache = kmem_cache; - kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache, temp_kmem_cache, kmem_size); + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN); + + kmem_cache = bootstrap(&boot_kmem_cache); /* * Allocate kmem_cache_node properly from the kmem_cache slab. * kmem_cache_node is separately allocated so no need to * update any list pointers. */ - temp_kmem_cache_node = kmem_cache_node; - - kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); - memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); - - kmem_cache_bootstrap_fixup(kmem_cache_node); - - caches++; - kmem_cache_bootstrap_fixup(kmem_cache); - caches++; - /* Free temporary boot structure */ - free_pages((unsigned long)temp_kmem_cache, order); + kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ - - /* - * Patch up the size_index table if we have strange large alignment - * requirements for the kmalloc array. This is only the case for - * MIPS it seems. 
The standard arches will not generate any code here. - * - * Largest permitted alignment is 256 bytes due to the way we - * handle the index determination for the smaller caches. - * - * Make sure that nothing crazy happens if someone starts tinkering - * around with ARCH_KMALLOC_MINALIGN - */ - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || - (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); - - for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { - int elem = size_index_elem(i); - if (elem >= ARRAY_SIZE(size_index)) - break; - size_index[elem] = KMALLOC_SHIFT_LOW; - } - - if (KMALLOC_MIN_SIZE == 64) { - /* - * The 96 byte size cache is not used if the alignment - * is 64 byte. - */ - for (i = 64 + 8; i <= 96; i += 8) - size_index[size_index_elem(i)] = 7; - } else if (KMALLOC_MIN_SIZE == 128) { - /* - * The 192 byte sized cache is not used if the alignment - * is 128 byte. Redirect kmalloc to use the 256 byte cache - * instead. - */ - for (i = 128 + 8; i <= 192; i += 8) - size_index[size_index_elem(i)] = 8; - } - - /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 32) { - kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); - caches++; - } - - if (KMALLOC_MIN_SIZE <= 64) { - kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); - caches++; - } - - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { - kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); - caches++; - } - - slab_state = UP; - - /* Provide the correct kmalloc names now that the caches are up */ - if (KMALLOC_MIN_SIZE <= 32) { - kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); - BUG_ON(!kmalloc_caches[1]->name); - } - - if (KMALLOC_MIN_SIZE <= 64) { - kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); - BUG_ON(!kmalloc_caches[2]->name); - } - - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { - char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); - - BUG_ON(!s); - kmalloc_caches[i]->name = s; - } + create_kmalloc_caches(0); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); #endif -#ifdef CONFIG_ZONE_DMA - for (i = 0; i < SLUB_PAGE_SHIFT; i++) { - struct kmem_cache *s = kmalloc_caches[i]; - - if (s && s->size) { - char *name = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%d", s->objsize); - - BUG_ON(!name); - kmalloc_dma_caches[i] = create_kmalloc_cache(name, - s->objsize, SLAB_CACHE_DMA); - } - } -#endif printk(KERN_INFO - "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," + "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," " CPUs=%d, Nodes=%d\n", - caches, cache_line_size(), + cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); } @@ -3868,6 +3670,9 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; + if (!is_root_cache(s)) + return 1; + if (s->ctor) return 1; @@ -3880,9 +3685,8 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(size_t size, - size_t align, unsigned long flags, const char *name, - void (*ctor)(void *)) +static struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -3905,7 +3709,7 @@ static struct kmem_cache *find_mergeable(size_t size, continue; if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; + continue; /* * Check if alignment is compatible. 
* Courtesy of Adrian Drzewiecki @@ -3921,73 +3725,70 @@ static struct kmem_cache *find_mergeable(size_t size, return NULL; } -struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - char *n; - if (WARN_ON(!name)) - return NULL; - - down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { + int i; + struct kmem_cache *c; + s->refcount++; + /* * Adjust the object sizes so that we clear * the complete object on kzalloc. */ - s->objsize = max(s->objsize, (int)size); + s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); + if (!c) + continue; + c->object_size = s->object_size; + c->inuse = max_t(int, c->inuse, + ALIGN(size, sizeof(void *))); + } + if (sysfs_slab_alias(s, name)) { s->refcount--; - goto err; + s = NULL; } - up_write(&slub_lock); - return s; } - n = kstrdup(name, GFP_KERNEL); - if (!n) - goto err; + return s; +} - s = kmalloc(kmem_size, GFP_KERNEL); - if (s) { - if (kmem_cache_open(s, n, - size, align, flags, ctor)) { - list_add(&s->list, &slab_caches); - up_write(&slub_lock); - if (sysfs_slab_add(s)) { - down_write(&slub_lock); - list_del(&s->list); - kfree(n); - kfree(s); - goto err; - } - return s; - } - kfree(n); - kfree(s); - } -err: - up_write(&slub_lock); +int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +{ + int err; - if (flags & SLAB_PANIC) - panic("Cannot create slabcache %s\n", name); - else - s = NULL; - return s; + err = kmem_cache_open(s, flags); + if (err) + return err; + + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + return 0; + + memcg_propagate_slab_attrs(s); + err = sysfs_slab_add(s); + if (err) + kmem_cache_close(s); + + return err; } -EXPORT_SYMBOL(kmem_cache_create); #ifdef CONFIG_SMP /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. */ -static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, +static int slab_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -3999,13 +3800,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - down_read(&slub_lock); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); } - up_read(&slub_lock); + mutex_unlock(&slab_mutex); break; default: break; @@ -4013,7 +3814,7 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata slab_notifier = { +static struct notifier_block slab_notifier = { .notifier_call = slab_cpuup_callback }; @@ -4024,15 +3825,15 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, gfpflags); - s = get_slab(size, gfpflags); + s = kmalloc_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); + ret = slab_alloc(s, gfpflags, caller); /* Honor the call site pointer we received. 
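The tests find_mergeable() applies above condense into one predicate: no constructors on either side, an existing cache at least as large, matching merge-relevant flags, and an existing object size that is a multiple of the requested alignment. A sketch with stand-in names, assuming power-of-two alignment as SLUB does:

#include <stddef.h>

struct cache_desc {
	size_t size;		/* object footprint including metadata */
	unsigned long flags;
	int has_ctor;
};

#define MERGE_SAME_MASK 0x1UL	/* stand-in for SLUB_MERGE_SAME */

static int can_alias(const struct cache_desc *existing, size_t size,
		     size_t align, unsigned long flags, int has_ctor)
{
	if (has_ctor || existing->has_ctor)
		return 0;	/* constructors pin the object layout */
	if (size > existing->size)
		return 0;	/* existing objects are too small */
	if ((flags & MERGE_SAME_MASK) != (existing->flags & MERGE_SAME_MASK))
		return 0;	/* debug/poison setup must match */
	if (align && (existing->size & (align - 1)))
		return 0;	/* size not a multiple of align */
	return 1;
}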
*/ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -4047,7 +3848,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, gfpflags, node); trace_kmalloc_node(caller, ret, @@ -4057,12 +3858,12 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } - s = get_slab(size, gfpflags); + s = kmalloc_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc_node(s, gfpflags, node, caller); /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); @@ -4379,15 +4180,17 @@ static int list_locations(struct kmem_cache *s, char *buf, !cpumask_empty(to_cpumask(l->cpus)) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " cpus="); - len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, + len += cpulist_scnprintf(buf + len, + PAGE_SIZE - len - 50, to_cpumask(l->cpus)); } if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && len < PAGE_SIZE - 60) { len += sprintf(buf + len, " nodes="); - len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->nodes); + len += nodelist_scnprintf(buf + len, + PAGE_SIZE - len - 50, + l->nodes); } len += sprintf(buf + len, "\n"); @@ -4406,7 +4209,7 @@ static void resiliency_test(void) { u8 *p; - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); printk(KERN_ERR "SLUB resiliency testing\n"); printk(KERN_ERR "-----------------------\n"); @@ -4485,43 +4288,47 @@ static ssize_t show_slab_objects(struct kmem_cache *s, int node; int x; unsigned long *nodes; - unsigned long *per_cpu; - nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); if (!nodes) return -ENOMEM; - per_cpu = nodes + nr_node_ids; if (flags & SO_CPU) { int cpu; for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - int node = ACCESS_ONCE(c->node); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, + cpu); + int node; struct page *page; - if (node < 0) - continue; page = ACCESS_ONCE(c->page); + if (!page) + continue; + + node = page_to_nid(page); + if (flags & SO_TOTAL) + x = page->objects; + else if (flags & SO_OBJECTS) + x = page->inuse; + else + x = 1; + + total += x; + nodes[node] += x; + + page = ACCESS_ONCE(c->partial); if (page) { + node = page_to_nid(page); if (flags & SO_TOTAL) - x = page->objects; + WARN_ON_ONCE(1); else if (flags & SO_OBJECTS) - x = page->inuse; + WARN_ON_ONCE(1); else - x = 1; - - total += x; - nodes[node] += x; - } - page = c->partial; - - if (page) { - x = page->pobjects; + x = page->pages; total += x; nodes[node] += x; } - per_cpu[node]++; } } @@ -4531,12 +4338,11 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); - if (flags & SO_TOTAL) - x = atomic_long_read(&n->total_objects); - else if (flags & SO_OBJECTS) - x = atomic_long_read(&n->total_objects) - - count_partial(n, count_free); - + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); else x = atomic_long_read(&n->nr_slabs); total += x; @@ -4620,7 +4426,7 @@ SLAB_ATTR_RO(align); static 
ssize_t object_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->objsize); + return sprintf(buf, "%d\n", s->object_size); } SLAB_ATTR_RO(object_size); @@ -4636,7 +4442,7 @@ static ssize_t order_store(struct kmem_cache *s, unsigned long order; int err; - err = strict_strtoul(buf, 10, &order); + err = kstrtoul(buf, 10, &order); if (err) return err; @@ -4664,7 +4470,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, unsigned long min; int err; - err = strict_strtoul(buf, 10, &min); + err = kstrtoul(buf, 10, &min); if (err) return err; @@ -4684,10 +4490,10 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, unsigned long objects; int err; - err = strict_strtoul(buf, 10, &objects); + err = kstrtoul(buf, 10, &objects); if (err) return err; - if (objects && kmem_cache_debug(s)) + if (objects && !kmem_cache_has_cpu_partial(s)) return -EINVAL; s->cpu_partial = objects; @@ -5000,7 +4806,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, unsigned long ratio; int err; - err = strict_strtoul(buf, 10, &ratio); + err = kstrtoul(buf, 10, &ratio); if (err) return err; @@ -5204,16 +5010,101 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); +#ifdef CONFIG_MEMCG_KMEM + if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { + int i; + + mutex_lock(&slab_mutex); + if (s->max_attr_size < len) + s->max_attr_size = len; + /* + * This is a best effort propagation, so this function's return + * value will be determined by the parent cache only. This is + * basically because not all attributes will have well + * defined semantics for rollbacks - most of the actions will + * have permanent effects. + * + * Returning the error value of any of the children that fail + * is not 100% defined, in the sense that users seeing the + * error code won't be able to know anything about the state of + * the cache. + * + * Only returning the error code for the parent cache at least + * has well defined semantics. The cache being written to + * directly either failed or succeeded, in which case we loop + * through the descendants with best-effort propagation. + */ + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg_idx(s, i); + if (c) + attribute->store(c, buf, len); + } + mutex_unlock(&slab_mutex); + } +#endif return err; } -static void kmem_cache_release(struct kobject *kobj) +static void memcg_propagate_slab_attrs(struct kmem_cache *s) { - struct kmem_cache *s = to_slab(kobj); +#ifdef CONFIG_MEMCG_KMEM + int i; + char *buffer = NULL; + struct kmem_cache *root_cache; + + if (is_root_cache(s)) + return; + + root_cache = s->memcg_params->root_cache; + + /* + * This means this cache had no attribute written. Therefore, no point + * in copying default values around + */ + if (!root_cache->max_attr_size) + return; + + for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { + char mbuf[64]; + char *buf; + struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + + if (!attr || !attr->store || !attr->show) + continue; + + /* + * It is really bad that we have to allocate here, so we will + * do it only as a fallback. If we actually allocate, though, + * we can just use the allocated buffer until the end. + * + * Most of the slub attributes will tend to be very small in + * size, but sysfs allows buffers up to a page, so they can + * theoretically happen. 
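The best-effort policy described in slab_attr_store() above fits in a few lines: report only the parent's status, then walk the children with the same buffer and ignore their return values. A sketch with invented types, where apply() stands in for attribute->store():

#include <stddef.h>

struct cache_node {
	struct cache_node **children;
	int nr_children;
};

static int store_and_propagate(struct cache_node *parent,
			       int (*apply)(struct cache_node *,
					    const char *, size_t),
			       const char *buf, size_t len)
{
	int err = apply(parent, buf, len);	/* only status we report */
	int i;

	if (err < 0)
		return err;
	for (i = 0; i < parent->nr_children; i++)
		if (parent->children[i])
			apply(parent->children[i], buf, len); /* best effort */
	return err;
}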
+ */ + if (buffer) + buf = buffer; + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + else { + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (WARN_ON(!buffer)) + continue; + buf = buffer; + } + + attr->show(root_cache, buf); + attr->store(s, buf, strlen(buf)); + } + + if (buffer) + free_page((unsigned long)buffer); +#endif +} - kfree(s->name); - kfree(s); +static void kmem_cache_release(struct kobject *k) +{ + slab_kmem_cache_release(to_slab(k)); } static const struct sysfs_ops slab_sysfs_ops = { @@ -5223,7 +5114,7 @@ static const struct sysfs_ops slab_sysfs_ops = { static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, - .release = kmem_cache_release + .release = kmem_cache_release, }; static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -5241,6 +5132,15 @@ static const struct kset_uevent_ops slab_uevent_ops = { static struct kset *slab_kset; +static inline struct kset *cache_kset(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + return s->memcg_params->root_cache->memcg_kset; +#endif + return slab_kset; +} + #define ID_STR_LENGTH 64 /* Create a unique string id for a slab cache: @@ -5273,6 +5173,13 @@ static char *create_unique_id(struct kmem_cache *s) if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + p += sprintf(p, "-%08d", + memcg_cache_id(s->memcg_params->memcg)); +#endif + BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } @@ -5281,13 +5188,8 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; - int unmergeable; + int unmergeable = slab_unmergeable(s); - if (slab_state < SYSFS) - /* Defer until later */ - return 0; - - unmergeable = slab_unmergeable(s); if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. @@ -5304,37 +5206,53 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - s->kobj.kset = slab_kset; - err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); - if (err) { - kobject_put(&s->kobj); - return err; - } + s->kobj.kset = cache_kset(s); + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); + if (err) + goto out_put_kobj; err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) { - kobject_del(&s->kobj); - kobject_put(&s->kobj); - return err; + if (err) + goto out_del_kobj; + +#ifdef CONFIG_MEMCG_KMEM + if (is_root_cache(s)) { + s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); + if (!s->memcg_kset) { + err = -ENOMEM; + goto out_del_kobj; + } } +#endif + kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); - kfree(name); } - return 0; +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); +out_put_kobj: + kobject_put(&s->kobj); + goto out; } -static void sysfs_slab_remove(struct kmem_cache *s) +void sysfs_slab_remove(struct kmem_cache *s) { - if (slab_state < SYSFS) + if (slab_state < FULL) /* * Sysfs has not been setup yet so no need to remove the * cache from sysfs. */ return; +#ifdef CONFIG_MEMCG_KMEM + kset_unregister(s->memcg_kset); +#endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); @@ -5356,7 +5274,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) { struct saved_alias *al; - if (slab_state == SYSFS) { + if (slab_state == FULL) { /* * If we have a leftover link then remove it. 
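create_unique_id() above builds names of the form ":<flag letters>-<zero-padded size>", emitting the '-' separator only when at least one flag letter was written. A compact sketch with made-up flag bits and letters:

#include <stdio.h>
#include <stddef.h>

#define F_DMA		0x1u	/* illustrative stand-ins for cache flags */
#define F_RECLAIM	0x2u

static void unique_id(char *buf, size_t buflen, unsigned int flags, int size)
{
	char *p = buf;

	*p++ = ':';
	if (flags & F_DMA)
		*p++ = 'd';
	if (flags & F_RECLAIM)
		*p++ = 'a';
	if (p != buf + 1)	/* any flag letters emitted? */
		*p++ = '-';
	snprintf(p, buflen - (p - buf), "%07d", size);
}

int main(void)
{
	char name[64];

	unique_id(name, sizeof(name), F_DMA, 192);
	printf("%s\n", name);	/* prints ":d-0000192" */
	return 0;
}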
*/ @@ -5380,16 +5298,16 @@ static int __init slab_sysfs_init(void) struct kmem_cache *s; int err; - down_write(&slub_lock); + mutex_lock(&slab_mutex); slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { - up_write(&slub_lock); + mutex_unlock(&slab_mutex); printk(KERN_ERR "Cannot register slab subsystem.\n"); return -ENOSYS; } - slab_state = SYSFS; + slab_state = FULL; list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); @@ -5405,11 +5323,11 @@ static int __init slab_sysfs_init(void) err = sysfs_slab_alias(al->s, al->name); if (err) printk(KERN_ERR "SLUB: Unable to add boot slab alias" - " %s to sysfs\n", s->name); + " %s to sysfs\n", al->name); kfree(al); } - up_write(&slub_lock); + mutex_unlock(&slab_mutex); resiliency_test(); return 0; } @@ -5421,96 +5339,39 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO -static void print_slabinfo_header(struct seq_file *m) -{ - seq_puts(m, "slabinfo - version: 2.1\n"); - seq_puts(m, "# name <active_objs> <num_objs> <objsize> " - "<objperslab> <pagesperslab>"); - seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); - seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); - seq_putc(m, '\n'); -} - -static void *s_start(struct seq_file *m, loff_t *pos) -{ - loff_t n = *pos; - - down_read(&slub_lock); - if (!n) - print_slabinfo_header(m); - - return seq_list_start(&slab_caches, *pos); -} - -static void *s_next(struct seq_file *m, void *p, loff_t *pos) +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - up_read(&slub_lock); -} - -static int s_show(struct seq_file *m, void *p) -{ - unsigned long nr_partials = 0; unsigned long nr_slabs = 0; - unsigned long nr_inuse = 0; unsigned long nr_objs = 0; unsigned long nr_free = 0; - struct kmem_cache *s; int node; - s = list_entry(p, struct kmem_cache, list); - for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); if (!n) continue; - nr_partials += n->nr_partial; - nr_slabs += atomic_long_read(&n->nr_slabs); - nr_objs += atomic_long_read(&n->total_objects); + nr_slabs += node_nr_slabs(n); + nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); } - nr_inuse = nr_objs - nr_free; - - seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, - nr_objs, s->size, oo_objects(s->oo), - (1 << oo_order(s->oo))); - seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); - seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, - 0UL); - seq_putc(m, '\n'); - return 0; + sinfo->active_objs = nr_objs - nr_free; + sinfo->num_objs = nr_objs; + sinfo->active_slabs = nr_slabs; + sinfo->num_slabs = nr_slabs; + sinfo->objects_per_slab = oo_objects(s->oo); + sinfo->cache_order = oo_order(s->oo); } -static const struct seq_operations slabinfo_op = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - -static int slabinfo_open(struct inode *inode, struct file *file) +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) { - return seq_open(file, &slabinfo_op); } -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init slab_proc_init(void) +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { - proc_create("slabinfo", S_IRUSR, NULL, 
&proc_slabinfo_operations); - return 0; + return -EIO; } -module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 1b7e22ab9b09..4cba9c2783a1 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); + return memblock_virt_alloc_try_nid(size, align, goal, + BOOTMEM_ALLOC_ACCESSIBLE, node); } static void *vmemmap_buf; @@ -53,10 +54,12 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) struct page *page; if (node_state(node, N_HIGH_MEMORY)) - page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, get_order(size)); + page = alloc_pages_node( + node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + get_order(size)); else - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + page = alloc_pages( + GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, get_order(size)); if (page) return page_address(page); @@ -145,11 +148,10 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) return pgd; } -int __meminit vmemmap_populate_basepages(struct page *start_page, - unsigned long size, int node) +int __meminit vmemmap_populate_basepages(unsigned long start, + unsigned long end, int node) { - unsigned long addr = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + size); + unsigned long addr = start; pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -176,9 +178,15 @@ int __meminit vmemmap_populate_basepages(struct page *start_page, struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) { - struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); - int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); - if (error) + unsigned long start; + unsigned long end; + struct page *map; + + map = pfn_to_page(pnum * PAGES_PER_SECTION); + start = (unsigned long)map; + end = (unsigned long)(map + PAGES_PER_SECTION); + + if (vmemmap_populate(start, end, nid)) return NULL; return map; @@ -219,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (vmemmap_buf_start) { /* need to free left buf */ - free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); + memblock_free_early(__pa(vmemmap_buf), + vmemmap_buf_end - vmemmap_buf); vmemmap_buf = NULL; vmemmap_buf_end = NULL; } diff --git a/mm/sparse.c b/mm/sparse.c index a8bc7d364deb..d1b48b691ac8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -5,10 +5,12 @@ #include <linux/slab.h> #include <linux/mmzone.h> #include <linux/bootmem.h> +#include <linux/compiler.h> #include <linux/highmem.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> + #include "internal.h" #include <asm/dma.h> #include <asm/pgalloc.h> @@ -65,24 +67,20 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) if (slab_is_available()) { if (node_state(nid, N_HIGH_MEMORY)) - section = kmalloc_node(array_size, GFP_KERNEL, nid); + section = kzalloc_node(array_size, GFP_KERNEL, nid); else - section = kmalloc(array_size, GFP_KERNEL); - } else - section = alloc_bootmem_node(NODE_DATA(nid), array_size); - - if (section) - memset(section, 0, array_size); + section = kzalloc(array_size, GFP_KERNEL); + } else { + section = memblock_virt_alloc_node(array_size, nid); + } return section; } static int __meminit sparse_index_init(unsigned long section_nr, int nid) { - static DEFINE_SPINLOCK(index_init_lock); unsigned long root = 
SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; - int ret = 0; if (mem_section[root]) return -EEXIST; @@ -90,21 +88,10 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) section = sparse_index_alloc(nid); if (!section) return -ENOMEM; - /* - * This lock keeps two different sections from - * reallocating for the same index - */ - spin_lock(&index_init_lock); - - if (mem_section[root]) { - ret = -EEXIST; - goto out; - } mem_section[root] = section; -out: - spin_unlock(&index_init_lock); - return ret; + + return 0; } #else /* !SPARSEMEM_EXTREME */ static inline int sparse_index_init(unsigned long section_nr, int nid) @@ -132,6 +119,8 @@ int __section_nr(struct mem_section* ms) break; } + VM_BUG_ON(root_nr == NR_SECTION_ROOTS); + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } @@ -273,22 +262,33 @@ static unsigned long *__kmalloc_section_usemap(void) #ifdef CONFIG_MEMORY_HOTREMOVE static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - unsigned long count) + unsigned long size) { - unsigned long section_nr; - + unsigned long goal, limit; + unsigned long *p; + int nid; /* * A page may contain usemaps for other sections preventing the * page being freed and making a section unremovable while - * other sections referencing the usemap retmain active. Similarly, + * other sections referencing the usemap remain active. Similarly, * a pgdat can prevent a section being removed. If section A * contains a pgdat and section B contains the usemap, both * sections become inter-dependent. This allocates usemaps * from the same section as the pgdat where possible to avoid * this problem. */ - section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); - return alloc_bootmem_section(usemap_size() * count, section_nr); + goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); + limit = goal + (1UL << PA_SECTION_SHIFT); + nid = early_pfn_to_nid(goal >> PAGE_SHIFT); +again: + p = memblock_virt_alloc_try_nid_nopanic(size, + SMP_CACHE_BYTES, goal, limit, + nid); + if (!p && limit) { + limit = 0; + goto again; + } + return p; } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -332,9 +332,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) #else static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, - unsigned long count) + unsigned long size) { - return NULL; + return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -342,23 +342,21 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, +static void __init sparse_early_usemaps_alloc_node(void *data, unsigned long pnum_begin, unsigned long pnum_end, unsigned long usemap_count, int nodeid) { void *usemap; unsigned long pnum; + unsigned long **usemap_map = (unsigned long **)data; int size = usemap_size(); usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), - usemap_count); + size * usemap_count); if (!usemap) { - usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); - if (!usemap) { - printk(KERN_WARNING "%s: allocation failed\n", __func__); - return; - } + printk(KERN_WARNING "%s: allocation failed\n", __func__); + return; } for (pnum = pnum_begin; pnum < pnum_end; pnum++) { @@ -381,8 +379,9 @@ struct page __init 
*sparse_mem_map_populate(unsigned long pnum, int nid) return map; size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); - map = __alloc_bootmem_node_high(NODE_DATA(nid), size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + map = memblock_virt_alloc_try_nid(size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); return map; } void __init sparse_mem_maps_populate_node(struct page **map_map, @@ -406,8 +405,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } size = PAGE_ALIGN(size); - map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + map = memblock_virt_alloc_try_nid(size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nodeid); if (map) { for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) @@ -436,11 +436,12 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER -static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, +static void __init sparse_early_mem_maps_alloc_node(void *data, unsigned long pnum_begin, unsigned long pnum_end, unsigned long map_count, int nodeid) { + struct page **map_map = (struct page **)data; sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, map_count, nodeid); } @@ -462,45 +463,22 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) } #endif -void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) +void __weak __meminit vmemmap_populate_print_last(void) { } -/* - * Allocate the accumulated non-linear sections, allocate a mem_map - * for each and record the physical to section mapping. +/** + * alloc_usemap_and_memmap - memory allocation for pageblock flags and vmemmap + * @map: usemap_map for pageblock flags or map_map for vmemmap */ -void __init sparse_init(void) +static void __init alloc_usemap_and_memmap(void (*alloc_func) + (void *, unsigned long, unsigned long, + unsigned long, int), void *data) { unsigned long pnum; - struct page *map; - unsigned long *usemap; - unsigned long **usemap_map; - int size; + unsigned long map_count; int nodeid_begin = 0; unsigned long pnum_begin = 0; - unsigned long usemap_count; -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - unsigned long map_count; - int size2; - struct page **map_map; -#endif - - /* - * map is using big page (aka 2M in x86 64 bit) - * usemap is less one page (aka 24 bytes) - * so alloc 2M (with 2M align) and 24 bytes in turn will - * make next 2M slip to one more 2M later. - * then in big system, the memory will have a lot of holes... - * here try to allocate 2M pages continuously. - * - * powerpc need to call sparse_init_one_section right after each - * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
- */ - size = sizeof(unsigned long *) * NR_MEM_SECTIONS; - usemap_map = alloc_bootmem(size); - if (!usemap_map) - panic("can not allocate usemap_map\n"); for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { struct mem_section *ms; @@ -512,7 +490,7 @@ void __init sparse_init(void) pnum_begin = pnum; break; } - usemap_count = 1; + map_count = 1; for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { struct mem_section *ms; int nodeid; @@ -522,61 +500,69 @@ void __init sparse_init(void) ms = __nr_to_section(pnum); nodeid = sparse_early_nid(ms); if (nodeid == nodeid_begin) { - usemap_count++; + map_count++; continue; } /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, - usemap_count, nodeid_begin); + alloc_func(data, pnum_begin, pnum, + map_count, nodeid_begin); /* new start, update count etc*/ nodeid_begin = nodeid; pnum_begin = pnum; - usemap_count = 1; + map_count = 1; } /* ok, last chunk */ - sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, - usemap_count, nodeid_begin); + alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + map_count, nodeid_begin); +} +/* + * Allocate the accumulated non-linear sections, allocate a mem_map + * for each and record the physical to section mapping. + */ +void __init sparse_init(void) +{ + unsigned long pnum; + struct page *map; + unsigned long *usemap; + unsigned long **usemap_map; + int size; #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - size2 = sizeof(struct page *) * NR_MEM_SECTIONS; - map_map = alloc_bootmem(size2); - if (!map_map) - panic("can not allocate map_map\n"); + int size2; + struct page **map_map; +#endif - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - struct mem_section *ms; + /* see include/linux/mmzone.h 'struct mem_section' definition */ + BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); - if (!present_section_nr(pnum)) - continue; - ms = __nr_to_section(pnum); - nodeid_begin = sparse_early_nid(ms); - pnum_begin = pnum; - break; - } - map_count = 1; - for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { - struct mem_section *ms; - int nodeid; + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ + set_pageblock_order(); - if (!present_section_nr(pnum)) - continue; - ms = __nr_to_section(pnum); - nodeid = sparse_early_nid(ms); - if (nodeid == nodeid_begin) { - map_count++; - continue; - } - /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, - map_count, nodeid_begin); - /* new start, update count etc*/ - nodeid_begin = nodeid; - pnum_begin = pnum; - map_count = 1; - } - /* ok, last chunk */ - sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, - map_count, nodeid_begin); + /* + * map is using big page (aka 2M in x86 64 bit) + * usemap is less one page (aka 24 bytes) + * so alloc 2M (with 2M align) and 24 bytes in turn will + * make next 2M slip to one more 2M later. + * then in big system, the memory will have a lot of holes... + * here try to allocate 2M pages continuously. + * + * powerpc need to call sparse_init_one_section right after each + * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
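alloc_usemap_and_memmap() above factors out the scan its two callers used to duplicate: walk the sections, batch consecutive ones that belong to the same node, and invoke the callback once per batch. The same loop on a plain array looks like this (node_of[] is invented, and unlike the real walk this sketch does not skip absent sections):

#include <stdio.h>

static void alloc_chunk(int node, unsigned long begin, unsigned long end)
{
	printf("node %d: sections [%lu, %lu)\n", node, begin, end);
}

int main(void)
{
	int node_of[] = { 0, 0, 0, 1, 1, 2, 2, 2 };
	unsigned long n = sizeof(node_of) / sizeof(node_of[0]);
	unsigned long pnum, begin = 0;

	for (pnum = 1; pnum <= n; pnum++) {
		/* extend the run while the node id stays the same */
		if (pnum < n && node_of[pnum] == node_of[begin])
			continue;
		alloc_chunk(node_of[begin], begin, pnum);
		begin = pnum;
	}
	return 0;
}

This prints [0, 3), [3, 5) and [5, 8): one callback invocation per node, which is what lets each node's usemaps (or memmaps) come from a single contiguous allocation.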
+ */ + size = sizeof(unsigned long *) * NR_MEM_SECTIONS; + usemap_map = memblock_virt_alloc(size, 0); + if (!usemap_map) + panic("can not allocate usemap_map\n"); + alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, + (void *)usemap_map); + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + size2 = sizeof(struct page *) * NR_MEM_SECTIONS; + map_map = memblock_virt_alloc(size2, 0); + if (!map_map) + panic("can not allocate map_map\n"); + alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, + (void *)map_map); #endif for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { @@ -602,31 +588,39 @@ void __init sparse_init(void) vmemmap_populate_print_last(); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - free_bootmem(__pa(map_map), size2); + memblock_free_early(__pa(map_map), size2); #endif - free_bootmem(__pa(usemap_map), size); + memblock_free_early(__pa(usemap_map), size); } #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, - unsigned long nr_pages) +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { /* This will make the necessary allocations eventually. */ return sparse_mem_map_populate(pnum, nid); } -static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +static void __kfree_section_memmap(struct page *memmap) { - return; /* XXX: Not implemented yet */ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + + vmemmap_free(start, end); } -static void free_map_bootmem(struct page *page, unsigned long nr_pages) +#ifdef CONFIG_MEMORY_HOTREMOVE +static void free_map_bootmem(struct page *memmap) { + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + + vmemmap_free(start, end); } +#endif /* CONFIG_MEMORY_HOTREMOVE */ #else -static struct page *__kmalloc_section_memmap(unsigned long nr_pages) +static struct page *__kmalloc_section_memmap(void) { struct page *page, *ret; - unsigned long memmap_size = sizeof(struct page) * nr_pages; + unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); if (page) @@ -640,30 +634,33 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages) got_map_page: ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); got_map_ptr: - memset(ret, 0, memmap_size); return ret; } -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, - unsigned long nr_pages) +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { - return __kmalloc_section_memmap(nr_pages); + return __kmalloc_section_memmap(); } -static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +static void __kfree_section_memmap(struct page *memmap) { if (is_vmalloc_addr(memmap)) vfree(memmap); else free_pages((unsigned long)memmap, - get_order(sizeof(struct page) * nr_pages)); + get_order(sizeof(struct page) * PAGES_PER_SECTION)); } -static void free_map_bootmem(struct page *page, unsigned long nr_pages) +#ifdef CONFIG_MEMORY_HOTREMOVE +static void free_map_bootmem(struct page *memmap) { unsigned long maps_section_nr, removing_section_nr, i; - unsigned long magic; + unsigned long magic, nr_pages; + struct page *page = virt_to_page(memmap); + + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; for (i = 0; i < nr_pages; i++, page++) { magic = (unsigned long) 
page->lru.next; @@ -685,50 +682,15 @@ static void free_map_bootmem(struct page *page, unsigned long nr_pages) put_page_bootmem(page); } } +#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static void free_section_usemap(struct page *memmap, unsigned long *usemap) -{ - struct page *usemap_page; - unsigned long nr_pages; - - if (!usemap) - return; - - usemap_page = virt_to_page(usemap); - /* - * Check to see if allocation came from hot-plug-add - */ - if (PageSlab(usemap_page)) { - kfree(usemap); - if (memmap) - __kfree_section_memmap(memmap, PAGES_PER_SECTION); - return; - } - - /* - * The usemap came from bootmem. This is packed with other usemaps - * on the section which has pgdat at boot time. Just keep it as is now. - */ - - if (memmap) { - struct page *memmap_page; - memmap_page = virt_to_page(memmap); - - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) - >> PAGE_SHIFT; - - free_map_bootmem(memmap_page, nr_pages); - } -} - /* * returns the number of sections whose mem_maps were properly * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, - int nr_pages) +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct pglist_data *pgdat = zone->zone_pgdat; @@ -745,12 +707,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, ret = sparse_index_init(section_nr, pgdat->node_id); if (ret < 0 && ret != -EEXIST) return ret; - memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); + memmap = kmalloc_section_memmap(section_nr, pgdat->node_id); if (!memmap) return -ENOMEM; usemap = __kmalloc_section_usemap(); if (!usemap) { - __kfree_section_memmap(memmap, nr_pages); + __kfree_section_memmap(memmap); return -ENOMEM; } @@ -762,6 +724,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, goto out; } + memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); + ms->section_mem_map |= SECTION_MARKED_PRESENT; ret = sparse_init_one_section(ms, section_nr, memmap, usemap); @@ -770,16 +734,67 @@ out: pgdat_resize_unlock(pgdat, &flags); if (ret <= 0) { kfree(usemap); - __kfree_section_memmap(memmap, nr_pages); + __kfree_section_memmap(memmap); } return ret; } +#ifdef CONFIG_MEMORY_HOTREMOVE +#ifdef CONFIG_MEMORY_FAILURE +static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ + int i; + + if (!memmap) + return; + + for (i = 0; i < PAGES_PER_SECTION; i++) { + if (PageHWPoison(&memmap[i])) { + atomic_long_sub(1, &num_poisoned_pages); + ClearPageHWPoison(&memmap[i]); + } + } +} +#else +static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ +} +#endif + +static void free_section_usemap(struct page *memmap, unsigned long *usemap) +{ + struct page *usemap_page; + + if (!usemap) + return; + + usemap_page = virt_to_page(usemap); + /* + * Check to see if allocation came from hot-plug-add + */ + if (PageSlab(usemap_page) || PageCompound(usemap_page)) { + kfree(usemap); + if (memmap) + __kfree_section_memmap(memmap); + return; + } + + /* + * The usemap came from bootmem. This is packed with other usemaps + * on the section which has pgdat at boot time. Just keep it as is now. 
+ */ + + if (memmap) + free_map_bootmem(memmap); +} + void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) { struct page *memmap = NULL; - unsigned long *usemap = NULL; + unsigned long *usemap = NULL, flags; + struct pglist_data *pgdat = zone->zone_pgdat; + pgdat_resize_lock(pgdat, &flags); if (ms->section_mem_map) { usemap = ms->pageblock_flags; memmap = sparse_decode_mem_map(ms->section_mem_map, @@ -787,7 +802,10 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) ms->section_mem_map = 0; ms->pageblock_flags = NULL; } + pgdat_resize_unlock(pgdat, &flags); + clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); free_section_usemap(memmap, usemap); } -#endif +#endif /* CONFIG_MEMORY_HOTREMOVE */ +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/swap.c b/mm/swap.c index 5c13f1338972..9ce43ba4498b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,13 +30,17 @@ #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> +#include <linux/uio.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/pagemap.h> + /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); @@ -47,13 +51,15 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); static void __page_cache_release(struct page *page) { if (PageLRU(page)) { - unsigned long flags; struct zone *zone = page_zone(page); + struct lruvec *lruvec; + unsigned long flags; spin_lock_irqsave(&zone->lru_lock, flags); - VM_BUG_ON(!PageLRU(page)); + lruvec = mem_cgroup_page_lruvec(page, zone); + VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); - del_page_from_lru_list(zone, page, page_off_lru(page)); + del_page_from_lru_list(page, lruvec, page_off_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } } @@ -75,62 +81,150 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { - if (unlikely(PageTail(page))) { - /* __split_huge_page_refcount can run under us */ - struct page *page_head = compound_trans_head(page); + struct page *page_head; - if (likely(page != page_head && - get_page_unless_zero(page_head))) { - unsigned long flags; + if (likely(!PageTail(page))) { + if (put_page_testzero(page)) { /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. + * By the time all refcounts have been released + * split_huge_page cannot run anymore from under us. */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - VM_BUG_ON(PageHead(page_head)); - if (put_page_testzero(page_head)) - __put_single_page(page_head); - out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); + } + return; + } + + /* __split_huge_page_refcount can run under us */ + page_head = compound_head(page); + + /* + * THP can not break up slab pages so avoid taking + * compound_lock() and skip the tail page refcounting (in + * _mapcount) too. Slab performs non-atomic bit ops on + * page->flags for better performance. 
In particular + slab_unlock() in slub used to be a hot path. It is still + hot on arches that do not support + this_cpu_cmpxchg_double(). + * + * If "page" is part of a slab or hugetlbfs page it cannot be + split and the head page cannot change from under us. And + if "page" is part of a THP page under splitting, if the + head page pointed to by the THP tail isn't a THP head anymore, + we'll find PageTail clear after smp_rmb() and we'll treat + it as a single page. + */ + if (!__compound_tail_refcounted(page_head)) { + /* + * If "page" is a THP tail, we must read the tail page + * flags after the head page flags. The + * split_huge_page side enforces write memory barriers + * between clearing PageTail and before the head page + * can be freed and reallocated. + */ + smp_rmb(); + if (likely(PageTail(page))) { + /* + * __split_huge_page_refcount cannot race + * here. + */ + VM_BUG_ON_PAGE(!PageHead(page_head), page_head); + VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); + if (put_page_testzero(page_head)) { + /* + * If this is the tail of a slab + * compound page, the tail pin must + * not be the last reference held on + * the page, because PG_slab + * cannot be cleared before all tail + * pins (which skip the _mapcount + * tail refcounting) have been + * released. For hugetlbfs the tail + * pin may be the last reference on + * the page instead, because + * PageHeadHuge will not go away until + * the compound page enters the buddy + * allocator. + */ + VM_BUG_ON_PAGE(PageSlab(page_head), page_head); + __put_compound_page(page_head); } - VM_BUG_ON(page_head != page->first_page); + return; + } else /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on - * the compound_lock. + * __split_huge_page_refcount run before us, + * "page" was a THP tail. The split page_head + * has been freed and reallocated as slab or + * hugetlbfs page of smaller order (only + * possible if reallocated as slab on x86). */ - if (put_page_testzero(page_head)) - VM_BUG_ON(1); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON(page_mapcount(page) <= 0); - atomic_dec(&page->_mapcount); - VM_BUG_ON(atomic_read(&page_head->_count) <= 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); + goto out_put_single; + } + + if (likely(page != page_head && get_page_unless_zero(page_head))) { + unsigned long flags; + + /* + * page_head wasn't a dangling pointer but it may not + * be a head page anymore by the time we obtain the + * lock. That is ok as long as it can't be freed from + * under us. + */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { + /* + * The head page may have been freed + * and reallocated as a compound page + * of smaller order and then freed + * again. All we know is that it + * cannot have become: a THP page, a + * compound page of higher order, a + * tail page. That is because we + * still hold the refcount of the + * split THP tail and page_head was + * the THP head before the split.
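For background on the code above: in kernels of this era a tail page records its head in page->first_page (the patch's VM_BUG_ON_PAGE(page_head != page->first_page, page) checks exactly that), so head lookup is conceptually just the following sketch; the surrounding code exists because this read can race with __split_huge_page_refcount and with the head being freed:

/* Simplified sketch of compound_head() as it worked at this time;
 * the real caller must guard against the head changing under it. */
static inline struct page *compound_head_sk(struct page *page)
{
	if (unlikely(PageTail(page)))
		return page->first_page;	/* set when the compound page was built */
	return page;
}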
+ */ if (PageHead(page_head)) __put_compound_page(page_head); else __put_single_page(page_head); } - } else { - /* page_head is a dangling pointer */ - VM_BUG_ON(PageTail(page)); - goto out_put_single; +out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; } - } else if (put_page_testzero(page)) { - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); + VM_BUG_ON_PAGE(page_head != page->first_page, page); + /* + * We can release the refcount taken by + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on the + * compound_lock. + */ + if (put_page_testzero(page_head)) + VM_BUG_ON_PAGE(1, page_head); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); + atomic_dec(&page->_mapcount); + VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); + VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); + compound_unlock_irqrestore(page_head, flags); + + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON_PAGE(PageTail(page), page); + goto out_put_single; } } @@ -158,9 +252,36 @@ bool __get_page_tail(struct page *page) * split_huge_page(). */ unsigned long flags; - bool got = false; - struct page *page_head = compound_trans_head(page); + bool got; + struct page *page_head = compound_head(page); + /* Ref to put_compound_page() comment. */ + if (!__compound_tail_refcounted(page_head)) { + smp_rmb(); + if (likely(PageTail(page))) { + /* + * This is a hugetlbfs page or a slab + * page. __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON_PAGE(!PageHead(page_head), page_head); + __get_page_tail_foll(page, true); + return true; + } else { + /* + * __split_huge_page_refcount run + * before us, "page" was a THP + * tail. The split page_head has been + * freed and reallocated as slab or + * hugetlbfs page of smaller order + * (only possible if reallocated as + * slab on x86). + */ + return false; + } + } + + got = false; if (likely(page != page_head && get_page_unless_zero(page_head))) { /* * page_head wasn't a dangling pointer but it @@ -201,12 +322,65 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); +/* + * get_kernel_pages() - pin kernel pages in memory + * @kiov: An array of struct kvec structures + * @nr_segs: number of segments to pin + * @write: pinning for read/write, currently ignored + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_segs long. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. + */ +int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, + struct page **pages) +{ + int seg; + + for (seg = 0; seg < nr_segs; seg++) { + if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) + return seg; + + pages[seg] = kmap_to_page(kiov[seg].iov_base); + page_cache_get(pages[seg]); + } + + return seg; +} +EXPORT_SYMBOL_GPL(get_kernel_pages); + +/* + * get_kernel_page() - pin a kernel page in memory + * @start: starting kernel address + * @write: pinning for read/write, currently ignored + * @pages: array that receives pointer to the page pinned. + * Must be at least nr_segs long. + * + * Returns 1 if page is pinned. 
If the page was not pinned, returns + * -errno. The page returned must be released with a put_page() call + * when it is finished with. + */ +int get_kernel_page(unsigned long start, int write, struct page **pages) +{ + const struct kvec kiov = { + .iov_base = (void *)start, + .iov_len = PAGE_SIZE + }; + + return get_kernel_pages(&kiov, 1, write, pages); +} +EXPORT_SYMBOL_GPL(get_kernel_page); + static void pagevec_lru_move_fn(struct pagevec *pvec, - void (*move_fn)(struct page *page, void *arg), - void *arg) + void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), + void *arg) { int i; struct zone *zone = NULL; + struct lruvec *lruvec; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { @@ -220,7 +394,8 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, spin_lock_irqsave(&zone->lru_lock, flags); } - (*move_fn)(page, arg); + lruvec = mem_cgroup_page_lruvec(page, zone); + (*move_fn)(page, lruvec, arg); } if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); @@ -228,16 +403,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, pagevec_reinit(pvec); } -static void pagevec_move_tail_fn(struct page *page, void *arg) +static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, + void *arg) { int *pgmoved = arg; if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); - struct lruvec *lruvec; - - lruvec = mem_cgroup_lru_move_lists(page_zone(page), - page, lru, lru); list_move_tail(&page->lru, &lruvec->lists[lru]); (*pgmoved)++; } @@ -276,41 +448,31 @@ void rotate_reclaimable_page(struct page *page) } } -static void update_page_reclaim_stat(struct zone *zone, struct page *page, +static void update_page_reclaim_stat(struct lruvec *lruvec, int file, int rotated) { - struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; - struct zone_reclaim_stat *memcg_reclaim_stat; - - memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; reclaim_stat->recent_scanned[file]++; if (rotated) reclaim_stat->recent_rotated[file]++; - - if (!memcg_reclaim_stat) - return; - - memcg_reclaim_stat->recent_scanned[file]++; - if (rotated) - memcg_reclaim_stat->recent_rotated[file]++; } -static void __activate_page(struct page *page, void *arg) +static void __activate_page(struct page *page, struct lruvec *lruvec, + void *arg) { - struct zone *zone = page_zone(page); - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { int file = page_is_file_cache(page); int lru = page_lru_base_type(page); - del_page_from_lru_list(zone, page, lru); + del_page_from_lru_list(page, lruvec, lru); SetPageActive(page); lru += LRU_ACTIVE; - add_page_to_lru_list(zone, page, lru); - __count_vm_event(PGACTIVATE); + add_page_to_lru_list(page, lruvec, lru); + trace_mm_lru_activate(page, page_to_pfn(page)); - update_page_reclaim_stat(zone, page, file, 1); + __count_vm_event(PGACTIVATE); + update_page_reclaim_stat(lruvec, file, 1); } } @@ -325,6 +487,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -342,16 +509,48 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct 
page *page) { struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - __activate_page(page, NULL); + __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); spin_unlock_irq(&zone->lru_lock); } #endif +static void __lru_cache_activate_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + int i; + + /* + * Search backwards on the optimistic assumption that the page being + * activated has just been added to this pagevec. Note that only + * the local pagevec is examined as a !PageLRU page could be in the + * process of being released, reclaimed, migrated or on a remote + * pagevec that is currently being drained. Furthermore, marking + * a remote pagevec's page PageActive potentially hits a race where + * a page is marked PageActive just after it is added to the inactive + * list causing accounting errors and BUG_ON checks to trigger. + */ + for (i = pagevec_count(pvec) - 1; i >= 0; i--) { + struct page *pagevec_page = pvec->pages[i]; + + if (pagevec_page == page) { + SetPageActive(page); + break; + } + } + + put_cpu_var(lru_add_pvec); +} + /* * Mark a page as having seen activity. * @@ -362,43 +561,54 @@ void activate_page(struct page *page) void mark_page_accessed(struct page *page) { if (!PageActive(page) && !PageUnevictable(page) && - PageReferenced(page) && PageLRU(page)) { - activate_page(page); + PageReferenced(page)) { + + /* + * If the page is on the LRU, queue it for activation via + * activate_page_pvecs. Otherwise, assume the page is on a + * pagevec, mark it active and it'll be moved to the active + * LRU on the next drain. + */ + if (PageLRU(page)) + activate_page(page); + else + __lru_cache_activate_page(page); ClearPageReferenced(page); + if (page_is_file_cache(page)) + workingset_activation(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); } } EXPORT_SYMBOL(mark_page_accessed); -void __lru_cache_add(struct page *page, enum lru_list lru) +/* + * Queue the page for addition to the LRU via pagevec. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * pagevec is drained. This gives the caller of __lru_cache_add() a chance + * to have the page added to the active list using mark_page_accessed(). + */ +void __lru_cache_add(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); page_cache_get(page); - if (!pagevec_add(pvec, page)) - __pagevec_lru_add(pvec, lru); - put_cpu_var(lru_add_pvecs); + if (!pagevec_space(pvec)) + __pagevec_lru_add(pvec); + pagevec_add(pvec, page); + put_cpu_var(lru_add_pvec); } EXPORT_SYMBOL(__lru_cache_add); /** - * lru_cache_add_lru - add a page to a page list + * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. - * @lru: the LRU list to which the page is added. */
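The per-cpu pagevec used by __lru_cache_add() is a plain fill-then-drain batch; a minimal sketch of the discipline with hypothetical names (note the patch drains before adding, so the newest page stays findable in the local pagevec by mark_page_accessed()):

/* Sketch of the fill-then-drain batching in __lru_cache_add();
 * hypothetical types, PVEC_SZ mirrors PAGEVEC_SIZE (14 here). */
#define PVEC_SZ 14

struct pvec_sk {
	unsigned long nr;
	void *pages[PVEC_SZ];
};

static void pvec_add_sk(struct pvec_sk *pvec, void *page,
			void (*drain)(struct pvec_sk *))
{
	if (pvec->nr == PVEC_SZ)
		drain(pvec);		/* flushes and resets nr to 0 */
	pvec->pages[pvec->nr++] = page;	/* newest page stays local */
}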
-void lru_cache_add_lru(struct page *page, enum lru_list lru) +void lru_cache_add(struct page *page) { - if (PageActive(page)) { - VM_BUG_ON(PageUnevictable(page)); - ClearPageActive(page); - } else if (PageUnevictable(page)) { - VM_BUG_ON(PageActive(page)); - ClearPageUnevictable(page); - } - - VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); - __lru_cache_add(page, lru); + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + __lru_cache_add(page); } /** @@ -414,11 +624,14 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru) void add_page_to_unevictable_list(struct page *page) { struct zone *zone = page_zone(page); + struct lruvec *lruvec; spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); + ClearPageActive(page); SetPageUnevictable(page); SetPageLRU(page); - add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); + add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); spin_unlock_irq(&zone->lru_lock); } @@ -443,11 +656,11 @@ void add_page_to_unevictable_list(struct page *page) * be written out by flusher threads as this is much more effective * than the single-page writeout from reclaim. */ -static void lru_deactivate_fn(struct page *page, void *arg) +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) { int lru, file; bool active; - struct zone *zone = page_zone(page); if (!PageLRU(page)) return; @@ -460,13 +673,13 @@ static void lru_deactivate_fn(struct page *page, void *arg) return; active = PageActive(page); - file = page_is_file_cache(page); lru = page_lru_base_type(page); - del_page_from_lru_list(zone, page, lru + active); + + del_page_from_lru_list(page, lruvec, lru + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(zone, page, lru); + add_page_to_lru_list(page, lruvec, lru); if (PageWriteback(page) || PageDirty(page)) { /* @@ -476,19 +689,17 @@ static void lru_deactivate_fn(struct page *page, void *arg) */ SetPageReclaim(page); } else { - struct lruvec *lruvec; /* * The page's writeback ended while it was on the pagevec: * move the page to the tail of the inactive list.
*/ - lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru); list_move_tail(&page->lru, &lruvec->lists[lru]); __count_vm_event(PGROTATED); } if (active) __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(zone, page, file, 0); + update_page_reclaim_stat(lruvec, file, 0); } /* @@ -498,15 +709,10 @@ static void lru_deactivate_fn(struct page *page, void *arg) */ void lru_add_drain_cpu(int cpu) { - struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); - struct pagevec *pvec; - int lru; + struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); - for_each_lru(lru) { - pvec = &pvecs[lru - LRU_BASE]; - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec, lru); - } + if (pagevec_count(pvec)) + __pagevec_lru_add(pvec); pvec = &per_cpu(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { @@ -562,12 +768,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) lru_add_drain(); } -/* - * Returns 0 for success - */ -int lru_add_drain_all(void) +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); + +void lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + static DEFINE_MUTEX(lock); + static struct cpumask has_work; + int cpu; + + mutex_lock(&lock); + get_online_cpus(); + cpumask_clear(&has_work); + + for_each_online_cpu(cpu) { + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); + + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) { + INIT_WORK(work, lru_add_drain_per_cpu); + schedule_work_on(cpu, work); + cpumask_set_cpu(cpu, &has_work); + } + } + + for_each_cpu(cpu, &has_work) + flush_work(&per_cpu(lru_add_drain_work, cpu)); + + put_online_cpus(); + mutex_unlock(&lock); } /* @@ -588,6 +818,7 @@ void release_pages(struct page **pages, int nr, int cold) int i; LIST_HEAD(pages_to_free); struct zone *zone = NULL; + struct lruvec *lruvec; unsigned long uninitialized_var(flags); for (i = 0; i < nr; i++) { @@ -615,11 +846,16 @@ void release_pages(struct page **pages, int nr, int cold) zone = pagezone; spin_lock_irqsave(&zone->lru_lock, flags); } - VM_BUG_ON(!PageLRU(page)); + + lruvec = mem_cgroup_page_lruvec(page, zone); + VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); - del_page_from_lru_list(zone, page, page_off_lru(page)); + del_page_from_lru_list(page, lruvec, page_off_lru(page)); } + /* Clear Active bit in case of parallel mark_page_accessed */ + ClearPageActive(page); + list_add(&page->lru, &pages_to_free); } if (zone) @@ -649,37 +885,27 @@ EXPORT_SYMBOL(__pagevec_release); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* used by __split_huge_page_refcount() */ -void lru_add_page_tail(struct zone* zone, - struct page *page, struct page *page_tail) +void lru_add_page_tail(struct page *page, struct page *page_tail, + struct lruvec *lruvec, struct list_head *list) { - int uninitialized_var(active); - enum lru_list lru; const int file = 0; - VM_BUG_ON(!PageHead(page)); - VM_BUG_ON(PageCompound(page_tail)); - VM_BUG_ON(PageLRU(page_tail)); - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); - - SetPageLRU(page_tail); + VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_PAGE(PageCompound(page_tail), page); + VM_BUG_ON_PAGE(PageLRU(page_tail), page); + VM_BUG_ON(NR_CPUS != 1 && + !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); - if (page_evictable(page_tail, NULL)) { - if (PageActive(page)) { - SetPageActive(page_tail); - active = 1; - lru = LRU_ACTIVE_ANON; - } else { - active = 0; - lru = 
LRU_INACTIVE_ANON; - } - } else { - SetPageUnevictable(page_tail); - lru = LRU_UNEVICTABLE; - } + if (!list) + SetPageLRU(page_tail); if (likely(PageLRU(page))) list_add_tail(&page_tail->lru, &page->lru); - else { + else if (list) { + /* page reclaim is reclaiming a huge page */ + get_page(page_tail); + list_add_tail(&page_tail->lru, list); + } else { struct list_head *list_head; /* * Head page has not yet been counted, as an hpage, @@ -688,47 +914,93 @@ void lru_add_page_tail(struct zone* zone, * Use the standard add function to put page_tail on the list, * but then correct its position so they all end up in order. */ - add_page_to_lru_list(zone, page_tail, lru); + add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); list_head = page_tail->lru.prev; list_move_tail(&page_tail->lru, list_head); } if (!PageUnevictable(page)) - update_page_reclaim_stat(zone, page_tail, file, active); + update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void __pagevec_lru_add_fn(struct page *page, void *arg) +static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, + void *arg) { - enum lru_list lru = (enum lru_list)arg; - struct zone *zone = page_zone(page); - int file = is_file_lru(lru); - int active = is_active_lru(lru); + int file = page_is_file_cache(page); + int active = PageActive(page); + enum lru_list lru = page_lru(page); - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); - if (active) - SetPageActive(page); - add_page_to_lru_list(zone, page, lru); - update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(page, lruvec, lru); + update_page_reclaim_stat(lruvec, file, active); + trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); } /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +void __pagevec_lru_add(struct pagevec *pvec) { - VM_BUG_ON(is_unevictable_lru(lru)); - - pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); + pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); } EXPORT_SYMBOL(__pagevec_lru_add); /** + * pagevec_lookup_entries - gang pagecache lookup + * @pvec: Where the resulting entries are placed + * @mapping: The address_space to search + * @start: The starting entry index + * @nr_entries: The maximum number of entries + * @indices: The cache indices corresponding to the entries in @pvec + * + * pagevec_lookup_entries() will search for and return a group of up + * to @nr_entries pages and shadow entries in the mapping. All + * entries are placed in @pvec. pagevec_lookup_entries() takes a + * reference against actual pages in @pvec. + * + * The search returns a group of mapping-contiguous entries with + * ascending indexes. There may be holes in the indices due to + * not-present entries. + * + * pagevec_lookup_entries() returns the number of entries which were + * found. 
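A typical consumer pairs these two helpers the way the truncate path does; a hedged sketch of the loop shape (simplified, no locking or error handling):

/* Sketch: walking a mapping with pagevec_lookup_entries() and
 * pruning shadow entries before page-only operations. */
static void walk_mapping_sk(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_entries(&pvec, mapping, index,
				      PAGEVEC_SIZE, indices)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			index = indices[i];	/* offset of this entry */
			/* shadow entries show up as
			 * radix_tree_exceptional_entry(pvec.pages[i]) */
		}
		pagevec_remove_exceptionals(&pvec); /* keep real pages only */
		/* ... page-only work on pvec.pages[0..pagevec_count) ... */
		pagevec_release(&pvec);		/* drop page references */
		index++;
	}
}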
+ */ +unsigned pagevec_lookup_entries(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t start, unsigned nr_pages, + pgoff_t *indices) +{ + pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->pages, indices); + return pagevec_count(pvec); +} + +/** + * pagevec_remove_exceptionals - pagevec exceptionals pruning + * @pvec: The pagevec to prune + * + * pagevec_lookup_entries() fills both pages and exceptional radix + * tree entries into the pagevec. This function prunes all + * exceptionals from @pvec without leaving holes, so that it can be + * passed on to page-only pagevec operations. + */ +void pagevec_remove_exceptionals(struct pagevec *pvec) +{ + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; +} + +/** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search @@ -767,9 +1039,15 @@ EXPORT_SYMBOL(pagevec_lookup_tag); void __init swap_setup(void) { unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); - #ifdef CONFIG_SWAP - bdi_init(swapper_space.backing_dev_info); + int i; + + if (bdi_init(swapper_spaces[0].backing_dev_info)) + panic("Failed to init swap bdi"); + for (i = 0; i < MAX_SWAPFILES; i++) { + spin_lock_init(&swapper_spaces[i].tree_lock); + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); + } #endif /* Use a smaller cluster for small-memory machines */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 4c5ff7f284d9..e76ace30d436 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/pagemap.h> #include <linux/backing-dev.h> +#include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/migrate.h> #include <linux/page_cgroup.h> @@ -26,7 +27,7 @@ */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .set_page_dirty = __set_page_dirty_no_writeback, + .set_page_dirty = swap_set_page_dirty, .migratepage = migrate_page, }; @@ -35,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, }; -struct address_space swapper_space = { - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), - .a_ops = &swap_aops, - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), - .backing_dev_info = &swap_backing_dev_info, +struct address_space swapper_spaces[MAX_SWAPFILES] = { + [0 ... 
MAX_SWAPFILES - 1] = { + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .a_ops = &swap_aops, + .backing_dev_info = &swap_backing_dev_info, + } }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -52,13 +53,26 @@ static struct { unsigned long find_total; } swap_cache_info; +unsigned long total_swapcache_pages(void) +{ + int i; + unsigned long ret = 0; + + for (i = 0; i < MAX_SWAPFILES; i++) + ret += swapper_spaces[i].nrpages; + return ret; +} + +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); + void show_swap_cache_info(void) { - printk("%lu pages in swap cache\n", total_swapcache_pages); + printk("%lu pages in swap cache\n", total_swapcache_pages()); printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); - printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); + printk("Free swap = %ldkB\n", + get_nr_swap_pages() << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } @@ -66,26 +80,29 @@ void show_swap_cache_info(void) * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -static int __add_to_swap_cache(struct page *page, swp_entry_t entry) +int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; + struct address_space *address_space; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(PageSwapCache(page)); - VM_BUG_ON(!PageSwapBacked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageSwapCache(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); page_cache_get(page); SetPageSwapCache(page); set_page_private(page, entry.val); - spin_lock_irq(&swapper_space.tree_lock); - error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); + error = radix_tree_insert(&address_space->page_tree, + entry.val, page); if (likely(!error)) { - total_swapcache_pages++; + address_space->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); if (unlikely(error)) { /* @@ -107,7 +124,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_preload(gfp_mask); + error = radix_tree_maybe_preload(gfp_mask); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -121,14 +138,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(!PageSwapCache(page)); - VM_BUG_ON(PageWriteback(page)); + swp_entry_t entry; + struct address_space *address_space; - radix_tree_delete(&swapper_space.page_tree, page_private(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + VM_BUG_ON_PAGE(PageWriteback(page), page); + + entry.val = page_private(page); + address_space = swap_address_space(entry); + radix_tree_delete(&address_space->page_tree, page_private(page)); set_page_private(page, 0); ClearPageSwapCache(page); - total_swapcache_pages--; + address_space->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); } @@ -140,20 +162,20 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space 
for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page *page) +int add_to_swap(struct page *page, struct list_head *list) { swp_entry_t entry; int err; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(!PageUptodate(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageUptodate(page), page); entry = get_swap_page(); if (!entry.val) return 0; if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page(page))) { + if (unlikely(split_huge_page_to_list(page, list))) { swapcache_free(entry, NULL); return 0; } @@ -194,12 +216,14 @@ int add_to_swap(struct page *page) void delete_from_swap_cache(struct page *page) { swp_entry_t entry; + struct address_space *address_space; entry.val = page_private(page); - spin_lock_irq(&swapper_space.tree_lock); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); __delete_from_swap_cache(page); - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); swapcache_free(entry, page); page_cache_release(page); @@ -262,10 +286,13 @@ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); - if (page) + if (page) { INC_CACHE_INFO(find_success); + if (TestClearPageReadahead(page)) + atomic_inc(&swapin_readahead_hits); + } INC_CACHE_INFO(find_total); return page; @@ -289,7 +316,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(&swapper_space, entry.val); + found_page = find_get_page(swap_address_space(entry), + entry.val); if (found_page) break; @@ -305,7 +333,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * call radix_tree_preload() while we can wait. */ - err = radix_tree_preload(gfp_mask & GFP_KERNEL); + err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); if (err) break; @@ -313,8 +341,24 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) { /* seems racy */ + if (err == -EEXIST) { radix_tree_preload_end(); + /* + * We might race against get_swap_page() and stumble + * across a SWAP_HAS_CACHE swap_map entry whose page + * has not been brought into the swapcache yet, while + * the other end is scheduled away waiting on discard + * I/O completion at scan_swap_map(). + * + * In order to avoid turning this transitory state + * into a permanent loop around this -EEXIST case + * if !CONFIG_PREEMPT and the I/O completion happens + * to be waiting on the CPU waitqueue where we are now + * busy looping, we just conditionally invoke the + * scheduler here, if there are some more important + * tasks to run. + */ + cond_resched(); continue; } if (err) { /* swp entry is obsolete ? 
*/ @@ -350,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int pages, max_pages, last_ra; + static atomic_t last_readahead_pages; + + max_pages = 1 << ACCESS_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + /* + * This heuristic has been found to work well on both sequential and + * random loads, swapping to hard disk or to SSD: please don't ask + * what the "+ 2" means, it just happens to work well, that's all. + */ + pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + if (pages == 2) { + /* + * We can have no readahead hits to judge by: but must not get + * stuck here forever, so check for an adjacent offset instead + * (and don't even bother to check whether swap type is same). + */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + prev_offset = offset; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = atomic_read(&last_readahead_pages) / 2; + if (pages < last_ra) + pages = last_ra; + atomic_set(&last_readahead_pages, pages); + + return pages; +} + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory @@ -373,9 +461,15 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long offset = swp_offset(entry); + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask; + struct blk_plug plug; + + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; @@ -383,14 +477,20 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!start_offset) /* First page is swap header. */ start_offset++; + blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = read_swap_cache_async(swp_entry(swp_type(entry), offset), gfp_mask, vma, addr); if (!page) continue; + if (offset != entry_offset) + SetPageReadahead(page); page_cache_release(page); } + blk_finish_plug(&plug); + lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } diff --git a/mm/swapfile.c b/mm/swapfile.c index fafc26d1b1dc..4a7f7e6992b6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -31,6 +31,9 @@ #include <linux/memcontrol.h> #include <linux/poll.h> #include <linux/oom.h> +#include <linux/frontswap.h> +#include <linux/swapfile.h> +#include <linux/export.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -42,20 +45,22 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, static void free_swap_count_continuations(struct swap_info_struct *); static sector_t map_swap_entry(swp_entry_t, struct block_device**); -static DEFINE_SPINLOCK(swap_lock); +DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; -long nr_swap_pages; +atomic_long_t nr_swap_pages; +/* protected with swap_lock. 
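The nr_swap_pages change above is a standard conversion: a counter once guarded by swap_lock becomes an atomic_long_t so hot paths can update it locklessly, with readers accepting a momentarily stale value. A minimal before/after sketch:

/* Sketch: lock-guarded counter vs. lockless atomic counter. */
static DEFINE_SPINLOCK(sk_lock);
static long sk_counter;				/* before: updates take sk_lock */
static atomic_long_t sk_counter_atomic = ATOMIC_LONG_INIT(0); /* after */

static void sk_take_one_locked(void)
{
	spin_lock(&sk_lock);
	sk_counter--;
	spin_unlock(&sk_lock);
}

static void sk_take_one_atomic(void)
{
	/* No lock: atomic_long_read() callers may see a slightly
	 * stale value, which get_swap_page() tolerates. */
	atomic_long_dec(&sk_counter_atomic);
}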
reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; +static atomic_t highest_priority_index = ATOMIC_INIT(-1); static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -static struct swap_list_t swap_list = {-1, -1}; +struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -76,7 +81,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) struct page *page; int ret = 0; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (!page) return 0; /* @@ -170,14 +175,296 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -static int wait_for_discard(void *word) +#define SWAPFILE_CLUSTER 256 +#define LATENCY_LIMIT 256 + +static inline void cluster_set_flag(struct swap_cluster_info *info, + unsigned int flag) { - schedule(); - return 0; + info->flags = flag; } -#define SWAPFILE_CLUSTER 256 -#define LATENCY_LIMIT 256 +static inline unsigned int cluster_count(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_count(struct swap_cluster_info *info, + unsigned int c) +{ + info->data = c; +} + +static inline void cluster_set_count_flag(struct swap_cluster_info *info, + unsigned int c, unsigned int f) +{ + info->flags = f; + info->data = c; +} + +static inline unsigned int cluster_next(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_next(struct swap_cluster_info *info, + unsigned int n) +{ + info->data = n; +} + +static inline void cluster_set_next_flag(struct swap_cluster_info *info, + unsigned int n, unsigned int f) +{ + info->flags = f; + info->data = n; +} + +static inline bool cluster_is_free(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_FREE; +} + +static inline bool cluster_is_null(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_NEXT_NULL; +} + +static inline void cluster_set_null(struct swap_cluster_info *info) +{ + info->flags = CLUSTER_FLAG_NEXT_NULL; + info->data = 0; +} + +/* Add a cluster to the discard list and schedule the discard work */ +static void swap_cluster_schedule_discard(struct swap_info_struct *si, + unsigned int idx) +{ + /* + * If scan_swap_map() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't + * taken by scan_swap_map(), mark the swap entries bad (occupied). They + * will be cleared after the discard completes. + */ + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + if (cluster_is_null(&si->discard_cluster_head)) { + cluster_set_next_flag(&si->discard_cluster_head, + idx, 0); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } else { + unsigned int tail = cluster_next(&si->discard_cluster_tail); + cluster_set_next(&si->cluster_info[tail], idx); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } + + schedule_work(&si->discard_work); +} + +/* + * Do the discards that have been scheduled. After a cluster discard is + * finished, the cluster will be added to the free cluster list. The caller + * should hold si->lock. +*/
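The free and discard cluster lists above are singly linked lists threaded through the cluster_info array itself: each element holds a small flags field plus a data field that is either a usage count or the index of the next cluster. A sketch of the encoding and a walk, with illustrative field widths:

/* Sketch: a list of clusters threaded through an array, as the
 * free_cluster_head/tail and discard_cluster_head/tail lists are. */
struct cluster_sk {
	unsigned int flags : 8;		/* FREE, NEXT_NULL, ... */
	unsigned int data  : 24;	/* next index, or usage count */
};

#define SK_NEXT_NULL 0x1

static void cluster_walk_sk(struct cluster_sk *info, struct cluster_sk head)
{
	while (!(head.flags & SK_NEXT_NULL)) {
		unsigned int idx = head.data;	/* current cluster index */
		/* ... operate on cluster idx ... */
		head = info[idx];	/* a node's data is the next index */
	}
}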
+static void swap_do_scheduled_discard(struct swap_info_struct *si) +{ + struct swap_cluster_info *info; + unsigned int idx; + + info = si->cluster_info; + + while (!cluster_is_null(&si->discard_cluster_head)) { + idx = cluster_next(&si->discard_cluster_head); + + cluster_set_next_flag(&si->discard_cluster_head, + cluster_next(&info[idx]), 0); + if (cluster_next(&si->discard_cluster_tail) == idx) { + cluster_set_null(&si->discard_cluster_head); + cluster_set_null(&si->discard_cluster_tail); + } + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); + cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&si->free_cluster_head)) { + cluster_set_next_flag(&si->free_cluster_head, + idx, 0); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&si->free_cluster_tail); + cluster_set_next(&info[tail], idx); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); + } +} + +static void swap_discard_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, discard_work); + + spin_lock(&si->lock); + swap_do_scheduled_discard(si); + spin_unlock(&si->lock); +} + +/* + * The cluster corresponding to page_nr will be used. The cluster will be + * removed from the free cluster list and its usage counter will be increased. + */ +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + if (cluster_is_free(&cluster_info[idx])) { + VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); + cluster_set_next_flag(&p->free_cluster_head, + cluster_next(&cluster_info[idx]), 0); + if (cluster_next(&p->free_cluster_tail) == idx) { + cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->free_cluster_head); + } + cluster_set_count_flag(&cluster_info[idx], 0, 0); + } + + VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) + 1); +} + +/* + * The cluster corresponding to page_nr drops one usage. If the usage + * counter becomes 0, which means no page in the cluster is in use, we can + * optionally discard the cluster and add it to the free cluster list. + */ +static void dec_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + + VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) - 1); + + if (cluster_count(&cluster_info[idx]) == 0) { + /* + * If the swap is discardable, prepare to discard the cluster + * instead of freeing it immediately. The cluster will be freed + * after the discard completes.
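Both helpers reduce to mapping a swap offset to its cluster and bumping a usage count; a sketch of just that core, with hypothetical names and no locking:

/* Sketch: per-cluster usage counting keyed by swap offset. */
#define CLUSTER_PAGES_SK 256		/* mirrors SWAPFILE_CLUSTER */

static void cluster_get_sk(unsigned int *counts, unsigned long offset)
{
	counts[offset / CLUSTER_PAGES_SK]++;	/* page joined its cluster */
}

static int cluster_put_sk(unsigned int *counts, unsigned long offset)
{
	/* Returns 1 when the whole cluster became free and may be
	 * discarded or put back on the free list. */
	return --counts[offset / CLUSTER_PAGES_SK] == 0;
}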
+ */ + if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { + swap_cluster_schedule_discard(p, idx); + return; + } + + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&p->free_cluster_head)) { + cluster_set_next_flag(&p->free_cluster_head, idx, 0); + cluster_set_next_flag(&p->free_cluster_tail, idx, 0); + } else { + unsigned int tail = cluster_next(&p->free_cluster_tail); + cluster_set_next(&cluster_info[tail], idx); + cluster_set_next_flag(&p->free_cluster_tail, idx, 0); + } + } +} + +/* + * It's possible scan_swap_map() uses a free cluster in the middle of free + * cluster list. Avoiding such abuse to avoid list corruption. + */ +static bool +scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, + unsigned long offset) +{ + struct percpu_cluster *percpu_cluster; + bool conflict; + + offset /= SWAPFILE_CLUSTER; + conflict = !cluster_is_null(&si->free_cluster_head) && + offset != cluster_next(&si->free_cluster_head) && + cluster_is_free(&si->cluster_info[offset]); + + if (!conflict) + return false; + + percpu_cluster = this_cpu_ptr(si->percpu_cluster); + cluster_set_null(&percpu_cluster->index); + return true; +} + +/* + * Try to get a swap entry from current cpu's swap entry pool (a cluster). This + * might involve allocating a new cluster for current CPU too. + */ +static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + unsigned long *offset, unsigned long *scan_base) +{ + struct percpu_cluster *cluster; + bool found_free; + unsigned long tmp; + +new_cluster: + cluster = this_cpu_ptr(si->percpu_cluster); + if (cluster_is_null(&cluster->index)) { + if (!cluster_is_null(&si->free_cluster_head)) { + cluster->index = si->free_cluster_head; + cluster->next = cluster_next(&cluster->index) * + SWAPFILE_CLUSTER; + } else if (!cluster_is_null(&si->discard_cluster_head)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them + */ + swap_do_scheduled_discard(si); + *scan_base = *offset = si->cluster_next; + goto new_cluster; + } else + return; + } + + found_free = false; + + /* + * Other CPUs can use our cluster if they can't find a free cluster, + * check if there is still free entry in the cluster + */ + tmp = cluster->next; + while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * + SWAPFILE_CLUSTER) { + if (!si->swap_map[tmp]) { + found_free = true; + break; + } + tmp++; + } + if (!found_free) { + cluster_set_null(&cluster->index); + goto new_cluster; + } + cluster->next = tmp + 1; + *offset = tmp; + *scan_base = tmp; +} static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) @@ -186,7 +473,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; - int found_free_cluster = 0; /* * We try to cluster swap pages by allocating them sequentially @@ -202,25 +488,19 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->flags += SWP_SCANNING; scan_base = offset = si->cluster_next; + /* SSD algorithm */ + if (si->cluster_info) { + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + goto checks; + } + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_DISCARDABLE) { - /* - * Start range check on racing allocations, in case - * they overlap the cluster we eventually decide 
on - * (we scan without swap_lock to allow preemption). - * It's hardly conceivable that cluster_nr could be - * wrapped during our scan, but don't depend on it. - */ - if (si->lowest_alloc) - goto checks; - si->lowest_alloc = si->max; - si->highest_alloc = 0; - } - spin_unlock(&swap_lock); + + spin_unlock(&si->lock); /* * If seek is expensive, start searching for new cluster from @@ -239,11 +519,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { - spin_lock(&swap_lock); + spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -260,11 +539,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, if (si->swap_map[offset]) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { - spin_lock(&swap_lock); + spin_lock(&si->lock); offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -274,12 +552,15 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } offset = scan_base; - spin_lock(&swap_lock); + spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; - si->lowest_alloc = 0; } checks: + if (si->cluster_info) { + while (scan_swap_map_ssd_cluster_conflict(si, offset)) + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -290,9 +571,9 @@ checks: /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; - spin_unlock(&swap_lock); + spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); - spin_lock(&swap_lock); + spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed) goto checks; @@ -312,73 +593,21 @@ checks: si->highest_bit = 0; } si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; - if (si->lowest_alloc) { - /* - * Only set when SWP_DISCARDABLE, and there's a scan - * for a free cluster in progress or just completed. - */ - if (found_free_cluster) { - /* - * To optimize wear-levelling, discard the - * old data of the cluster, taking care not to - * discard any of its pages that have already - * been allocated by racing tasks (offset has - * already stepped over any at the beginning). - */ - if (offset < si->highest_alloc && - si->lowest_alloc <= last_in_cluster) - last_in_cluster = si->lowest_alloc - 1; - si->flags |= SWP_DISCARDING; - spin_unlock(&swap_lock); - - if (offset < last_in_cluster) - discard_swap_cluster(si, offset, - last_in_cluster - offset + 1); - - spin_lock(&swap_lock); - si->lowest_alloc = 0; - si->flags &= ~SWP_DISCARDING; - - smp_mb(); /* wake_up_bit advises this */ - wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); - - } else if (si->flags & SWP_DISCARDING) { - /* - * Delay using pages allocated by racing tasks - * until the whole discard has been issued. We - * could defer that delay until swap_writepage, - * but it's easier to keep this self-contained. 
- */ - spin_unlock(&swap_lock); - wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), - wait_for_discard, TASK_UNINTERRUPTIBLE); - spin_lock(&swap_lock); - } else { - /* - * Note pages allocated by racing tasks while - * scan for a free cluster is in progress, so - * that its final discard can exclude them. - */ - if (offset < si->lowest_alloc) - si->lowest_alloc = offset; - if (offset > si->highest_alloc) - si->highest_alloc = offset; - } - } return offset; scan: - spin_unlock(&swap_lock); + spin_unlock(&si->lock); while (++offset <= si->highest_bit) { if (!si->swap_map[offset]) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (unlikely(--latency_ration < 0)) { @@ -387,21 +616,22 @@ scan: } } offset = si->lowest_bit; - while (++offset < scan_base) { + while (offset < scan_base) { if (!si->swap_map[offset]) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - spin_lock(&swap_lock); + spin_lock(&si->lock); goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } + offset++; } - spin_lock(&swap_lock); + spin_lock(&si->lock); no_page: si->flags -= SWP_SCANNING; @@ -414,13 +644,34 @@ swp_entry_t get_swap_page(void) pgoff_t offset; int type, next; int wrapped = 0; + int hp_index; spin_lock(&swap_lock); - if (nr_swap_pages <= 0) + if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; - nr_swap_pages--; + atomic_long_dec(&nr_swap_pages); for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { + hp_index = atomic_xchg(&highest_priority_index, -1); + /* + * highest_priority_index records the highest priority swap + * type that just freed swap entries. If its priority is + * higher than that of the swap_list.next swap type, we use it. It + * isn't protected by swap_lock, so it can be an invalid value + * if the corresponding swap type has been swapped off. We double check + * the flags here. It's even possible the swap type was swapped off + * and on again with its priority changed. In such a rare + * case, a low priority swap type might be used, but eventually + * high priority swap will be used after several rounds of + * swapping.
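The highest_priority_index handshake is a lock-free advisory hint: the freeing side publishes a swap type, the allocating side consumes it with atomic_xchg() and then re-validates under the proper locks, tolerating stale values exactly as the comment above describes. Reduced to its skeleton (the real publisher additionally compares priorities with a cmpxchg loop, shown later in this patch):

/* Sketch: publish/consume of a racy advisory hint. */
static atomic_t hint_sk = ATOMIC_INIT(-1);

static void hint_publish_sk(int type)
{
	atomic_set(&hint_sk, type);	/* freeing side */
}

static int hint_consume_sk(void)
{
	/* Allocating side: take the hint and clear it atomically;
	 * the caller must re-check validity under its own locks. */
	return atomic_xchg(&hint_sk, -1);
}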
+ */ + if (hp_index != -1 && hp_index != type && + swap_info[type]->prio < swap_info[hp_index]->prio && + (swap_info[hp_index]->flags & SWP_WRITEOK)) { + type = hp_index; + swap_list.next = type; + } + si = swap_info[type]; next = si->next; if (next < 0 || @@ -429,46 +680,53 @@ swp_entry_t get_swap_page(void) wrapped++; } - if (!si->highest_bit) + spin_lock(&si->lock); + if (!si->highest_bit) { + spin_unlock(&si->lock); continue; - if (!(si->flags & SWP_WRITEOK)) + } + if (!(si->flags & SWP_WRITEOK)) { + spin_unlock(&si->lock); continue; + } swap_list.next = next; + + spin_unlock(&swap_lock); /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); - if (offset) { - spin_unlock(&swap_lock); + spin_unlock(&si->lock); + if (offset) return swp_entry(type, offset); - } + spin_lock(&swap_lock); next = swap_list.next; } - nr_swap_pages++; + atomic_long_inc(&nr_swap_pages); noswap: spin_unlock(&swap_lock); return (swp_entry_t) {0}; } -/* The only caller of this function is now susupend routine */ +/* The only caller of this function is now suspend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; pgoff_t offset; - spin_lock(&swap_lock); si = swap_info[type]; + spin_lock(&si->lock); if (si && (si->flags & SWP_WRITEOK)) { - nr_swap_pages--; + atomic_long_dec(&nr_swap_pages); /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, 1); if (offset) { - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return swp_entry(type, offset); } - nr_swap_pages++; + atomic_long_inc(&nr_swap_pages); } - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return (swp_entry_t) {0}; } @@ -490,24 +748,45 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) goto bad_offset; if (!p->swap_map[offset]) goto bad_free; - spin_lock(&swap_lock); + spin_lock(&p->lock); return p; bad_free: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); goto out; bad_offset: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); goto out; bad_device: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); goto out; bad_nofile: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); out: return NULL; } +/* + * This swap type frees swap entry, check if it is the highest priority swap + * type which just frees swap entry. get_swap_page() uses + * highest_priority_index to search highest priority swap type. The + * swap_info_struct.lock can't protect us if there are multiple swap types + * active, so we use atomic_cmpxchg. 
+ */ +static void set_highest_priority_index(int type) +{ + int old_hp_index, new_hp_index; + + do { + old_hp_index = atomic_read(&highest_priority_index); + if (old_hp_index != -1 && + swap_info[old_hp_index]->prio >= swap_info[type]->prio) + break; + new_hp_index = type; + } while (atomic_cmpxchg(&highest_priority_index, + old_hp_index, new_hp_index) != old_hp_index); +} + static unsigned char swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -546,26 +825,28 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, /* free if no reference */ if (!usage) { - struct gendisk *disk = p->bdev->bd_disk; + dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; - if (swap_list.next >= 0 && - p->prio > swap_info[swap_list.next]->prio) - swap_list.next = p->type; - nr_swap_pages++; + set_highest_priority_index(p->type); + atomic_long_inc(&nr_swap_pages); p->inuse_pages--; - if ((p->flags & SWP_BLKDEV) && - disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, offset); + frontswap_invalidate_page(p->type, offset); + if (p->flags & SWP_BLKDEV) { + struct gendisk *disk = p->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) + disk->fops->swap_slot_free_notify(p->bdev, + offset); + } } return usage; } /* - * Caller has made sure that the swapdevice corresponding to entry + * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. */ void swap_free(swp_entry_t entry) @@ -575,7 +856,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { swap_entry_free(p, entry, 1); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } } @@ -592,7 +873,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) count = swap_entry_free(p, entry, SWAP_HAS_CACHE); if (page) mem_cgroup_uncharge_swapcache(page, entry, count != 0); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } } @@ -601,7 +882,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -static inline int page_swapcount(struct page *page) +int page_swapcount(struct page *page) { int count = 0; struct swap_info_struct *p; @@ -611,7 +892,7 @@ static inline int page_swapcount(struct page *page) p = swap_info_get(entry); if (p) { count = swap_count(p->swap_map[swp_offset(entry)]); - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } return count; } @@ -626,7 +907,7 @@ int reuse_swap_page(struct page *page) { int count; - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return 0; count = page_mapcount(page); @@ -646,7 +927,7 @@ int reuse_swap_page(struct page *page) */ int try_to_free_swap(struct page *page) { - VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); if (!PageSwapCache(page)) return 0; @@ -667,7 +948,7 @@ int try_to_free_swap(struct page *page) * original page might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * - * Hibration suspends storage while it is writing the image + * Hibernation suspends storage while it is writing the image * to disk so check that here. 
*/ if (pm_suspended_storage()) return 0; delete_from_swap_cache(page); SetPageDirty(page); return 1; } @@ -693,13 +974,14 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), + entry.val); if (page && !trylock_page(page)) { page_cache_release(page); page = NULL; } } - spin_unlock(&swap_lock); + spin_unlock(&p->lock); } if (page) { /* @@ -717,37 +999,6 @@ int free_swap_and_cache(swp_entry_t entry) return p != NULL; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_count_swap_user - count the user of a swap entry - * @ent: the swap entry to be checked - * @pagep: the pointer for the swap cache page of the entry to be stored - * - * Returns the number of the user of the swap entry. The number is valid only - * for swaps of anonymous pages. - * If the entry is found on swap cache, the page is stored to pagep with - * refcount of it being incremented. - */ -int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) -{ - struct page *page; - struct swap_info_struct *p; - int count = 0; - - page = find_get_page(&swapper_space, ent.val); - if (page) - count += page_mapcount(page); - p = swap_info_get(ent); - if (p) { - count += swap_count(p->swap_map[swp_offset(ent)]); - spin_unlock(&swap_lock); - } - - *pagep = page; - return count; -} -#endif - #ifdef CONFIG_HIBERNATION /* * Find the swap type that corresponds to the given device (if any). @@ -828,17 +1079,34 @@ unsigned int count_swap_pages(int type, int free) if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; + spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= sis->inuse_pages; } + spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; } #endif /* CONFIG_HIBERNATION */ +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * When a pte carries the soft dirty bit, the pte + * generated from the swap entry does not have it; + * it is still the same pte from a logical point of view.
+ */ + pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); + return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); +#else + return pte_same(pte, swp_pte); +#endif +} + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to @@ -847,11 +1115,17 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { + struct page *swapcache; struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; + swapcache = page; + page = ksm_might_need_to_copy(page, vma, addr); + if (unlikely(!page)) + return -ENOMEM; + if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &memcg)) { ret = -ENOMEM; @@ -859,9 +1133,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { - if (ret > 0) - mem_cgroup_cancel_charge_swapin(memcg); + if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { + mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; } @@ -871,7 +1144,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); - page_add_anon_rmap(page, vma, addr); + if (page == swapcache) + page_add_anon_rmap(page, vma, addr); + else /* ksm created a completely new copy */ + page_add_new_anon_rmap(page, vma, addr); mem_cgroup_commit_charge_swapin(page, memcg); swap_free(entry); /* @@ -882,6 +1158,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, out: pte_unmap_unlock(pte, ptl); out_nolock: + if (page != swapcache) { + unlock_page(page); + put_page(page); + } return ret; } @@ -900,7 +1180,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * some architectures (e.g. x86_32 with PAE) we might catch a glimpse * of unmatched parts which look like swp_pte, so unuse_pte must * recheck under pte lock. Scanning without pte lock lets it be - * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. + * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. */ pte = pte_offset_map(pmd, addr); do { @@ -908,7 +1188,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ - if (unlikely(pte_same(*pte, swp_pte))) { + if (unlikely(maybe_same_pte(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) @@ -1016,11 +1296,12 @@ static int unuse_mm(struct mm_struct *mm, } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap_map (or frontswap_map if frontswap parameter is true) + * from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. 
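find_next_to_unuse(), whose updated definition follows, is a circular scan: start just past the previous hit, wrap around to slot 1 at the end, and report 0 once a full lap finds nothing in use. The same shape in a self-contained sketch, with a plain byte array standing in for swap_map and none of the locking or frontswap filtering:

#include <stdio.h>

/*
 * Scan map[1..max-1] circularly for the next in-use slot, starting
 * just after 'prev'; wrap to slot 1 at the end (slot 0 is reserved,
 * like the swap header page) and return 0 once nothing is in use.
 */
static unsigned int next_in_use(const unsigned char *map,
				unsigned int max, unsigned int prev)
{
	unsigned int i = prev + 1;
	unsigned int scanned;

	for (scanned = 0; scanned < max - 1; scanned++, i++) {
		if (i >= max)
			i = 1;			/* recycle to start */
		if (map[i])
			return i;
	}
	return 0;				/* empty */
}

int main(void)
{
	unsigned char map[8] = { 0, 0, 1, 0, 1, 0, 0, 0 };
	unsigned int i = 0;

	while ((i = next_in_use(map, 8, i)) != 0) {
		printf("slot %u in use\n", i);	/* prints 2, then 4 */
		map[i] = 0;
	}
	return 0;
}
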
 */
 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
-					unsigned int prev)
+					unsigned int prev, bool frontswap)
 {
 	unsigned int max = si->max;
 	unsigned int i = prev;
@@ -1046,7 +1327,13 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 			prev = 0;
 			i = 1;
 		}
-		count = si->swap_map[i];
+		if (frontswap) {
+			if (frontswap_test(si, i))
+				break;
+			else
+				continue;
+		}
+		count = ACCESS_ONCE(si->swap_map[i]);
 		if (count && swap_count(count) != SWAP_MAP_BAD)
 			break;
 	}
@@ -1057,12 +1344,20 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
  * We completely avoid races by reading each swap page in advance,
  * and then search for the process using it. All the necessary
  * page table adjustments can then be made atomically.
+ *
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages; ignored if frontswap is false.
  */
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, bool frontswap,
+		 unsigned long pages_to_unuse)
 {
 	struct swap_info_struct *si = swap_info[type];
 	struct mm_struct *start_mm;
-	unsigned char *swap_map;
+	volatile unsigned char *swap_map; /* swap_map is accessed without
+					   * locking. Mark it as volatile
+					   * to prevent the compiler from
+					   * doing anything odd.
+					   */
 	unsigned char swcount;
 	struct page *page;
 	swp_entry_t entry;
@@ -1091,7 +1386,7 @@ static int try_to_unuse(unsigned int type)
 	 * one pass through swap_map is enough, but not necessarily:
 	 * there are races when an instance of an entry might be missed.
 	 */
-	while ((i = find_next_to_unuse(si, i)) != 0) {
+	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
 		if (signal_pending(current)) {
 			retval = -EINTR;
 			break;
@@ -1113,7 +1408,15 @@ static int try_to_unuse(unsigned int type)
 		 * reused since sys_swapoff() already disabled
 		 * allocation from here, or alloc_page() failed.
 		 */
-		if (!*swap_map)
+		swcount = *swap_map;
+		/*
+		 * We don't hold a lock here, so the swap entry could be
+		 * SWAP_MAP_BAD (when the cluster is discarding).
+		 * Instead of failing out, we can just skip the swap
+		 * entry because swapoff will wait for the discard to
+		 * finish anyway.
+		 */
+		if (!swcount || swcount == SWAP_MAP_BAD)
 			continue;
 		retval = -ENOMEM;
 		break;
@@ -1258,6 +1561,10 @@ static int try_to_unuse(unsigned int type)
 		 * interactive performance.
 		 */
 		cond_resched();
+		if (frontswap && pages_to_unuse > 0) {
+			if (!--pages_to_unuse)
+				break;
+		}
 	}
 	mmput(start_mm);
@@ -1341,6 +1648,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
 		list_del(&se->list);
 		kfree(se);
 	}
+
+	if (sis->flags & SWP_FILE) {
+		struct file *swap_file = sis->swap_file;
+		struct address_space *mapping = swap_file->f_mapping;
+
+		sis->flags &= ~SWP_FILE;
+		mapping->a_ops->swap_deactivate(swap_file);
+	}
 }
 /*
@@ -1349,7 +1664,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
 *
 * This function rather assumes that it is called in ascending page order.
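add_swap_extent(), made non-static by the next hunk, is documented as being fed runs in ascending page order; that ordering is what lets an implementation grow the tail extent in place whenever a new run continues it on both the page and the disk-block side. A simplified user-space sketch of that merge-on-append idea (own list type, no kernel API; the merge rule is inferred from the ascending-order contract, not shown in this hunk):

#include <stdio.h>
#include <stdlib.h>

struct extent {
	unsigned long start_page;
	unsigned long nr_pages;
	unsigned long long start_block;
	struct extent *next;
};

struct extent_list {
	struct extent *head, *tail;
};

/*
 * Append a run of pages mapped to disk blocks.  A run that continues
 * the tail extent on both the page side and the block side just grows
 * the tail instead of allocating a new node.
 */
static int append_extent(struct extent_list *l, unsigned long start_page,
			 unsigned long nr_pages, unsigned long long start_block)
{
	struct extent *se = l->tail;

	if (se && se->start_page + se->nr_pages == start_page &&
	    se->start_block + se->nr_pages == start_block) {
		se->nr_pages += nr_pages;
		return 0;
	}

	se = malloc(sizeof(*se));
	if (!se)
		return -1;
	se->start_page = start_page;
	se->nr_pages = nr_pages;
	se->start_block = start_block;
	se->next = NULL;
	if (l->tail)
		l->tail->next = se;
	else
		l->head = se;
	l->tail = se;
	return 0;
}

int main(void)
{
	struct extent_list l = { 0 };

	append_extent(&l, 0, 1, 100);	/* pages 0..0 -> blocks 100.. */
	append_extent(&l, 1, 1, 101);	/* contiguous: merged into tail */
	append_extent(&l, 2, 1, 500);	/* discontiguous: new extent */
	for (struct extent *e = l.head; e; e = e->next)
		printf("pages %lu+%lu at block %llu\n",
		       e->start_page, e->nr_pages, e->start_block);
	return 0;
}
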
*/ -static int +int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { @@ -1422,113 +1737,44 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { - struct inode *inode; - unsigned blocks_per_page; - unsigned long page_no; - unsigned blkbits; - sector_t probe_block; - sector_t last_block; - sector_t lowest_block = -1; - sector_t highest_block = 0; - int nr_extents = 0; + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; int ret; - inode = sis->swap_file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; - goto out; + return ret; } - blkbits = inode->i_blkbits; - blocks_per_page = PAGE_SIZE >> blkbits; - - /* - * Map all the blocks into the extent list. This code doesn't try - * to be very smart. - */ - probe_block = 0; - page_no = 0; - last_block = i_size_read(inode) >> blkbits; - while ((probe_block + blocks_per_page) <= last_block && - page_no < sis->max) { - unsigned block_in_page; - sector_t first_block; - - first_block = bmap(inode, probe_block); - if (first_block == 0) - goto bad_bmap; - - /* - * It must be PAGE_SIZE aligned on-disk - */ - if (first_block & (blocks_per_page - 1)) { - probe_block++; - goto reprobe; - } - - for (block_in_page = 1; block_in_page < blocks_per_page; - block_in_page++) { - sector_t block; - - block = bmap(inode, probe_block + block_in_page); - if (block == 0) - goto bad_bmap; - if (block != first_block + block_in_page) { - /* Discontiguity */ - probe_block++; - goto reprobe; - } - } - - first_block >>= (PAGE_SHIFT - blkbits); - if (page_no) { /* exclude the header page */ - if (first_block < lowest_block) - lowest_block = first_block; - if (first_block > highest_block) - highest_block = first_block; + if (mapping->a_ops->swap_activate) { + ret = mapping->a_ops->swap_activate(sis, swap_file, span); + if (!ret) { + sis->flags |= SWP_FILE; + ret = add_swap_extent(sis, 0, sis->max, 0); + *span = sis->pages; } + return ret; + } - /* - * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks - */ - ret = add_swap_extent(sis, page_no, 1, first_block); - if (ret < 0) - goto out; - nr_extents += ret; - page_no++; - probe_block += blocks_per_page; -reprobe: - continue; - } - ret = nr_extents; - *span = 1 + highest_block - lowest_block; - if (page_no == 0) - page_no = 1; /* force Empty message */ - sis->max = page_no; - sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; -out: - return ret; -bad_bmap: - printk(KERN_ERR "swapon: swapfile has holes\n"); - ret = -EINVAL; - goto out; + return generic_swapfile_activate(sis, swap_file, span); } -static void enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map) +static void _enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i, prev; - spin_lock(&swap_lock); if (prio >= 0) p->prio = prio; else p->prio = --least_priority; p->swap_map = swap_map; + p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; - nr_swap_pages += p->pages; + atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; /* insert swap space into swap_list: */ @@ -1543,6 +1789,27 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, swap_list.head = swap_list.next = p->type; else 
swap_info[prev]->next = p->type; +} + +static void enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info, + unsigned long *frontswap_map) +{ + frontswap_init(p->type, frontswap_map); + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p, prio, swap_map, cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); +} + +static void reinsert_swap_info(struct swap_info_struct *p) +{ + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); + spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1550,13 +1817,15 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; + struct swap_cluster_info *cluster_info; + unsigned long *frontswap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; - char *pathname; - int oom_score_adj; + struct filename *pathname; int i, type, prev; int err; + unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1564,12 +1833,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) BUG_ON(!current->mm); pathname = getname(specialfile); - err = PTR_ERR(pathname); if (IS_ERR(pathname)) - goto out; + return PTR_ERR(pathname); - victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); - putname(pathname); + victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); err = PTR_ERR(victim); if (IS_ERR(victim)) goto out; @@ -1605,64 +1872,75 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) /* just pick something that's safe... */ swap_list.next = swap_list.head; } + spin_lock(&p->lock); if (p->prio < 0) { for (i = p->next; i >= 0; i = swap_info[i]->next) swap_info[i]->prio = p->prio--; least_priority++; } - nr_swap_pages -= p->pages; + atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; + spin_unlock(&p->lock); spin_unlock(&swap_lock); - oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); - err = try_to_unuse(type); - compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); + set_current_oom_origin(); + err = try_to_unuse(type, false, 0); /* force all pages to be unused */ + clear_current_oom_origin(); if (err) { - /* - * reading p->prio and p->swap_map outside the lock is - * safe here because only sys_swapon and sys_swapoff - * change them, and there can be no other sys_swapon or - * sys_swapoff for this swap_info_struct at this point. 
- */ /* re-insert swap space back into swap_list */ - enable_swap_info(p, p->prio, p->swap_map); + reinsert_swap_info(p); goto out_dput; } + flush_work(&p->discard_work); + destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); mutex_lock(&swapon_mutex); spin_lock(&swap_lock); + spin_lock(&p->lock); drain_mmlist(); /* wait for anyone still in scan_swap_map */ p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { + spin_unlock(&p->lock); spin_unlock(&swap_lock); schedule_timeout_uninterruptible(1); spin_lock(&swap_lock); + spin_lock(&p->lock); } swap_file = p->swap_file; + old_block_size = p->old_block_size; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; - p->flags = 0; + cluster_info = p->cluster_info; + p->cluster_info = NULL; + frontswap_map = frontswap_map_get(p); + spin_unlock(&p->lock); spin_unlock(&swap_lock); + frontswap_invalidate_area(type); + frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; vfree(swap_map); - /* Destroy swap account informatin */ + vfree(cluster_info); + vfree(frontswap_map); + /* Destroy swap account information */ swap_cgroup_swapoff(type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); - set_blocksize(bdev, p->old_block_size); + set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); @@ -1670,6 +1948,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&inode->i_mutex); } filp_close(swap_file, NULL); + + /* + * Clear the SWP_USED flag after all resources are freed so that swapon + * can reuse this swap_info in alloc_swap_info() safely. It is ok to + * not hold p->lock after we cleared its SWP_WRITEOK. + */ + spin_lock(&swap_lock); + p->flags = 0; + spin_unlock(&swap_lock); + err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); @@ -1677,6 +1965,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) out_dput: filp_close(victim, NULL); out: + putname(pathname); return err; } @@ -1761,7 +2050,7 @@ static int swap_show(struct seq_file *swap, void *v) len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", - S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? + S_ISBLK(file_inode(file)->i_mode) ? 
"partition" : "file\t", si->pages << (PAGE_SHIFT - 10), si->inuse_pages << (PAGE_SHIFT - 10), @@ -1856,6 +2145,7 @@ static struct swap_info_struct *alloc_swap_info(void) p->flags = SWP_USED; p->next = -1; spin_unlock(&swap_lock); + spin_lock_init(&p->lock); return p; } @@ -1896,9 +2186,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p, int i; unsigned long maxpages; unsigned long swapfilepages; + unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { - printk(KERN_ERR "Unable to find swap-space signature\n"); + pr_err("Unable to find swap-space signature\n"); return 0; } @@ -1912,9 +2203,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p, } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); + pr_warn("Unable to handle swap header version %d\n", + swap_header->info.version); return 0; } @@ -1924,26 +2214,28 @@ static unsigned long read_swap_header(struct swap_info_struct *p, /* * Find out how many pages are allowed for a single swap - * device. There are three limiting factors: 1) the number + * device. There are two limiting factors: 1) the number * of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte as defined by the - * the different architectures, and 3) the number of free bits - * in an exceptional radix_tree entry. In order to find the + * different architectures. In order to find the * largest possible bit mask, a swap entry with swap type 0 * and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap * offset is extracted. This will mask all the bits from * the initial ~0UL mask that can't be encoded in either * the swp_entry_t or the architecture definition of a - * swap pte. Then the same is done for a radix_tree entry. + * swap pte. 
 */
 	maxpages = swp_offset(pte_to_swp_entry(
-			swp_entry_to_pte(swp_entry(0, ~0UL))));
-	maxpages = swp_offset(radix_to_swp_entry(
-			swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
-
-	if (maxpages > swap_header->info.last_page) {
-		maxpages = swap_header->info.last_page + 1;
+			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+	last_page = swap_header->info.last_page;
+	if (last_page > maxpages) {
+		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
+			maxpages << (PAGE_SHIFT - 10),
+			last_page << (PAGE_SHIFT - 10));
+	}
+	if (maxpages > last_page) {
+		maxpages = last_page + 1;
 		/* p->max is an unsigned int: don't overflow it */
 		if ((unsigned int)maxpages == 0)
 			maxpages = UINT_MAX;
@@ -1954,8 +2246,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 		return 0;
 	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
 	if (swapfilepages && maxpages > swapfilepages) {
-		printk(KERN_WARNING
-		       "Swap area shorter than signature indicates\n");
+		pr_warn("Swap area shorter than signature indicates\n");
 		return 0;
 	}
 	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
@@ -1969,15 +2260,23 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 static int setup_swap_map_and_extents(struct swap_info_struct *p,
 					union swap_header *swap_header,
 					unsigned char *swap_map,
+					struct swap_cluster_info *cluster_info,
 					unsigned long maxpages,
 					sector_t *span)
 {
 	int i;
 	unsigned int nr_good_pages;
 	int nr_extents;
+	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+	unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
 	nr_good_pages = maxpages - 1;	/* omit header page */
+	cluster_set_null(&p->free_cluster_head);
+	cluster_set_null(&p->free_cluster_tail);
+	cluster_set_null(&p->discard_cluster_head);
+	cluster_set_null(&p->discard_cluster_tail);
+
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
 		unsigned int page_nr = swap_header->info.badpages[i];
 		if (page_nr == 0 || page_nr > swap_header->info.last_page)
@@ -1985,11 +2284,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 		if (page_nr < maxpages) {
 			swap_map[page_nr] = SWAP_MAP_BAD;
 			nr_good_pages--;
+			/*
+			 * Haven't marked the cluster free yet, no list
+			 * operation involved
+			 */
+			inc_cluster_info_page(p, cluster_info, page_nr);
 		}
 	}
+	/* Haven't marked the cluster free yet, no list operation involved */
+	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
+		inc_cluster_info_page(p, cluster_info, i);
+
 	if (nr_good_pages) {
 		swap_map[0] = SWAP_MAP_BAD;
+		/*
+		 * Haven't marked the cluster free yet, no list
+		 * operation involved
+		 */
+		inc_cluster_info_page(p, cluster_info, 0);
 		p->max = maxpages;
 		p->pages = nr_good_pages;
 		nr_extents = setup_swap_extents(p, span);
@@ -1998,17 +2311,55 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 		nr_good_pages = p->pages;
 	}
 	if (!nr_good_pages) {
-		printk(KERN_WARNING "Empty swap-file\n");
+		pr_warn("Empty swap-file\n");
 		return -EINVAL;
 	}
+	if (!cluster_info)
+		return nr_extents;
+
+	for (i = 0; i < nr_clusters; i++) {
+		if (!cluster_count(&cluster_info[idx])) {
+			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
+			if (cluster_is_null(&p->free_cluster_head)) {
+				cluster_set_next_flag(&p->free_cluster_head,
+								idx, 0);
+				cluster_set_next_flag(&p->free_cluster_tail,
+								idx, 0);
+			} else {
+				unsigned int tail;
+
+				tail = cluster_next(&p->free_cluster_tail);
+				cluster_set_next(&cluster_info[tail], idx);
+				cluster_set_next_flag(&p->free_cluster_tail,
+								idx, 0);
+			}
+		}
+		idx++;
+		if (idx == nr_clusters)
+			idx = 0;
+ } return nr_extents; } +/* + * Helper to sys_swapon determining if a given swap + * backing device queue supports DISCARD operations. + */ +static bool swap_discardable(struct swap_info_struct *si) +{ + struct request_queue *q = bdev_get_queue(si->bdev); + + if (!q || !blk_queue_discard(q)) + return false; + + return true; +} + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; - char *name; + struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; int i; @@ -2019,6 +2370,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + struct swap_cluster_info *cluster_info = NULL; + unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -2032,13 +2385,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (IS_ERR(p)) return PTR_ERR(p); + INIT_WORK(&p->discard_work, swap_discard_work); + name = getname(specialfile); if (IS_ERR(name)) { error = PTR_ERR(name); name = NULL; goto bad_swap; } - swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); + swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; @@ -2091,25 +2446,74 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + /* + * select a random position to start with to help wear leveling + * SSD + */ + p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + + cluster_info = vzalloc(DIV_ROUND_UP(maxpages, + SWAPFILE_CLUSTER) * sizeof(*cluster_info)); + if (!cluster_info) { + error = -ENOMEM; + goto bad_swap; + } + p->percpu_cluster = alloc_percpu(struct percpu_cluster); + if (!p->percpu_cluster) { + error = -ENOMEM; + goto bad_swap; + } + for_each_possible_cpu(i) { + struct percpu_cluster *cluster; + cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster_set_null(&cluster->index); + } + } error = swap_cgroup_swapon(p->type, maxpages); if (error) goto bad_swap; nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, - maxpages, &span); + cluster_info, maxpages, &span); if (unlikely(nr_extents < 0)) { error = nr_extents; goto bad_swap; } + /* frontswap enabled? set up bit-per-page map for frontswap */ + if (frontswap_enabled) + frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); + + if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. + */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); - if (p->bdev) { - if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { - p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (random32() % p->highest_bit); + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. 
+ */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; + + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + pr_err("swapon: discard_swap(%p): %d\n", + p, err); } - if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; } mutex_lock(&swapon_mutex); @@ -2117,14 +2521,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map); + enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); - printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s\n", - p->pages<<(PAGE_SHIFT-10), name, p->prio, + pr_info("Adding %uk swap on %s. " + "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : ""); + (p->flags & SWP_DISCARDABLE) ? "D" : "", + (p->flags & SWP_AREA_DISCARD) ? "s" : "", + (p->flags & SWP_PAGE_DISCARD) ? "c" : "", + (frontswap_map) ? "FS" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); @@ -2135,6 +2542,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; bad_swap: + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); @@ -2146,6 +2555,7 @@ bad_swap: p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); + vfree(cluster_info); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { mutex_unlock(&inode->i_mutex); @@ -2177,7 +2587,7 @@ void si_swapinfo(struct sysinfo *val) if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += si->inuse_pages; } - val->freeswap = nr_swap_pages + nr_to_be_unused; + val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } @@ -2210,11 +2620,21 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p = swap_info[type]; offset = swp_offset(entry); - spin_lock(&swap_lock); + spin_lock(&p->lock); if (unlikely(offset >= p->max)) goto unlock_out; count = p->swap_map[offset]; + + /* + * swapin_readahead() doesn't check if a swap entry is valid, so the + * swap entry could be SWAP_MAP_BAD. Check here with lock held. 
+ */ + if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { + err = -ENOENT; + goto unlock_out; + } + has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; err = 0; @@ -2245,12 +2665,12 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p->swap_map[offset] = count | has_cache; unlock_out: - spin_unlock(&swap_lock); + spin_unlock(&p->lock); out: return err; bad_file: - printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } @@ -2292,6 +2712,31 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_HAS_CACHE); } +struct swap_info_struct *page_swap_info(struct page *page) +{ + swp_entry_t swap = { .val = page_private(page) }; + BUG_ON(!PageSwapCache(page)); + return swap_info[swp_type(swap)]; +} + +/* + * out-of-line __page_file_ methods to avoid include hell. + */ +struct address_space *__page_file_mapping(struct page *page) +{ + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + return page_swap_info(page)->swap_file->f_mapping; +} +EXPORT_SYMBOL_GPL(__page_file_mapping); + +pgoff_t __page_file_index(struct page *page) +{ + swp_entry_t swap = { .val = page_private(page) }; + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + return swp_offset(swap); +} +EXPORT_SYMBOL_GPL(__page_file_index); + /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's @@ -2345,14 +2790,14 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { - spin_unlock(&swap_lock); + spin_unlock(&si->lock); return -ENOMEM; } /* * We are fortunate that although vmalloc_to_page uses pte_offset_map, - * no architecture is using highmem pages for kernel pagetables: so it - * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. + * no architecture is using highmem pages for kernel page tables: so it + * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. */ head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; @@ -2393,7 +2838,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out: - spin_unlock(&swap_lock); + spin_unlock(&si->lock); outer: if (page) __free_page(page); diff --git a/mm/thrash.c b/mm/thrash.c deleted file mode 100644 index 57ad495dbd54..000000000000 --- a/mm/thrash.c +++ /dev/null @@ -1,155 +0,0 @@ -/* - * mm/thrash.c - * - * Copyright (C) 2004, Red Hat, Inc. - * Copyright (C) 2004, Rik van Riel <riel@redhat.com> - * Released under the GPL, see the file COPYING for details. - * - * Simple token based thrashing protection, using the algorithm - * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html - * - * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> - * Improved algorithm to pass token: - * Each task has a priority which is incremented if it contended - * for the token in an interval less than its previous attempt. - * If the token is acquired, that task's priority is boosted to prevent - * the token from bouncing around too often and to let the task make - * some progress in its execution. 
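Stepping back to the __swap_duplicate() hunk above: each swap_map byte packs a small usage count together with the SWAP_HAS_CACHE flag, so callers split the byte, adjust one half, and recombine. A compact sketch of that pack/unpack discipline, with illustrative flag and mask values rather than the kernel's:

#include <stdio.h>

#define HAS_CACHE	0x40	/* illustrative values, not the kernel's */
#define COUNT_MASK	0x3f

/* Duplicate a swap entry: bump the count half or set the cache flag. */
static unsigned char dup_entry(unsigned char map, int want_cache)
{
	unsigned char has_cache = map & HAS_CACHE;
	unsigned char count = map & COUNT_MASK;

	if (want_cache)
		has_cache = HAS_CACHE;	/* a swapcache page now exists */
	else
		count++;		/* one more pte references it */

	return count | has_cache;	/* recombine both halves */
}

int main(void)
{
	unsigned char m = 0;

	m = dup_entry(m, 0);		/* count 1 */
	m = dup_entry(m, 1);		/* count 1 + HAS_CACHE */
	printf("map byte %#x\n", m);	/* 0x41 */
	return 0;
}
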
- */ - -#include <linux/jiffies.h> -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/swap.h> -#include <linux/memcontrol.h> - -#include <trace/events/vmscan.h> - -#define TOKEN_AGING_INTERVAL (0xFF) - -static DEFINE_SPINLOCK(swap_token_lock); -struct mm_struct *swap_token_mm; -static struct mem_cgroup *swap_token_memcg; - -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) -{ - struct mem_cgroup *memcg; - - memcg = try_get_mem_cgroup_from_mm(mm); - if (memcg) - css_put(mem_cgroup_css(memcg)); - - return memcg; -} -#else -static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) -{ - return NULL; -} -#endif - -void grab_swap_token(struct mm_struct *mm) -{ - int current_interval; - unsigned int old_prio = mm->token_priority; - static unsigned int global_faults; - static unsigned int last_aging; - - global_faults++; - - current_interval = global_faults - mm->faultstamp; - - if (!spin_trylock(&swap_token_lock)) - return; - - /* First come first served */ - if (!swap_token_mm) - goto replace_token; - - /* - * Usually, we don't need priority aging because long interval faults - * makes priority decrease quickly. But there is one exception. If the - * token owner task is sleeping, it never make long interval faults. - * Thus, we need a priority aging mechanism instead. The requirements - * of priority aging are - * 1) An aging interval is reasonable enough long. Too short aging - * interval makes quick swap token lost and decrease performance. - * 2) The swap token owner task have to get priority aging even if - * it's under sleep. - */ - if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { - swap_token_mm->token_priority /= 2; - last_aging = global_faults; - } - - if (mm == swap_token_mm) { - mm->token_priority += 2; - goto update_priority; - } - - if (current_interval < mm->last_interval) - mm->token_priority++; - else { - if (likely(mm->token_priority > 0)) - mm->token_priority--; - } - - /* Check if we deserve the token */ - if (mm->token_priority > swap_token_mm->token_priority) - goto replace_token; - -update_priority: - trace_update_swap_token_priority(mm, old_prio, swap_token_mm); - -out: - mm->faultstamp = global_faults; - mm->last_interval = current_interval; - spin_unlock(&swap_token_lock); - return; - -replace_token: - mm->token_priority += 2; - trace_replace_swap_token(swap_token_mm, mm); - swap_token_mm = mm; - swap_token_memcg = swap_token_memcg_from_mm(mm); - last_aging = global_faults; - goto out; -} - -/* Called on process exit. */ -void __put_swap_token(struct mm_struct *mm) -{ - spin_lock(&swap_token_lock); - if (likely(mm == swap_token_mm)) { - trace_put_swap_token(swap_token_mm); - swap_token_mm = NULL; - swap_token_memcg = NULL; - } - spin_unlock(&swap_token_lock); -} - -static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b) -{ - if (!a) - return true; - if (!b) - return true; - if (a == b) - return true; - return false; -} - -void disable_swap_token(struct mem_cgroup *memcg) -{ - /* memcg reclaim don't disable unrelated mm token. 
*/ - if (match_memcg(memcg, swap_token_memcg)) { - spin_lock(&swap_token_lock); - if (match_memcg(memcg, swap_token_memcg)) { - trace_disable_swap_token(swap_token_mm); - swap_token_mm = NULL; - swap_token_memcg = NULL; - } - spin_unlock(&swap_token_lock); - } -} diff --git a/mm/truncate.c b/mm/truncate.c index 61a183b89df6..6a78c814bebf 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -22,11 +22,51 @@ #include <linux/cleancache.h> #include "internal.h" +static void clear_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + struct radix_tree_node *node; + void **slot; + + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrshadows--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); +unlock: + spin_unlock_irq(&mapping->tree_lock); +} /** * do_invalidatepage - invalidate part or all of a page * @page: the page which is affected - * @offset: the index of the truncation point + * @offset: start of the range to invalidate + * @length: length of the range to invalidate * * do_invalidatepage() is called when all or part of the page has become * invalidated by a truncate operation. @@ -37,24 +77,18 @@ * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. 
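clear_exceptional_entry(), added at the top of the mm/truncate.c hunk, is the lockless-lookup-then-revalidate pattern: the entry was found without the tree lock, so it must be re-checked under the lock and left alone if the slot has moved on. The same shape in miniature, with a mutex-protected array standing in for the radix tree:

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static void *slots[64];			/* stand-in for the radix tree */

/*
 * The caller found 'entry' at 'index' by a lockless lookup; it may
 * have been replaced since, so re-check under the lock before
 * clearing, and do nothing if the slot no longer holds it.
 */
static void clear_entry(size_t index, void *entry)
{
	pthread_mutex_lock(&tree_lock);
	if (slots[index] == entry)
		slots[index] = NULL;
	pthread_mutex_unlock(&tree_lock);
}

int main(void)
{
	int shadow;

	slots[3] = &shadow;
	clear_entry(3, &shadow);	/* still there: cleared */
	clear_entry(3, &shadow);	/* already gone: no-op */
	return 0;
}
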
*/ -void do_invalidatepage(struct page *page, unsigned long offset) +void do_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - void (*invalidatepage)(struct page *, unsigned long); + void (*invalidatepage)(struct page *, unsigned int, unsigned int); + invalidatepage = page->mapping->a_ops->invalidatepage; #ifdef CONFIG_BLOCK if (!invalidatepage) invalidatepage = block_invalidatepage; #endif if (invalidatepage) - (*invalidatepage)(page, offset); -} - -static inline void truncate_partial_page(struct page *page, unsigned partial) -{ - zero_user_segment(page, partial, PAGE_CACHE_SIZE); - cleancache_invalidate_page(page->mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, partial); + (*invalidatepage)(page, offset, length); } /* @@ -103,11 +137,10 @@ truncate_complete_page(struct address_space *mapping, struct page *page) return -EIO; if (page_has_private(page)) - do_invalidatepage(page, 0); + do_invalidatepage(page, 0, PAGE_CACHE_SIZE); cancel_dirty_page(page, PAGE_CACHE_SIZE); - clear_page_mlock(page); ClearPageMappedToDisk(page); delete_from_page_cache(page); return 0; @@ -132,7 +165,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; - clear_page_mlock(page); ret = remove_mapping(mapping, page); return ret; @@ -187,11 +219,11 @@ int invalidate_inode_page(struct page *page) * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate - * @lend: offset to which to truncate + * @lend: offset to which to truncate (inclusive) * * Truncate the page cache, removing the pages that are between - * specified offsets (and zeroing out partial page - * (if lstart is not page aligned)). + * specified offsets (and zeroing out partial pages + * if lstart or lend + 1 is not page aligned). * * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. The second pass @@ -202,37 +234,67 @@ int invalidate_inode_page(struct page *page) * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. + * + * Note that since ->invalidatepage() accepts range to invalidate + * truncate_inode_pages_range is able to handle cases where lend + 1 is not + * page aligned properly. 
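The rewritten truncate_inode_pages_range() in the hunk that follows derives everything from the inclusive byte range [lstart, lend]: offsets of the partial head and tail pages, plus a half-open range of pages to drop whole. The arithmetic in isolation, assuming 4K pages:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	long long lstart = 1000, lend = 10239;	/* inclusive byte range */

	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
	/* pages to drop whole form the half-open range [start, end) */
	unsigned long start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long end = (lend == -1) ? ~0UL : (lend + 1) >> PAGE_SHIFT;

	/* here: pages [1, 2); bytes 1000..4095 of page 0; 0..2047 of page 2 */
	printf("drop pages [%lu, %lu), head from %u, tail up to %u\n",
	       start, end, partial_start, partial_end);
	return 0;
}
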
*/ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); - struct pagevec pvec; - pgoff_t index; - pgoff_t end; - int i; + pgoff_t start; /* inclusive */ + pgoff_t end; /* exclusive */ + unsigned int partial_start; /* inclusive */ + unsigned int partial_end; /* exclusive */ + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t index; + int i; cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0) + if (mapping->nrpages == 0 && mapping->nrshadows == 0) return; - BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); - end = (lend >> PAGE_CACHE_SHIFT); + /* Offsets within partial pages */ + partial_start = lstart & (PAGE_CACHE_SIZE - 1); + partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); + + /* + * 'start' and 'end' always covers the range of pages to be fully + * truncated. Partial pages are covered with 'partial_start' at the + * start of the range and 'partial_end' at the end of the range. + * Note that 'end' is exclusive while 'lend' is inclusive. + */ + start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (lend == -1) + /* + * lend == -1 indicates end-of-file so we have to set 'end' + * to the highest possible pgoff_t and since the type is + * unsigned we're using -1. + */ + end = -1; + else + end = (lend + 1) >> PAGE_CACHE_SHIFT; pagevec_init(&pvec, 0); index = start; - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; - if (index > end) + index = indices[i]; + if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -243,33 +305,65 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); index++; } - if (partial) { + if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { + unsigned int top = PAGE_CACHE_SIZE; + if (start > end) { + /* Truncation within a single page */ + top = partial_end; + partial_end = 0; + } wait_on_page_writeback(page); - truncate_partial_page(page, partial); + zero_user_segment(page, partial_start, top); + cleancache_invalidate_page(mapping, page); + if (page_has_private(page)) + do_invalidatepage(page, partial_start, + top - partial_start); unlock_page(page); page_cache_release(page); } } + if (partial_end) { + struct page *page = find_lock_page(mapping, end); + if (page) { + wait_on_page_writeback(page); + zero_user_segment(page, 0, partial_end); + cleancache_invalidate_page(mapping, page); + if (page_has_private(page)) + do_invalidatepage(page, 0, + partial_end); + unlock_page(page); + page_cache_release(page); + } + } + /* + * If the truncation happened within a single page no pages + * will be released, just zeroed, so we can bail out now. 
+ */ + if (start >= end) + return; index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + if (!pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { if (index == start) break; index = start; continue; } - if (index == start && pvec.pages[0]->index > end) { + if (index == start && indices[0] >= end) { + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } @@ -278,16 +372,22 @@ void truncate_inode_pages_range(struct address_space *mapping, struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; - if (index > end) + index = indices[i]; + if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -315,6 +415,53 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) EXPORT_SYMBOL(truncate_inode_pages); /** + * truncate_inode_pages_final - truncate *all* pages before inode dies + * @mapping: mapping to truncate + * + * Called under (and serialized by) inode->i_mutex. + * + * Filesystems have to use this in the .evict_inode path to inform the + * VM that this is the final truncate and the inode is going away. + */ +void truncate_inode_pages_final(struct address_space *mapping) +{ + unsigned long nrshadows; + unsigned long nrpages; + + /* + * Page reclaim can not participate in regular inode lifetime + * management (can't call iput()) and thus can race with the + * inode teardown. Tell it when the address space is exiting, + * so that it does not install eviction information after the + * final truncate has begun. + */ + mapping_set_exiting(mapping); + + /* + * When reclaim installs eviction entries, it increases + * nrshadows first, then decreases nrpages. Make sure we see + * this in the right order or we might miss an entry. + */ + nrpages = mapping->nrpages; + smp_rmb(); + nrshadows = mapping->nrshadows; + + if (nrpages || nrshadows) { + /* + * As truncation uses a lockless tree lookup, cycle + * the tree lock to make sure any ongoing tree + * modification that does not see AS_EXITING is + * completed before starting the final truncate. + */ + spin_lock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + + truncate_inode_pages(mapping, 0); + } +} +EXPORT_SYMBOL(truncate_inode_pages_final); + +/** * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode * @mapping: the address_space which holds the pages to invalidate * @start: the offset 'from' which to invalidate @@ -330,32 +477,31 @@ EXPORT_SYMBOL(truncate_inode_pages); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; - /* - * Note: this function may get called on a shmem/tmpfs mapping: - * pagevec_lookup() might then return 0 prematurely (because it - * got a gangful of swap entries); but it's hardly worth worrying - * about - it can rarely have anything to free from such a mapping - * (most pages are dirty), and already skips over any difficulties. 
- */ - pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -369,6 +515,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, deactivate_page(page); count += ret; } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -398,9 +545,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (PageDirty(page)) goto failed; - clear_page_mlock(page); BUG_ON(page_has_private(page)); - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -437,6 +583,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index; int i; @@ -447,17 +594,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cleancache_invalidate_inode(mapping); pagevec_init(&pvec, 0); index = start; - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); if (page->mapping != mapping) { @@ -495,6 +648,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret = ret2; unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -523,7 +677,6 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode - * @oldsize: old file size * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache @@ -536,7 +689,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. 
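Each of the walks patched above shares one skeleton: fetch a batch of occupied slots, dispatch shadow (exceptional) entries and real pages differently, then continue past the batch. A toy rendition in which an int array plays the mapping and negative values play shadow entries:

#include <stdio.h>

#define BATCH	14	/* like PAGEVEC_SIZE */
#define NSLOT	40

/* Toy mapping: 0 = empty, >0 = page id, <0 = shadow (exceptional) entry. */
static int slots[NSLOT];

static int lookup_batch(unsigned long index, unsigned long *found)
{
	int n = 0;

	for (; index < NSLOT && n < BATCH; index++)
		if (slots[index])
			found[n++] = index;
	return n;
}

static void walk(unsigned long start, unsigned long end)	/* end exclusive */
{
	unsigned long found[BATCH], index = start;
	int n, i;

	while (index < end && (n = lookup_batch(index, found))) {
		for (i = 0; i < n; i++) {
			index = found[i];	/* deletions may shift results */
			if (index >= end)
				return;
			if (slots[index] < 0)
				slots[index] = 0;	/* drop shadow entry */
			else {
				printf("truncate page %d\n", slots[index]);
				slots[index] = 0;
			}
		}
		index++;
	}
}

int main(void)
{
	slots[1] = 11;
	slots[2] = -1;
	slots[30] = 12;
	walk(0, NSLOT);
	return 0;
}
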
*/ -void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) +void truncate_pagecache(struct inode *inode, loff_t newsize) { struct address_space *mapping = inode->i_mapping; loff_t holebegin = round_up(newsize, PAGE_SIZE); @@ -570,64 +723,12 @@ EXPORT_SYMBOL(truncate_pagecache); */ void truncate_setsize(struct inode *inode, loff_t newsize) { - loff_t oldsize; - - oldsize = inode->i_size; i_size_write(inode, newsize); - - truncate_pagecache(inode, oldsize, newsize); + truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); /** - * vmtruncate - unmap mappings "freed" by truncate() syscall - * @inode: inode of the file used - * @newsize: file offset to start truncating - * - * This function is deprecated and truncate_setsize or truncate_pagecache - * should be used instead, together with filesystem specific block truncation. - */ -int vmtruncate(struct inode *inode, loff_t newsize) -{ - int error; - - error = inode_newsize_ok(inode, newsize); - if (error) - return error; - - truncate_setsize(inode, newsize); - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -} -EXPORT_SYMBOL(vmtruncate); - -int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) -{ - struct address_space *mapping = inode->i_mapping; - loff_t holebegin = round_up(lstart, PAGE_SIZE); - loff_t holelen = 1 + lend - holebegin; - - /* - * If the underlying filesystem is not going to provide - * a way to truncate a range of blocks (punch a hole) - - * we should return failure right now. - */ - if (!inode->i_op->truncate_range) - return -ENOSYS; - - mutex_lock(&inode->i_mutex); - inode_dio_wait(inode); - unmap_mapping_range(mapping, holebegin, holelen, 1); - inode->i_op->truncate_range(inode, lstart, lend); - /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(mapping, holebegin, holelen, 1); - mutex_unlock(&inode->i_mutex); - - return 0; -} - -/** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole @@ -649,10 +750,8 @@ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) * This rounding is currently just for example: unmap_mapping_range * expands its hole outwards, whereas we want it to contract the hole * inwards. However, existing callers of truncate_pagecache_range are - * doing their own page rounding first; and truncate_inode_pages_range - * currently BUGs if lend is not pagealigned-1 (it handles partial - * page at start of hole, but not partial page at end of hole). Note - * unmap_mapping_range allows holelen 0 for all, and we allow lend -1. + * doing their own page rounding first. Note that unmap_mapping_range + * allows holelen 0 for all, and we allow lend -1 for end of file. 
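The truncate_pagecache_range() comment above is about contracting the hole inwards to whole pages before unmapping: round the start up to a page boundary, then measure the length from there to the inclusive end. The computation on its own:

#include <stdio.h>

#define PAGE_SIZE	4096LL

int main(void)
{
	long long lstart = 5000, lend = 20000;	/* requested hole, inclusive */

	/* contract inwards: first page boundary at or after lstart ... */
	long long holebegin = (lstart + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
	/* ... through the inclusive end */
	long long holelen = 1 + lend - holebegin;

	printf("unmap %lld bytes from %lld\n", holelen, holebegin);
	return 0;
}
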
*/ /* diff --git a/mm/util.c b/mm/util.c index ae962b31de88..d5ea733c5082 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,9 +1,17 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/compiler.h> #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> +#include <linux/security.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/mman.h> +#include <linux/hugetlb.h> +#include <linux/vmalloc.h> + #include <asm/uaccess.h> #include "internal.h" @@ -104,6 +112,25 @@ void *memdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user); +static __always_inline void *__do_krealloc(const void *p, size_t new_size, + gfp_t flags) +{ + void *ret; + size_t ks = 0; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; +} + /** * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. @@ -116,23 +143,11 @@ EXPORT_SYMBOL(memdup_user); */ void *__krealloc(const void *p, size_t new_size, gfp_t flags) { - void *ret; - size_t ks = 0; - if (unlikely(!new_size)) return ZERO_SIZE_PTR; - if (p) - ks = ksize(p); - - if (ks >= new_size) - return (void *)p; + return __do_krealloc(p, new_size, flags); - ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); - - return ret; } EXPORT_SYMBOL(__krealloc); @@ -144,7 +159,7 @@ EXPORT_SYMBOL(__krealloc); * * The contents of the object pointed to are preserved up to the * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a * %NULL pointer, the object pointed to is freed. */ void *krealloc(const void *p, size_t new_size, gfp_t flags) @@ -156,7 +171,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) return ZERO_SIZE_PTR; } - ret = __krealloc(p, new_size, flags); + ret = __do_krealloc(p, new_size, flags); if (ret && p != ret) kfree(p); @@ -285,7 +300,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } #endif @@ -295,7 +309,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * If the architecture not support this function, simply return with no * page pinned */ -int __attribute__((weak)) __get_user_pages_fast(unsigned long start, +int __weak __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { return 0; @@ -326,7 +340,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * callers need to carefully consider what to use. On many architectures, * get_user_pages_fast simply falls back to get_user_pages. 
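The __do_krealloc() factored out above leans on ksize(): when the allocator already granted enough slack, the old object is returned as-is and nothing is copied or freed. A rough user-space analogue, assuming glibc's malloc_usable_size() as the ksize() stand-in:

#include <malloc.h>	/* malloc_usable_size(), a glibc extension */
#include <stdlib.h>
#include <string.h>

/*
 * Return the old object untouched when the allocator already granted
 * enough slack; otherwise allocate anew and copy what the old object
 * could hold.  Like __krealloc(), this never frees 'p' itself.
 */
static void *grow(void *p, size_t new_size)
{
	size_t ks = p ? malloc_usable_size(p) : 0;
	void *ret;

	if (ks >= new_size)
		return p;

	ret = malloc(new_size);
	if (ret && p)
		memcpy(ret, p, ks);
	return ret;
}

int main(void)
{
	char *p = malloc(10);
	char *q = grow(p, 4);	/* almost certainly p itself */

	if (q != p)
		free(p);
	free(q);
	return 0;
}

The point of the hunk's split is ownership: krealloc() frees the old pointer when a fresh block comes back, while __krealloc() deliberately leaves that to the caller.
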
*/ -int __attribute__((weak)) get_user_pages_fast(unsigned long start, +int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; @@ -341,6 +355,156 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, } EXPORT_SYMBOL_GPL(get_user_pages_fast); +unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff) +{ + unsigned long ret; + struct mm_struct *mm = current->mm; + unsigned long populate; + + ret = security_mmap_file(file, prot, flag); + if (!ret) { + down_write(&mm->mmap_sem); + ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, + &populate); + up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); + } + return ret; +} + +unsigned long vm_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + if (unlikely(offset + PAGE_ALIGN(len) < offset)) + return -EINVAL; + if (unlikely(offset & ~PAGE_MASK)) + return -EINVAL; + + return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +} +EXPORT_SYMBOL(vm_mmap); + +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +struct address_space *page_mapping(struct page *page) +{ + struct address_space *mapping = page->mapping; + + /* This happens if someone calls flush_dcache_page on slab page */ + if (unlikely(PageSlab(page))) + return NULL; + + if (unlikely(PageSwapCache(page))) { + swp_entry_t entry; + + entry.val = page_private(page); + mapping = swap_address_space(entry); + } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) + mapping = NULL; + return mapping; +} + +int overcommit_ratio_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_kbytes = 0; + return ret; +} + +int overcommit_kbytes_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_ratio = 0; + return ret; +} + +/* + * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used + */ +unsigned long vm_commit_limit(void) +{ + unsigned long allowed; + + if (sysctl_overcommit_kbytes) + allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); + else + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100); + allowed += total_swap_pages; + + return allowed; +} + +/** + * get_cmdline() - copy the cmdline value to a buffer. + * @task: the task whose cmdline value to copy. + * @buffer: the buffer to copy to. + * @buflen: the length of the buffer. Larger cmdline values are truncated + * to this length. + * Returns the size of the cmdline field copied. Note that the copy does + * not guarantee an ending NULL byte. + */ +int get_cmdline(struct task_struct *task, char *buffer, int buflen) +{ + int res = 0; + unsigned int len; + struct mm_struct *mm = get_task_mm(task); + if (!mm) + goto out; + if (!mm->arg_end) + goto out_mm; /* Shh! 
No looking before we're done */ + + len = mm->arg_end - mm->arg_start; + + if (len > buflen) + len = buflen; + + res = access_process_vm(task, mm->arg_start, buffer, len, 0); + + /* + * If the nul at the end of args has been overwritten, then + * assume application is using setproctitle(3). + */ + if (res > 0 && buffer[res-1] != '\0' && len < buflen) { + len = strnlen(buffer, res); + if (len < res) { + res = len; + } else { + len = mm->env_end - mm->env_start; + if (len > buflen - res) + len = buflen - res; + res += access_process_vm(task, mm->env_start, + buffer+res, len, 0); + res = strnlen(buffer, res); + } + } +out_mm: + mmput(mm); +out: + return res; +} + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/vmacache.c b/mm/vmacache.c new file mode 100644 index 000000000000..1037a3bab505 --- /dev/null +++ b/mm/vmacache.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2014 Davidlohr Bueso. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmacache.h> + +/* + * Flush vma caches for threads that share a given mm. + * + * The operation is safe because the caller holds the mmap_sem + * exclusively and other threads accessing the vma cache will + * have mmap_sem held at least for read, so no extra locking + * is required to maintain the vma cache. + */ +void vmacache_flush_all(struct mm_struct *mm) +{ + struct task_struct *g, *p; + + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * Only flush the vmacache pointers as the + * mm seqnum is already set and curr's will + * be set upon invalidation when the next + * lookup is done. + */ + if (mm == p->mm) + vmacache_flush(p); + } + rcu_read_unlock(); +} + +/* + * This task may be accessing a foreign mm via (for example) + * get_user_pages()->find_vma(). The vmacache is task-local and this + * task's vmacache pertains to a different mm (ie, its own). There is + * nothing we can do here. + * + * Also handle the case where a kernel thread has adopted this mm via use_mm(). + * That kernel thread's vmacache is not applicable to this mm. + */ +static bool vmacache_valid_mm(struct mm_struct *mm) +{ + return current->mm == mm && !(current->flags & PF_KTHREAD); +} + +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) +{ + if (vmacache_valid_mm(newvma->vm_mm)) + current->vmacache[VMACACHE_HASH(addr)] = newvma; +} + +static bool vmacache_valid(struct mm_struct *mm) +{ + struct task_struct *curr; + + if (!vmacache_valid_mm(mm)) + return false; + + curr = current; + if (mm->vmacache_seqnum != curr->vmacache_seqnum) { + /* + * First attempt will always be invalid, initialize + * the new cache for this task here. 
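vmacache_valid() above carries the scheme's whole trick: the mm holds a sequence number, each thread remembers the value it last synchronized with, and bumping the mm-side counter invalidates every thread's cache at once, lazily. Stripped to its essentials (invented names, no concurrency handling):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define CACHE_SIZE	4

struct space { unsigned long seqnum; };		/* the shared "mm" side */

struct tcache {					/* one per thread */
	unsigned long seqnum;
	void *slot[CACHE_SIZE];
};

/* Writers invalidate every thread at once: */
static void invalidate(struct space *s)
{
	s->seqnum++;
}

/* Readers notice lazily, flushing on first use after a bump: */
static bool cache_valid(struct tcache *tc, struct space *s)
{
	if (tc->seqnum != s->seqnum) {
		tc->seqnum = s->seqnum;
		memset(tc->slot, 0, sizeof(tc->slot));
		return false;
	}
	return true;
}

int main(void)
{
	struct space s = { 0 };
	struct tcache tc = { 0 };
	int obj;

	cache_valid(&tc, &s);
	tc.slot[0] = &obj;
	invalidate(&s);
	printf("valid after bump? %d\n", cache_valid(&tc, &s));	/* 0 */
	return 0;
}
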
+ */ + curr->vmacache_seqnum = mm->vmacache_seqnum; + vmacache_flush(curr); + return false; + } + return true; +} + +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (!vma) + continue; + if (WARN_ON_ONCE(vma->vm_mm != mm)) + break; + if (vma->vm_start <= addr && vma->vm_end > addr) + return vma; + } + + return NULL; +} + +#ifndef CONFIG_MMU +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start == start && vma->vm_end == end) + return vma; + } + + return NULL; +} +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 94dff883b449..bf233b283319 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -27,10 +27,32 @@ #include <linux/pfn.h> #include <linux/kmemleak.h> #include <linux/atomic.h> +#include <linux/compiler.h> +#include <linux/llist.h> + #include <asm/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> +struct vfree_deferred { + struct llist_head list; + struct work_struct wq; +}; +static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); + +static void __vunmap(const void *, int); + +static void free_work(struct work_struct *w) +{ + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *llnode = llist_del_all(&p->list); + while (llnode) { + void *p = llnode; + llnode = llist_next(llnode); + __vunmap(p, 1); + } +} + /*** Page table manipulation functions ***/ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) @@ -249,19 +271,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 -struct vmap_area { - unsigned long va_start; - unsigned long va_end; - unsigned long flags; - struct rb_node rb_node; /* address sorted rbtree */ - struct list_head list; /* address sorted list */ - struct list_head purge_list; /* "lazy purge" list */ - struct vm_struct *vm; - struct rcu_head rcu_head; -}; - static DEFINE_SPINLOCK(vmap_area_lock); -static LIST_HEAD(vmap_area_list); +/* Export for kexec only */ +LIST_HEAD(vmap_area_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -282,7 +294,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) va = rb_entry(n, struct vmap_area, rb_node); if (addr < va->va_start) n = n->rb_left; - else if (addr > va->va_start) + else if (addr >= va->va_end) n = n->rb_right; else return va; @@ -313,7 +325,7 @@ static void __insert_vmap_area(struct vmap_area *va) rb_link_node(&va->rb_node, parent, p); rb_insert_color(&va->rb_node, &vmap_area_root); - /* address-sort this list so it is usable like the vmlist */ + /* address-sort this list */ tmp = rb_prev(&va->rb_node); if (tmp) { struct vmap_area *prev; @@ -349,6 +361,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (unlikely(!va)) return ERR_PTR(-ENOMEM); + /* + * Only scan the relevant parts containing pointers to other objects + * to avoid false negatives. 
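free_work() above is the consumer half of the standard llist pattern that this patch uses for deferred vfree(): producers push with llist_add() from any context, and the consumer detaches the entire list with a single atomic llist_del_all(). A hedged standalone sketch; the object type and work item are illustrative:

struct example_obj {
	struct llist_node llnode;
};

static LLIST_HEAD(example_list);
static struct work_struct example_work;		/* assumed initialized */

static void example_producer(struct example_obj *obj)	/* irq-safe */
{
	/* llist_add() returns true iff the list was empty before */
	if (llist_add(&obj->llnode, &example_list))
		schedule_work(&example_work);
}

static void example_consumer(struct work_struct *w)
{
	struct llist_node *node = llist_del_all(&example_list);

	while (node) {
		struct llist_node *next = llist_next(node);
		/* ... free container_of(node, struct example_obj, llnode) ... */
		node = next;
	}
}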
+ */ + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); + retry: spin_lock(&vmap_area_lock); /* @@ -378,12 +396,12 @@ nocache: addr = ALIGN(first->va_end, align); if (addr < vstart) goto nocache; - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; } else { addr = ALIGN(vstart, align); - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; n = vmap_area_root.rb_node; @@ -410,14 +428,14 @@ nocache: if (addr + cached_hole_size < first->va_start) cached_hole_size = first->va_start - addr; addr = ALIGN(first->va_end, align); - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; - n = rb_next(&first->rb_node); - if (n) - first = rb_entry(n, struct vmap_area, rb_node); - else + if (list_is_last(&first->list, &vmap_area_list)) goto found; + + first = list_entry(first->list.next, + struct vmap_area, list); } found: @@ -742,9 +760,7 @@ struct vmap_block_queue { struct vmap_block { spinlock_t lock; struct vmap_area *va; - struct vmap_block_queue *vbq; unsigned long free, dirty; - DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); struct list_head free_list; struct rcu_head rcu_head; @@ -810,7 +826,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vb->va = va; vb->free = VMAP_BBMAP_BITS; vb->dirty = 0; - bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); INIT_LIST_HEAD(&vb->free_list); @@ -822,7 +837,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) radix_tree_preload_end(); vbq = &get_cpu_var(vmap_block_queue); - vb->vbq = vbq; spin_lock(&vbq->lock); list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); @@ -863,7 +877,6 @@ static void purge_fragmented_blocks(int cpu) if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { vb->free = 0; /* prevent further allocs after releasing lock */ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ - bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); @@ -881,11 +894,6 @@ static void purge_fragmented_blocks(int cpu) } } -static void purge_fragmented_blocks_thiscpu(void) -{ - purge_fragmented_blocks(smp_processor_id()); -} - static void purge_fragmented_blocks_allcpus(void) { int cpu; @@ -900,10 +908,17 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) struct vmap_block *vb; unsigned long addr = 0; unsigned int order; - int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + if (WARN_ON(size == 0)) { + /* + * Allocating 0 bytes isn't what caller wants since + * get_order(0) returns funny result. Just warn and terminate + * early. 
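The overflow checks in alloc_vmap_area() were tightened above from 'addr + size - 1 < addr' to 'addr + size < addr'. For a non-zero size this relies on well-defined unsigned wrap-around: the sum is smaller than the base exactly when the range would wrap the address space. A hedged one-liner illustration:

static bool example_range_wraps(unsigned long addr, unsigned long size)
{
	return addr + size < addr;	/* true iff [addr, addr + size) wraps */
}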
+ */ + return NULL; + } order = get_order(size); again: @@ -916,17 +931,7 @@ again: if (vb->free < 1UL << order) goto next; - i = bitmap_find_free_region(vb->alloc_map, - VMAP_BBMAP_BITS, order); - - if (i < 0) { - if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { - /* fragmented and no outstanding allocations */ - BUG_ON(vb->dirty != VMAP_BBMAP_BITS); - purge = 1; - } - goto next; - } + i = VMAP_BBMAP_BITS - vb->free; addr = vb->va->va_start + (i << PAGE_SHIFT); BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(vb->va->va_start)); @@ -942,9 +947,6 @@ next: spin_unlock(&vb->lock); } - if (purge) - purge_fragmented_blocks_thiscpu(); - put_cpu_var(vmap_block_queue); rcu_read_unlock(); @@ -1022,15 +1024,16 @@ void vm_unmap_aliases(void) rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { - int i; + int i, j; spin_lock(&vb->lock); i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); - while (i < VMAP_BBMAP_BITS) { + if (i < VMAP_BBMAP_BITS) { unsigned long s, e; - int j; - j = find_next_zero_bit(vb->dirty_map, - VMAP_BBMAP_BITS, i); + + j = find_last_bit(vb->dirty_map, + VMAP_BBMAP_BITS); + j = j + 1; /* need exclusive index */ s = vb->va->va_start + (i << PAGE_SHIFT); e = vb->va->va_start + (j << PAGE_SHIFT); @@ -1040,10 +1043,6 @@ void vm_unmap_aliases(void) start = s; if (e > end) end = e; - - i = j; - i = find_next_bit(vb->dirty_map, - VMAP_BBMAP_BITS, i); } spin_unlock(&vb->lock); } @@ -1086,6 +1085,12 @@ EXPORT_SYMBOL(vm_unmap_ram); * @node: prefer to allocate data structures on this node * @prot: memory protection to use. PAGE_KERNEL for regular RAM * + * If you use this function for less than VMAP_MAX_ALLOC pages, it could be + * faster than vmap so it's good. But if you mix long-life and short-life + * objects with vm_map_ram(), it could consume lots of address space through + * fragmentation (especially on a 32bit machine). You could see failures in + * the end. Please use this function for short-lived objects. + * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) @@ -1117,6 +1122,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); +static struct vm_struct *vmlist __initdata; /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -1176,18 +1182,23 @@ void __init vmalloc_init(void) for_each_possible_cpu(i) { struct vmap_block_queue *vbq; + struct vfree_deferred *p; vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, free_work); } /* Import existing vmlist entries. 
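Following the guidance added to the vm_map_ram() kerneldoc above, the interface is best kept to short-lived mappings. A hedged usage sketch with error handling trimmed; NUMA_NO_NODE and PAGE_KERNEL are the usual defaults:

static void example_peek_pages(struct page **pages, unsigned int count)
{
	void *addr = vm_map_ram(pages, count, NUMA_NO_NODE, PAGE_KERNEL);

	if (!addr)
		return;
	/* ... brief access through the new mapping ... */
	vm_unmap_ram(addr, count);
}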
*/ for (tmp = vmlist; tmp; tmp = tmp->next) { va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); - va->flags = tmp->flags | VM_VM_AREA; + va->flags = VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; + va->vm = tmp; __insert_vmap_area(va); } @@ -1261,7 +1272,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + area->size - PAGE_SIZE; + unsigned long end = addr + get_vm_area_size(area); int err; err = vmap_page_range(addr, end, prot, *pages); @@ -1274,61 +1285,40 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) } EXPORT_SYMBOL_GPL(map_vm_area); -/*** Old vmalloc interfaces ***/ -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; - static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, void *caller) + unsigned long flags, const void *caller) { + spin_lock(&vmap_area_lock); vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; va->flags |= VM_VM_AREA; + spin_unlock(&vmap_area_lock); } -static void insert_vmalloc_vmlist(struct vm_struct *vm) -{ - struct vm_struct *tmp, **p; - - vm->flags &= ~VM_UNLIST; - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { - if (tmp->addr >= vm->addr) - break; - } - vm->next = *p; - *p = vm; - write_unlock(&vmlist_lock); -} - -static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, void *caller) +static void clear_vm_uninitialized_flag(struct vm_struct *vm) { - setup_vmalloc_vm(vm, va, flags, caller); - insert_vmalloc_vmlist(vm); + /* + * Before removing VM_UNINITIALIZED, + * we should make sure that vm has proper values. + * Pair with smp_rmb() in show_numa_info(). + */ + smp_wmb(); + vm->flags &= ~VM_UNINITIALIZED; } static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, - unsigned long end, int node, gfp_t gfp_mask, void *caller) + unsigned long end, int node, gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; BUG_ON(in_interrupt()); - if (flags & VM_IOREMAP) { - int bit = fls(size); - - if (bit > IOREMAP_MAX_ORDER) - bit = IOREMAP_MAX_ORDER; - else if (bit < PAGE_SHIFT) - bit = PAGE_SHIFT; - - align = 1ul << bit; - } + if (flags & VM_IOREMAP) + align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); size = PAGE_ALIGN(size); if (unlikely(!size)) @@ -1349,17 +1339,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - /* - * When this function is called from __vmalloc_node_range, - * we do not add vm_struct to vmlist here to avoid - * accessing uninitialized members of vm_struct such as - * pages and nr_pages fields. They will be set later. - * To distinguish it from others, we use a VM_UNLIST flag. 
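The smp_wmb() in clear_vm_uninitialized_flag() above publishes a fully initialized vm_struct; any reader that tests VM_UNINITIALIZED must issue the matching read barrier first. A condensed sketch of the consumer side (show_numa_info() later in this patch does exactly this):

	/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
	smp_rmb();
	if (v->flags & VM_UNINITIALIZED)
		return;	/* nr_pages and friends may not be stable yet */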
- */ - if (flags & VM_UNLIST) - setup_vmalloc_vm(area, va, flags, caller); - else - insert_vmalloc_vm(area, va, flags, caller); + setup_vmalloc_vm(area, va, flags, caller); return area; } @@ -1367,17 +1347,17 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, - __builtin_return_address(0)); + return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, + GFP_KERNEL, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(__get_vm_area); struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, - void *caller) + const void *caller) { - return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, - caller); + return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, + GFP_KERNEL, caller); } /** @@ -1392,17 +1372,26 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - -1, GFP_KERNEL, __builtin_return_address(0)); + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, - void *caller) + const void *caller) { return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - -1, GFP_KERNEL, caller); + NUMA_NO_NODE, GFP_KERNEL, caller); } -static struct vm_struct *find_vm_area(const void *addr) +/** + * find_vm_area - find a continuous kernel virtual area + * @addr: base address + * + * Search for the kernel VM area starting at @addr, and return it. + * It is up to the caller to do all required locking to keep the returned + * pointer valid. + */ +struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; @@ -1429,19 +1418,10 @@ struct vm_struct *remove_vm_area(const void *addr) if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; - if (!(vm->flags & VM_UNLIST)) { - struct vm_struct *tmp, **p; - /* - * remove from list and disallow access to - * this vm_struct before unmap. (address range - * confliction is maintained by vmap.) - */ - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) - ; - *p = tmp->next; - write_unlock(&vmlist_lock); - } + spin_lock(&vmap_area_lock); + va->vm = NULL; + va->flags &= ~VM_VM_AREA; + spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); free_unmap_vmap_area(va); @@ -1459,10 +1439,9 @@ static void __vunmap(const void *addr, int deallocate_pages) if (!addr) return; - if ((PAGE_SIZE-1) & (unsigned long)addr) { - WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); + if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", + addr)) return; - } area = remove_vm_area(addr); if (unlikely(!area)) { @@ -1493,7 +1472,7 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); return; } - + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1502,15 +1481,26 @@ static void __vunmap(const void *addr, int deallocate_pages) * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is * NULL, no operation is performed. * - * Must not be called in interrupt context. 
+ * Must not be called in NMI context (strictly speaking, only if we don't + * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling + * conventions for vfree() arch-depenedent would be a really bad idea) + * + * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) { - BUG_ON(in_interrupt()); + BUG_ON(in_nmi()); kmemleak_free(addr); - __vunmap(addr, 1); + if (!addr) + return; + if (unlikely(in_interrupt())) { + struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); + } else + __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -1527,7 +1517,8 @@ void vunmap(const void *addr) { BUG_ON(in_interrupt()); might_sleep(); - __vunmap(addr, 0); + if (addr) + __vunmap(addr, 0); } EXPORT_SYMBOL(vunmap); @@ -1567,29 +1558,28 @@ EXPORT_SYMBOL(vmap); static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller); + int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node, void *caller) + pgprot_t prot, int node) { const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; + nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, - PAGE_KERNEL, node, caller); + PAGE_KERNEL, node, area->caller); area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, nested_gfp, node); } area->pages = pages; - area->caller = caller; if (!area->pages) { remove_vm_area(area->addr); kfree(area); @@ -1600,7 +1590,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; - if (node < 0) + if (node == NUMA_NO_NODE) page = alloc_page(tmp_mask); else page = alloc_pages_node(node, tmp_mask, order); @@ -1633,7 +1623,7 @@ fail: * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages - * @node: node to use for allocation or -1 + * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level @@ -1642,7 +1632,7 @@ fail: */ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, int node, void *caller) + pgprot_t prot, int node, const void *caller) { struct vm_struct *area; void *addr; @@ -1652,27 +1642,28 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail; - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, start, end, node, gfp_mask, caller); if (!area) goto fail; - addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + addr = __vmalloc_area_node(area, gfp_mask, prot, node); if (!addr) return NULL; /* - * In this function, newly allocated vm_struct is not added - * to vmlist at __get_vm_area_node(). so, it is added here. + * In this function, newly allocated vm_struct has VM_UNINITIALIZED + * flag. 
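With the deferred path above, vfree() becomes safe from (non-NMI) atomic context: the object being freed is itself reused as the llist node, hence the size >= sizeof(llist_node) requirement in the comment. A hedged sketch of a caller that this change enables:

static void example_free_from_softirq(void *buf)
{
	/*
	 * In interrupt context this only queues buf on the per-cpu
	 * vfree_deferred list; the actual __vunmap() runs later from
	 * free_work() in workqueue context.
	 */
	vfree(buf);
}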
It means that vm_struct is not fully initialized. + * Now, it is fully initialized, so remove this flag here. */ - insert_vmalloc_vmlist(area); + clear_vm_uninitialized_flag(area); /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. + * A ref_count = 2 is needed because vm_struct allocated in + * __get_vm_area_node() contains a reference to the virtual address of + * the vmalloc'ed block. */ - kmemleak_alloc(addr, real_size, 3, gfp_mask); + kmemleak_alloc(addr, real_size, 2, gfp_mask); return addr; @@ -1689,7 +1680,7 @@ fail: * @align: desired alignment * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages - * @node: node to use for allocation or -1 + * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * * Allocate enough pages to cover @size from the page level @@ -1698,7 +1689,7 @@ fail: */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) + int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, prot, node, caller); @@ -1706,7 +1697,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_node(size, 1, gfp_mask, prot, -1, + return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); @@ -1729,7 +1720,8 @@ static inline void *__vmalloc_node_flags(unsigned long size, */ void *vmalloc(unsigned long size) { - return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); + return __vmalloc_node_flags(size, NUMA_NO_NODE, + GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); @@ -1745,7 +1737,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc_node_flags(size, -1, + return __vmalloc_node_flags(size, NUMA_NO_NODE, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -1764,7 +1756,8 @@ void *vmalloc_user(unsigned long size) ret = __vmalloc_node(size, SHMLBA, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL, -1, __builtin_return_address(0)); + PAGE_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); area->flags |= VM_USERMAP; @@ -1829,7 +1822,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, - -1, __builtin_return_address(0)); + NUMA_NO_NODE, __builtin_return_address(0)); } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) @@ -1850,7 +1843,7 @@ void *vmalloc_exec(unsigned long size) void *vmalloc_32(unsigned long size) { return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, - -1, __builtin_return_address(0)); + NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -1867,7 +1860,7 @@ void *vmalloc_32_user(unsigned long size) void *ret; ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, - -1, __builtin_return_address(0)); + NUMA_NO_NODE, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); area->flags |= VM_USERMAP; @@ -1974,9 +1967,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) * IOREMAP area is treated as memory hole and no copy is done. 
* * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. - * @buf should be kernel's buffer. Because this function uses KM_USER0, - * the caller should guarantee KM_USER0 is not used. + * vm_struct area, returns 0. @buf should be kernel's buffer. * * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). @@ -1987,7 +1978,8 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) long vread(char *buf, char *addr, unsigned long count) { - struct vm_struct *tmp; + struct vmap_area *va; + struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; unsigned long n; @@ -1996,10 +1988,17 @@ long vread(char *buf, char *addr, unsigned long count) if ((unsigned long) addr + count < count) count = -(unsigned long) addr; - read_lock(&vmlist_lock); - for (tmp = vmlist; count && tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &vmap_area_list, list) { + if (!count) + break; + + if (!(va->flags & VM_VM_AREA)) + continue; + + vm = va->vm; + vaddr = (char *) vm->addr; + if (addr >= vaddr + get_vm_area_size(vm)) continue; while (addr < vaddr) { if (count == 0) @@ -2009,10 +2008,10 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + get_vm_area_size(vm) - addr; if (n > count) n = count; - if (!(tmp->flags & VM_IOREMAP)) + if (!(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n); @@ -2021,7 +2020,7 @@ long vread(char *buf, char *addr, unsigned long count) count -= n; } finished: - read_unlock(&vmlist_lock); + spin_unlock(&vmap_area_lock); if (buf == buf_start) return 0; @@ -2050,9 +2049,7 @@ finished: * IOREMAP area is treated as memory hole and no copy is done. * * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. - * @buf should be kernel's buffer. Because this function uses KM_USER0, - * the caller should guarantee KM_USER0 is not used. + * vm_struct area, returns 0. @buf should be kernel's buffer. * * Note: In usual ops, vwrite() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). 
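vread() now walks vmap_area_list under vmap_area_lock instead of the old vmlist, but its contract is unchanged: holes are zero-filled, IOREMAP ranges are skipped, and 0 means the range hit no live area. A hedged caller sketch in the style of /proc/kcore:

static long example_probe(char *kbuf, char *vaddr, unsigned long count)
{
	long n = vread(kbuf, vaddr, count);

	if (n == 0)		/* no live vm_struct area intersected */
		return -ENXIO;
	return n;		/* holes within the range arrive zeroed */
}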
@@ -2062,7 +2059,8 @@ finished: long vwrite(char *buf, char *addr, unsigned long count) { - struct vm_struct *tmp; + struct vmap_area *va; + struct vm_struct *vm; char *vaddr; unsigned long n, buflen; int copied = 0; @@ -2072,10 +2070,17 @@ long vwrite(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; buflen = count; - read_lock(&vmlist_lock); - for (tmp = vmlist; count && tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &vmap_area_list, list) { + if (!count) + break; + + if (!(va->flags & VM_VM_AREA)) + continue; + + vm = va->vm; + vaddr = (char *) vm->addr; + if (addr >= vaddr + get_vm_area_size(vm)) continue; while (addr < vaddr) { if (count == 0) @@ -2084,10 +2089,10 @@ long vwrite(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + tmp->size - PAGE_SIZE - addr; + n = vaddr + get_vm_area_size(vm) - addr; if (n > count) n = count; - if (!(tmp->flags & VM_IOREMAP)) { + if (!(vm->flags & VM_IOREMAP)) { aligned_vwrite(buf, addr, n); copied++; } @@ -2096,49 +2101,50 @@ long vwrite(char *buf, char *addr, unsigned long count) count -= n; } finished: - read_unlock(&vmlist_lock); + spin_unlock(&vmap_area_lock); if (!copied) return 0; return buflen; } /** - * remap_vmalloc_range - map vmalloc pages to userspace - * @vma: vma to cover (map full range of vma) - * @addr: vmalloc memory - * @pgoff: number of pages into addr before first page to map + * remap_vmalloc_range_partial - map vmalloc pages to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc kernel memory + * @size: size of map area * * Returns: 0 for success, -Exxx on failure * - * This function checks that addr is a valid vmalloc'ed area, and - * that it is big enough to cover the vma. Will return failure if - * that criteria isn't met. + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. * * Similar to remap_pfn_range() (see mm/memory.c) */ -int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, - unsigned long pgoff) +int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, + void *kaddr, unsigned long size) { struct vm_struct *area; - unsigned long uaddr = vma->vm_start; - unsigned long usize = vma->vm_end - vma->vm_start; - if ((PAGE_SIZE-1) & (unsigned long)addr) + size = PAGE_ALIGN(size); + + if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) return -EINVAL; - area = find_vm_area(addr); + area = find_vm_area(kaddr); if (!area) return -EINVAL; if (!(area->flags & VM_USERMAP)) return -EINVAL; - if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) + if (kaddr + size > area->addr + area->size) return -EINVAL; - addr += pgoff << PAGE_SHIFT; do { - struct page *page = vmalloc_to_page(addr); + struct page *page = vmalloc_to_page(kaddr); int ret; ret = vm_insert_page(vma, uaddr, page); @@ -2146,22 +2152,44 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, return ret; uaddr += PAGE_SIZE; - addr += PAGE_SIZE; - usize -= PAGE_SIZE; - } while (usize > 0); + kaddr += PAGE_SIZE; + size -= PAGE_SIZE; + } while (size > 0); - /* Prevent "things" like memory migration? VM_flags need a cleanup... 
*/ - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } +EXPORT_SYMBOL(remap_vmalloc_range_partial); + +/** + * remap_vmalloc_range - map vmalloc pages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + return remap_vmalloc_range_partial(vma, vma->vm_start, + addr + (pgoff << PAGE_SHIFT), + vma->vm_end - vma->vm_start); +} EXPORT_SYMBOL(remap_vmalloc_range); /* * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. */ -void __attribute__((weak)) vmalloc_sync_all(void) +void __weak vmalloc_sync_all(void) { } @@ -2375,8 +2403,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); - vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); + vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); + vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); if (!vas || !vms) goto err_free2; @@ -2468,8 +2496,8 @@ found: /* insert all vm's */ for (area = 0; area < nr_vms; area++) - insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, - pcpu_get_vm_areas); + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + pcpu_get_vm_areas); kfree(vas); return vms; @@ -2504,19 +2532,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) - __acquires(&vmlist_lock) + __acquires(&vmap_area_lock) { loff_t n = *pos; - struct vm_struct *v; + struct vmap_area *va; - read_lock(&vmlist_lock); - v = vmlist; - while (n > 0 && v) { + spin_lock(&vmap_area_lock); + va = list_entry((&vmap_area_list)->next, typeof(*va), list); + while (n > 0 && &va->list != &vmap_area_list) { n--; - v = v->next; + va = list_entry(va->list.next, typeof(*va), list); } - if (!n) - return v; + if (!n && &va->list != &vmap_area_list) + return va; return NULL; @@ -2524,26 +2552,35 @@ static void *s_start(struct seq_file *m, loff_t *pos) static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct vm_struct *v = p; + struct vmap_area *va = p, *next; ++*pos; - return v->next; + next = list_entry(va->list.next, typeof(*va), list); + if (&next->list != &vmap_area_list) + return next; + + return NULL; } static void s_stop(struct seq_file *m, void *p) - __releases(&vmlist_lock) + __releases(&vmap_area_lock) { - read_unlock(&vmlist_lock); + spin_unlock(&vmap_area_lock); } static void show_numa_info(struct seq_file *m, struct vm_struct *v) { - if (NUMA_BUILD) { + if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; if (!counters) return; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + if (v->flags & VM_UNINITIALIZED) + return; + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr++) @@ -2557,9 +2594,19 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static int s_show(struct seq_file *m, void *p) { - struct vm_struct *v = p; + struct vmap_area *va = p; + struct vm_struct *v; + + /* + * s_show can encounter race with remove_vm_area, !VM_VM_AREA on + * behalf of 
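remap_vmalloc_range() is now a thin wrapper over the new remap_vmalloc_range_partial(). A hedged sketch of the classic consumer, a driver ->mmap() handler exposing a buffer to userspace; example_buf is hypothetical and must come from vmalloc_user() so that VM_USERMAP is set:

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	return remap_vmalloc_range(vma, example_buf, vma->vm_pgoff);
}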
vmap area is being tear down or vm_map_ram allocation. + */ + if (!(va->flags & VM_VM_AREA)) + return 0; - seq_printf(m, "0x%p-0x%p %7ld", + v = va->vm; + + seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); if (v->caller) @@ -2603,7 +2650,7 @@ static int vmalloc_open(struct inode *inode, struct file *file) unsigned int *ptr = NULL; int ret; - if (NUMA_BUILD) { + if (IS_ENABLED(CONFIG_NUMA)) { ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); if (ptr == NULL) return -ENOMEM; @@ -2630,5 +2677,53 @@ static int __init proc_vmalloc_init(void) return 0; } module_init(proc_vmalloc_init); + +void get_vmalloc_info(struct vmalloc_info *vmi) +{ + struct vmap_area *va; + unsigned long free_area_size; + unsigned long prev_end; + + vmi->used = 0; + vmi->largest_chunk = 0; + + prev_end = VMALLOC_START; + + spin_lock(&vmap_area_lock); + + if (list_empty(&vmap_area_list)) { + vmi->largest_chunk = VMALLOC_TOTAL; + goto out; + } + + list_for_each_entry(va, &vmap_area_list, list) { + unsigned long addr = va->va_start; + + /* + * Some archs keep another range for modules in vmalloc space + */ + if (addr < VMALLOC_START) + continue; + if (addr >= VMALLOC_END) + break; + + if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) + continue; + + vmi->used += (va->va_end - va->va_start); + + free_area_size = addr - prev_end; + if (vmi->largest_chunk < free_area_size) + vmi->largest_chunk = free_area_size; + + prev_end = va->va_end; + } + + if (VMALLOC_END - prev_end > vmi->largest_chunk) + vmi->largest_chunk = VMALLOC_END - prev_end; + +out: + spin_unlock(&vmap_area_lock); +} #endif diff --git a/mm/vmpressure.c b/mm/vmpressure.c new file mode 100644 index 000000000000..d4042e75f7c7 --- /dev/null +++ b/mm/vmpressure.c @@ -0,0 +1,380 @@ +/* + * Linux VM pressure + * + * Copyright 2012 Linaro Ltd. + * Anton Vorontsov <anton.vorontsov@linaro.org> + * + * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, + * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/log2.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmstat.h> +#include <linux/eventfd.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/printk.h> +#include <linux/vmpressure.h> + +/* + * The window size (vmpressure_win) is the number of scanned pages before + * we try to analyze scanned/reclaimed ratio. So the window is used as a + * rate-limit tunable for the "low" level notification, and also for + * averaging the ratio for medium/critical levels. Using small window + * sizes can cause lot of false positives, but too big window size will + * delay the notifications. + * + * As the vmscan reclaimer logic works with chunks which are multiple of + * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. + * + * TODO: Make the window size depend on machine size, as we do for vmstat + * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). + */ +static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; + +/* + * These thresholds are used when we account memory pressure through + * scanned/reclaimed ratio. The current values were chosen empirically. In + * essence, they are percents: the higher the value, the more number + * unsuccessful reclaims there were. 
+ */ +static const unsigned int vmpressure_level_med = 60; +static const unsigned int vmpressure_level_critical = 95; + +/* + * When there are too few pages left to scan, vmpressure() may miss the + * critical pressure as the number of pages will be less than "window size". + * However, in that case the vmscan priority will rise fast as the + * reclaimer will try to scan LRUs more deeply. + * + * The vmscan logic considers these special priorities: + * + * prio == DEF_PRIORITY (12): reclaimer starts with that value + * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed + * prio == 0 : close to OOM, kernel scans every page in an lru + * + * Any value in this range is acceptable for this tunable (i.e. from 12 to + * 0). The current value of vmpressure_level_critical_prio was chosen + * empirically, but the number, in essence, means that we consider + * critical level when scanning depth is ~10% of the lru size (vmscan + * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one + * eighth). + */ +static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); + +static struct vmpressure *work_to_vmpressure(struct work_struct *work) +{ + return container_of(work, struct vmpressure, work); +} + +static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) +{ + struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + memcg = parent_mem_cgroup(memcg); + if (!memcg) + return NULL; + return memcg_to_vmpressure(memcg); +} + +enum vmpressure_levels { + VMPRESSURE_LOW = 0, + VMPRESSURE_MEDIUM, + VMPRESSURE_CRITICAL, + VMPRESSURE_NUM_LEVELS, +}; + +static const char * const vmpressure_str_levels[] = { + [VMPRESSURE_LOW] = "low", + [VMPRESSURE_MEDIUM] = "medium", + [VMPRESSURE_CRITICAL] = "critical", +}; + +static enum vmpressure_levels vmpressure_level(unsigned long pressure) +{ + if (pressure >= vmpressure_level_critical) + return VMPRESSURE_CRITICAL; + else if (pressure >= vmpressure_level_med) + return VMPRESSURE_MEDIUM; + return VMPRESSURE_LOW; +} + +static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, + unsigned long reclaimed) +{ + unsigned long scale = scanned + reclaimed; + unsigned long pressure; + + /* + * We calculate the ratio (in percent) of how many pages were + * scanned vs. reclaimed in a given time frame (window). Note that + * time is in VM reclaimer's "ticks", i.e. number of pages + * scanned. This makes it possible to set the desired reaction time + * and serves as a ratelimit. 
+ */ + pressure = scale - (reclaimed * scale / scanned); + pressure = pressure * 100 / scale; + + pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, + scanned, reclaimed); + + return vmpressure_level(pressure); +} + +struct vmpressure_event { + struct eventfd_ctx *efd; + enum vmpressure_levels level; + struct list_head node; +}; + +static bool vmpressure_event(struct vmpressure *vmpr, + unsigned long scanned, unsigned long reclaimed) +{ + struct vmpressure_event *ev; + enum vmpressure_levels level; + bool signalled = false; + + level = vmpressure_calc_level(scanned, reclaimed); + + mutex_lock(&vmpr->events_lock); + + list_for_each_entry(ev, &vmpr->events, node) { + if (level >= ev->level) { + eventfd_signal(ev->efd, 1); + signalled = true; + } + } + + mutex_unlock(&vmpr->events_lock); + + return signalled; +} + +static void vmpressure_work_fn(struct work_struct *work) +{ + struct vmpressure *vmpr = work_to_vmpressure(work); + unsigned long scanned; + unsigned long reclaimed; + + /* + * Several contexts might be calling vmpressure(), so it is + * possible that the work was rescheduled again before the old + * work context cleared the counters. In that case we will run + * just after the old work returns, but then scanned might be zero + * here. No need for any locks here since we don't care if + * vmpr->reclaimed is in sync. + */ + if (!vmpr->scanned) + return; + + spin_lock(&vmpr->sr_lock); + scanned = vmpr->scanned; + reclaimed = vmpr->reclaimed; + vmpr->scanned = 0; + vmpr->reclaimed = 0; + spin_unlock(&vmpr->sr_lock); + + do { + if (vmpressure_event(vmpr, scanned, reclaimed)) + break; + /* + * If not handled, propagate the event upward into the + * hierarchy. + */ + } while ((vmpr = vmpressure_parent(vmpr))); +} + +/** + * vmpressure() - Account memory pressure through scanned/reclaimed ratio + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @scanned: number of pages scanned + * @reclaimed: number of pages reclaimed + * + * This function should be called from the vmscan reclaim path to account + * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw + * pressure index is then further refined and averaged over time. + * + * This function does not return any value. + */ +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, + unsigned long scanned, unsigned long reclaimed) +{ + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + + /* + * Here we only want to account pressure that userland is able to + * help us with. For example, suppose that DMA zone is under + * pressure; if we notify userland about that kind of pressure, + * then it will be mostly a waste as it will trigger unnecessary + * freeing of memory by userland (since userland is more likely to + * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That + * is why we include only movable, highmem and FS/IO pages. + * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so + * we account it too. + */ + if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) + return; + + /* + * If we got here with no pages scanned, then that is an indicator + * that reclaimer was unable to find any shrinkable LRUs at the + * current scanning depth. But it does not mean that we should + * report the critical pressure, yet. If the scanning priority + * (scanning depth) goes too high (deep), we will be notified + * through vmpressure_prio(). But so far, keep calm. 
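Plugging numbers into the ratio above makes the bands concrete (integer arithmetic; the values are illustrative). With scanned = 120 and reclaimed = 30:

	scale    = 120 + 30 = 150
	pressure = 150 - (30 * 150 / 120) = 150 - 37 = 113
	pressure = 113 * 100 / 150 = 75		/* >= 60: VMPRESSURE_MEDIUM */

Only a quarter of the scanned pages were reclaimed, which lands in the medium band; reclaiming nothing at all would score 100 and report critical.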
+ */ + if (!scanned) + return; + + spin_lock(&vmpr->sr_lock); + vmpr->scanned += scanned; + vmpr->reclaimed += reclaimed; + scanned = vmpr->scanned; + spin_unlock(&vmpr->sr_lock); + + if (scanned < vmpressure_win) + return; + schedule_work(&vmpr->work); +} + +/** + * vmpressure_prio() - Account memory pressure through reclaimer priority level + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @prio: reclaimer's priority + * + * This function should be called from the reclaim path every time when + * the vmscan's reclaiming priority (scanning depth) changes. + * + * This function does not return any value. + */ +void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) +{ + /* + * We only use prio for accounting critical level. For more info + * see comment for vmpressure_level_critical_prio variable above. + */ + if (prio > vmpressure_level_critical_prio) + return; + + /* + * OK, the prio is below the threshold, updating vmpressure + * information before shrinker dives into long shrinking of long + * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 + * to the vmpressure() basically means that we signal 'critical' + * level. + */ + vmpressure(gfp, memcg, vmpressure_win, 0); +} + +/** + * vmpressure_register_event() - Bind vmpressure notifications to an eventfd + * @memcg: memcg that is interested in vmpressure notifications + * @eventfd: eventfd context to link notifications with + * @args: event arguments (used to set up a pressure level threshold) + * + * This function associates eventfd context with the vmpressure + * infrastructure, so that the notifications will be delivered to the + * @eventfd. The @args parameter is a string that denotes pressure level + * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or + * "critical"). + * + * To be used as memcg event method. + */ +int vmpressure_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + struct vmpressure_event *ev; + int level; + + for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) { + if (!strcmp(vmpressure_str_levels[level], args)) + break; + } + + if (level >= VMPRESSURE_NUM_LEVELS) + return -EINVAL; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; + + ev->efd = eventfd; + ev->level = level; + + mutex_lock(&vmpr->events_lock); + list_add(&ev->node, &vmpr->events); + mutex_unlock(&vmpr->events_lock); + + return 0; +} + +/** + * vmpressure_unregister_event() - Unbind eventfd from vmpressure + * @memcg: memcg handle + * @eventfd: eventfd context that was used to link vmpressure with the @cg + * + * This function does internal manipulations to detach the @eventfd from + * the vmpressure notifications, and then frees internal resources + * associated with the @eventfd (but the @eventfd itself is not freed). + * + * To be used as memcg event method. + */ +void vmpressure_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + struct vmpressure_event *ev; + + mutex_lock(&vmpr->events_lock); + list_for_each_entry(ev, &vmpr->events, node) { + if (ev->efd != eventfd) + continue; + list_del(&ev->node); + kfree(ev); + break; + } + mutex_unlock(&vmpr->events_lock); +} + +/** + * vmpressure_init() - Initialize vmpressure control structure + * @vmpr: Structure to be initialized + * + * This function should be called on every allocated vmpressure structure + * before any usage. 
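vmpressure_register_event() above is wired in as a memcg event method; userspace binds an eventfd through the cgroup-v1 event_control interface. A hedged userspace sketch (headers and error handling elided; paths assume the memory controller is mounted at /sys/fs/cgroup/memory):

	int efd = eventfd(0, 0);
	int pfd = open("/sys/fs/cgroup/memory/memory.pressure_level", O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/cgroup.event_control", O_WRONLY);
	char line[32];
	uint64_t cnt;

	snprintf(line, sizeof(line), "%d %d low", efd, pfd);
	write(cfd, line, strlen(line));	/* registers the "low" listener */
	read(efd, &cnt, sizeof(cnt));	/* blocks until pressure is signalled */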
+ */ +void vmpressure_init(struct vmpressure *vmpr) +{ + spin_lock_init(&vmpr->sr_lock); + mutex_init(&vmpr->events_lock); + INIT_LIST_HEAD(&vmpr->events); + INIT_WORK(&vmpr->work, vmpressure_work_fn); +} + +/** + * vmpressure_cleanup() - shuts down vmpressure control structure + * @vmpr: Structure to be cleaned up + * + * This function should be called before the structure in which it is + * embedded is cleaned up. + */ +void vmpressure_cleanup(struct vmpressure *vmpr) +{ + /* + * Make sure there is no pending work before eventfd infrastructure + * goes away. + */ + flush_work(&vmpr->work); +} diff --git a/mm/vmscan.c b/mm/vmscan.c index 33dc256033b5..32c661d66a45 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -19,6 +19,7 @@ #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> +#include <linux/vmpressure.h> #include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> @@ -47,30 +48,13 @@ #include <asm/div64.h> #include <linux/swapops.h> +#include <linux/balloon_compaction.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> -/* - * reclaim_mode determines how the inactive list is shrunk - * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages - * RECLAIM_MODE_ASYNC: Do not block - * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback - * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference - * page from the LRU and reclaim all pages within a - * naturally aligned range - * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of - * order-0 pages and then compact the zone - */ -typedef unsigned __bitwise__ reclaim_mode_t; -#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) -#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) -#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) -#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) -#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) - struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -96,11 +80,8 @@ struct scan_control { int order; - /* - * Intend to reclaim enough continuous memory rather than reclaim - * enough amount of memory. i.e, mode for high order allocation. - */ - reclaim_mode_t reclaim_mode; + /* Scan (total_size >> priority) pages at once */ + int priority; /* * The memory cgroup that hit its limit and as a result is the @@ -115,11 +96,6 @@ struct scan_control { nodemask_t *nodemask; }; -struct mem_cgroup_zone { - struct mem_cgroup *mem_cgroup; - struct zone *zone; -}; - #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) #ifdef ARCH_HAS_PREFETCH @@ -154,63 +130,76 @@ struct mem_cgroup_zone { * From 0 .. 100. Higher means more swappy. 
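The new sc->priority field above is the single knob that replaces reclaim_mode for scan depth: each pass scans lru_size >> priority pages. A quick feel for the magnitudes (hedged arithmetic; DEF_PRIORITY is 12):

	priority == 12:	lru_size >> 12	/* 1/4096 of the LRU, ~0.02% */
	priority ==  4:	lru_size >> 4	/* 1/16 of the LRU, 6.25% */
	priority ==  0:	lru_size >> 0	/* the whole LRU, near-OOM */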
*/ int vm_swappiness = 60; -long vm_total_pages; /* The total number of pages which the VM controls */ +unsigned long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } - -static bool scanning_global_lru(struct mem_cgroup_zone *mz) -{ - return !mz->mem_cgroup; -} #else static bool global_reclaim(struct scan_control *sc) { return true; } +#endif -static bool scanning_global_lru(struct mem_cgroup_zone *mz) +static unsigned long zone_reclaimable_pages(struct zone *zone) { - return true; + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; } -#endif -static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) +bool zone_reclaimable(struct zone *zone) { - if (!scanning_global_lru(mz)) - return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); - - return &mz->zone->reclaim_stat; + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; } -static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, - enum lru_list lru) +static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) { - if (!scanning_global_lru(mz)) - return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, - zone_to_nid(mz->zone), - zone_idx(mz->zone), - BIT(lru)); + if (!mem_cgroup_disabled()) + return mem_cgroup_get_lru_size(lruvec, lru); - return zone_page_state(mz->zone, NR_LRU_BASE + lru); + return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); } - /* - * Add a shrinker callback to be called from the vm + * Add a shrinker callback to be called from the vm. */ -void register_shrinker(struct shrinker *shrinker) +int register_shrinker(struct shrinker *shrinker) { - atomic_long_set(&shrinker->nr_in_batch, 0); + size_t size = sizeof(*shrinker->nr_deferred); + + /* + * If we only have one possible node in the system anyway, save + * ourselves the trouble and disable NUMA aware behavior. This way we + * will save memory and some small loop time later. + */ + if (nr_node_ids == 1) + shrinker->flags &= ~SHRINKER_NUMA_AWARE; + + if (shrinker->flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + return -ENOMEM; + down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); + return 0; } EXPORT_SYMBOL(register_shrinker); @@ -222,18 +211,123 @@ void unregister_shrinker(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); } EXPORT_SYMBOL(unregister_shrinker); -static inline int do_shrinker_shrink(struct shrinker *shrinker, - struct shrink_control *sc, - unsigned long nr_to_scan) -{ - sc->nr_to_scan = nr_to_scan; - return (*shrinker->shrink)(shrinker, sc); +#define SHRINK_BATCH 128 + +static unsigned long +shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, + unsigned long nr_pages_scanned, unsigned long lru_pages) +{ + unsigned long freed = 0; + unsigned long long delta; + long total_scan; + long freeable; + long nr; + long new_nr; + int nid = shrinkctl->nid; + long batch_size = shrinker->batch ? 
shrinker->batch + : SHRINK_BATCH; + + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) + return 0; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + + total_scan = nr; + delta = (4 * nr_pages_scanned) / shrinker->seeks; + delta *= freeable; + do_div(delta, lru_pages + 1); + total_scan += delta; + if (total_scan < 0) { + printk(KERN_ERR + "shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->scan_objects, total_scan); + total_scan = freeable; + } + + /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * freeable. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < freeable / 4) + total_scan = min(total_scan, freeable / 2); + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (total_scan > freeable * 2) + total_scan = freeable * 2; + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + nr_pages_scanned, lru_pages, + freeable, delta, total_scan); + + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || + total_scan >= freeable) { + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + + shrinkctl->nr_to_scan = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + + count_vm_events(SLABS_SCANNED, nr_to_scan); + total_scan -= nr_to_scan; + + cond_resched(); + } + + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_deferred[nid]); + else + new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + + trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); + return freed; } -#define SHRINK_BATCH 128 /* * Call the shrink functions to age shrinkable caches * @@ -253,148 +347,46 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker, * * Returns the number of slab objects which we shrunk. 
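shrink_slab_node() above drives the reworked shrinker API: separate ->count_objects()/->scan_objects() hooks replace the old ->shrink() callback, SHRINK_STOP replaces the old -1, and register_shrinker() can now fail because it allocates the per-node nr_deferred array. A hedged sketch of a minimal client; the cache and its helpers are hypothetical:

static unsigned long example_count(struct shrinker *s,
				   struct shrink_control *sc)
{
	return example_cache_size();	/* 0 means nothing to do */
}

static unsigned long example_scan(struct shrinker *s,
				  struct shrink_control *sc)
{
	if (!example_can_reclaim())	/* e.g. lock contention */
		return SHRINK_STOP;
	return example_evict(sc->nr_to_scan);	/* objects actually freed */
}

static struct shrinker example_shrinker = {
	.count_objects	= example_count,
	.scan_objects	= example_scan,
	.seeks		= DEFAULT_SEEKS,
};

At init time the client must now check the return value: if (register_shrinker(&example_shrinker)) fails, the per-node state could not be allocated and the shrinker is not registered.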
*/ -unsigned long shrink_slab(struct shrink_control *shrink, +unsigned long shrink_slab(struct shrink_control *shrinkctl, unsigned long nr_pages_scanned, unsigned long lru_pages) { struct shrinker *shrinker; - unsigned long ret = 0; + unsigned long freed = 0; if (nr_pages_scanned == 0) nr_pages_scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) { - /* Assume we'll be able to shrink next time */ - ret = 1; + /* + * If we would return 0, our callers would understand that we + * have nothing else to shrink and give up trying. By returning + * 1 we keep it going and assume we'll be able to shrink next + * time. + */ + freed = 1; goto out; } list_for_each_entry(shrinker, &shrinker_list, list) { - unsigned long long delta; - long total_scan; - long max_pass; - int shrink_ret = 0; - long nr; - long new_nr; - long batch_size = shrinker->batch ? shrinker->batch - : SHRINK_BATCH; - - max_pass = do_shrinker_shrink(shrinker, shrink, 0); - if (max_pass <= 0) + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { + shrinkctl->nid = 0; + freed += shrink_slab_node(shrinkctl, shrinker, + nr_pages_scanned, lru_pages); continue; - - /* - * copy the current shrinker scan count into a local variable - * and zero it so that other concurrent shrinker invocations - * don't also do this scanning work. - */ - nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); - - total_scan = nr; - delta = (4 * nr_pages_scanned) / shrinker->seeks; - delta *= max_pass; - do_div(delta, lru_pages + 1); - total_scan += delta; - if (total_scan < 0) { - printk(KERN_ERR "shrink_slab: %pF negative objects to " - "delete nr=%ld\n", - shrinker->shrink, total_scan); - total_scan = max_pass; } - /* - * We need to avoid excessive windup on filesystem shrinkers - * due to large numbers of GFP_NOFS allocations causing the - * shrinkers to return -1 all the time. This results in a large - * nr being built up so when a shrink that can do some work - * comes along it empties the entire cache due to nr >>> - * max_pass. This is bad for sustaining a working set in - * memory. - * - * Hence only allow the shrinker to scan the entire cache when - * a large delta change is calculated directly. - */ - if (delta < max_pass / 4) - total_scan = min(total_scan, max_pass / 2); - - /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. - */ - if (total_scan > max_pass * 2) - total_scan = max_pass * 2; - - trace_mm_shrink_slab_start(shrinker, shrink, nr, - nr_pages_scanned, lru_pages, - max_pass, delta, total_scan); - - while (total_scan >= batch_size) { - int nr_before; - - nr_before = do_shrinker_shrink(shrinker, shrink, 0); - shrink_ret = do_shrinker_shrink(shrinker, shrink, - batch_size); - if (shrink_ret == -1) - break; - if (shrink_ret < nr_before) - ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, batch_size); - total_scan -= batch_size; + for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { + if (node_online(shrinkctl->nid)) + freed += shrink_slab_node(shrinkctl, shrinker, + nr_pages_scanned, lru_pages); - cond_resched(); } - - /* - * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. If we exhausted the - * scan, there is no need to do an update. 
- */ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, - &shrinker->nr_in_batch); - else - new_nr = atomic_long_read(&shrinker->nr_in_batch); - - trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } up_read(&shrinker_rwsem); out: cond_resched(); - return ret; -} - -static void set_reclaim_mode(int priority, struct scan_control *sc, - bool sync) -{ - reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; - - /* - * Initially assume we are entering either lumpy reclaim or - * reclaim/compaction.Depending on the order, we will either set the - * sync mode or just reclaim order-0 pages later. - */ - if (COMPACTION_BUILD) - sc->reclaim_mode = RECLAIM_MODE_COMPACTION; - else - sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; - - /* - * Avoid using lumpy reclaim or reclaim/compaction if possible by - * restricting when its set to either costly allocations or when - * under memory pressure - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->reclaim_mode |= syncmode; - else if (sc->order && priority < DEF_PRIORITY - 2) - sc->reclaim_mode |= syncmode; - else - sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; -} - -static void reset_reclaim_mode(struct scan_control *sc) -{ - sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; + return freed; } static inline int is_page_cache_freeable(struct page *page) @@ -416,10 +408,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi, return 1; if (bdi == current->backing_dev_info) return 1; - - /* lumpy reclaim for hugepage often need a lot of write */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - return 1; return 0; } @@ -523,8 +511,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } - trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sc->reclaim_mode)); + trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -536,7 +523,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Same as remove_mapping, but if the page is removed from the mapping, it * gets returned with a refcount of 0. */ -static int __remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page, + bool reclaimed) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -582,10 +570,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swapcache_free(swap, page); } else { void (*freepage)(struct page *); + void *shadow = NULL; freepage = mapping->a_ops->freepage; - - __delete_from_page_cache(page); + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optimization, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back.
+ */ + if (reclaimed && page_is_file_cache(page) && + !mapping_exiting(mapping)) + shadow = workingset_eviction(mapping, page); + __delete_from_page_cache(page, shadow); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -608,7 +609,7 @@ cannot_free: */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page)) { + if (__remove_mapping(mapping, page, false)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -631,30 +632,29 @@ int remove_mapping(struct address_space *mapping, struct page *page) */ void putback_lru_page(struct page *page) { - int lru; - int active = !!TestClearPageActive(page); + bool is_unevictable; int was_unevictable = PageUnevictable(page); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); redo: ClearPageUnevictable(page); - if (page_evictable(page, NULL)) { + if (page_evictable(page)) { /* * For evictable pages, we can use the cache. * In the event of a race, worst case is we end up with an * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_lru_base_type(page); - lru_cache_add_lru(page, lru); + is_unevictable = false; + lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable * list. */ - lru = LRU_UNEVICTABLE; + is_unevictable = true; add_page_to_unevictable_list(page); /* * When racing with an mlock or AS_UNEVICTABLE clearing @@ -674,7 +674,7 @@ redo: * page is on unevictable list, it will never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (is_unevictable && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; } @@ -685,9 +685,9 @@ redo: */ } - if (was_unevictable && lru != LRU_UNEVICTABLE) + if (was_unevictable && !is_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); - else if (!was_unevictable && lru == LRU_UNEVICTABLE) + else if (!was_unevictable && is_unevictable) count_vm_event(UNEVICTABLE_PGCULLED); put_page(page); /* drop ref from isolate */ @@ -701,19 +701,15 @@ enum page_references { }; static enum page_references page_check_references(struct page *page, - struct mem_cgroup_zone *mz, struct scan_control *sc) { int referenced_ptes, referenced_page; unsigned long vm_flags; - referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); + referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, + &vm_flags); referenced_page = TestClearPageReferenced(page); - /* Lumpy reclaim - ignore references */ - if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) - return PAGEREF_RECLAIM; - /* * Mlock lost the isolation race with us. Let try_to_unmap() * move the page to the unevictable list. @@ -722,7 +718,7 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; if (referenced_ptes) { - if (PageAnon(page)) + if (PageSwapBacked(page)) return PAGEREF_ACTIVATE; /* * All mapped pages start out with page table @@ -759,31 +755,68 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; } +/* Check if a page is dirty or under writeback */ +static void page_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct address_space *mapping; + + /* + * Anonymous pages are not handled by flushers and must be written + * from reclaim context.
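A toy model of the shadow entries planted by workingset_eviction() above, assuming a fixed-size array in place of the mapping's radix tree and a single global eviction clock (both simplifications):

    #include <stdio.h>

    /* Toy shadow entries: on eviction a slot keeps a negated snapshot of a
     * global eviction clock instead of going empty; on refault, clock minus
     * snapshot approximates how many evictions happened in between (the
     * refault distance used by mm/workingset.c). The fixed array stands in
     * for the mapping's radix tree. */
    #define SLOTS 8

    static unsigned long evictions;        /* global eviction clock */
    static long slot[SLOTS];               /* > 0: resident, <= 0: shadow */

    static void evict(int i)
    {
            slot[i] = -(long)++evictions;  /* plant the shadow entry */
    }

    static void refault(int i)
    {
            unsigned long distance = evictions - (unsigned long)-slot[i];

            printf("slot %d refaults at distance %lu\n", i, distance);
            slot[i] = 1;                   /* resident again */
    }

    int main(void)
    {
            evict(0);
            evict(1);
            evict(2);
            refault(0);                    /* two evictions in between -> 2 */
            return 0;
    }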
Do not stall reclaim based on them + */ + if (!page_is_file_cache(page)) { + *dirty = false; + *writeback = false; + return; + } + + /* By default assume that the page flags are accurate */ + *dirty = PageDirty(page); + *writeback = PageWriteback(page); + + /* Verify dirty/writeback state if the filesystem supports it */ + if (!page_has_private(page)) + return; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops->is_dirty_writeback) + mapping->a_ops->is_dirty_writeback(page, dirty, writeback); +} + /* * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct mem_cgroup_zone *mz, + struct zone *zone, struct scan_control *sc, - int priority, + enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, - unsigned long *ret_nr_writeback) + unsigned long *ret_nr_unqueued_dirty, + unsigned long *ret_nr_congested, + unsigned long *ret_nr_writeback, + unsigned long *ret_nr_immediate, + bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; cond_resched(); + mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { - enum page_references references; struct address_space *mapping; struct page *page; int may_enter_fs; + enum page_references references = PAGEREF_RECLAIM_CLEAN; + bool dirty, writeback; cond_resched(); @@ -793,12 +826,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!trylock_page(page)) goto keep; - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(page_zone(page) != mz->zone); + VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_PAGE(page_zone(page) != zone, page); sc->nr_scanned++; - if (unlikely(!page_evictable(page, NULL))) + if (unlikely(!page_evictable(page))) goto cull_mlocked; if (!sc->may_unmap && page_mapped(page)) @@ -811,24 +844,103 @@ static unsigned long shrink_page_list(struct list_head *page_list, may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + /* + * The number of dirty pages determines if a zone is marked + * reclaim_congested which affects wait_iff_congested. kswapd + * will stall and start writing pages if the tail of the LRU + * is all dirty unqueued pages. + */ + page_check_dirty_writeback(page, &dirty, &writeback); + if (dirty || writeback) + nr_dirty++; + + if (dirty && !writeback) + nr_unqueued_dirty++; + + /* + * Treat this page as congested if the underlying BDI is or if + * pages are cycling through the LRU so quickly that the + * pages marked for immediate reclaim are making it to the + * end of the LRU a second time. + */ + mapping = page_mapping(page); + if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || + (writeback && PageReclaim(page))) + nr_congested++; + + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. 
Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * note that the LRU is being scanned too quickly and the + * caller can stall after page list has been processed. + * + * 2) Global reclaim encounters a page, memcg encounters a + * page that is not marked for immediate reclaim or + * the caller does not have __GFP_IO. In this case mark + * the page for immediate reclaim and continue scanning. + * + * __GFP_IO is checked because a loop driver thread might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. + * + * Don't require __GFP_FS, since we're not going into the + * FS, just waiting on its writeback completion. Worryingly, + * ext4, gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing + * may_enter_fs here is liable to OOM on them. + * + * 3) memcg encounters a page that is not already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ if (PageWriteback(page)) { - nr_writeback++; - /* - * Synchronous reclaim cannot queue pages for - * writeback due to the possibility of stack overflow - * but if it encounters a page under writeback, wait - * for the IO to complete. - */ - if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && - may_enter_fs) + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + zone_is_reclaim_writeback(zone)) { + nr_immediate++; + goto keep_locked; + + /* Case 2 above */ + } else if (global_reclaim(sc) || + !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { + /* + * This is slightly racy - end_page_writeback() + * might have just cleared PageReclaim, then + * setting PageReclaim here ends up interpreted + * as PageReadahead - but that does not matter + * enough to care. What we do want is for this + * page to have PageReclaim set next time memcg + * reclaim reaches the tests above, so it will + * then wait_on_page_writeback() to avoid OOM; + * and it's also appropriate in global reclaim. + */ + SetPageReclaim(page); + nr_writeback++; + + goto keep_locked; + + /* Case 3 above */ + } else { wait_on_page_writeback(page); - else { - unlock_page(page); - goto keep_lumpy; } } - references = page_check_references(page, mz, sc); + if (!force_reclaim) + references = page_check_references(page, sc); + switch (references) { case PAGEREF_ACTIVATE: goto activate_locked; @@ -846,19 +958,20 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageAnon(page) && !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (!add_to_swap(page)) + if (!add_to_swap(page, page_list)) goto activate_locked; may_enter_fs = 1; - } - mapping = page_mapping(page); + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here.
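The three cases above amount to a decision table. A sketch that condenses them into one function over boolean inputs (the kernel derives these from page flags, zone flags and scan_control; the harness is hypothetical):

    #include <stdio.h>
    #include <stdbool.h>

    /* One return value per case in the comment above. */
    enum wb_action {
            WB_STALL_LATER,         /* case 1: count nr_immediate, keep page */
            WB_MARK_RECLAIM,        /* case 2: SetPageReclaim, count nr_writeback */
            WB_WAIT,                /* case 3: wait_on_page_writeback */
    };

    static enum wb_action writeback_action(bool is_kswapd, bool page_reclaim,
                                           bool zone_flagged_writeback,
                                           bool global_reclaim, bool may_io)
    {
            if (is_kswapd && page_reclaim && zone_flagged_writeback)
                    return WB_STALL_LATER;
            if (global_reclaim || !page_reclaim || !may_io)
                    return WB_MARK_RECLAIM;
            return WB_WAIT;
    }

    int main(void)
    {
            /* kswapd sees a PageReclaim page in a ZONE_WRITEBACK zone: case 1 */
            printf("%d\n", writeback_action(true, true, true, true, true));
            /* memcg reclaim, page not yet marked: case 2 */
            printf("%d\n", writeback_action(false, false, false, false, true));
            /* memcg reclaim, marked page, __GFP_IO allowed: case 3 */
            printf("%d\n", writeback_action(false, true, false, false, true));
            return 0;
    }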
*/ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, TTU_UNMAP)) { + switch (try_to_unmap(page, ttu_flags)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -871,15 +984,14 @@ } if (PageDirty(page)) { - nr_dirty++; - /* * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but do not writeback - * unless under significant pressure. + * avoid risk of stack overflow but only writeback + * if many dirty pages have been encountered. */ if (page_is_file_cache(page) && - (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { + (!current_is_kswapd() || + !zone_is_reclaim_dirty(zone))) { /* * Immediately reclaim when written back. * Similar in principle to deactivate_page() @@ -902,13 +1014,12 @@ /* Page is dirty, try to write it out here */ switch (pageout(page, mapping, sc)) { case PAGE_KEEP: - nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: if (PageWriteback(page)) - goto keep_lumpy; + goto keep; if (PageDirty(page)) goto keep; @@ -968,7 +1079,7 @@ } } - if (!mapping || !__remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -994,43 +1105,63 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); - reset_reclaim_mode(sc); continue; activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && vm_swap_full()) try_to_free_swap(page); - VM_BUG_ON(PageActive(page)); + VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); pgactivate++; keep_locked: unlock_page(page); keep: - reset_reclaim_mode(sc); -keep_lumpy: list_add(&page->lru, &ret_pages); - VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); + VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - /* - * Tag a zone as congested if all the dirty pages encountered were
In this case, reclaimers should just - * back off and wait for congestion to clear because further reclaim - * will encounter the same problem - */ - if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) - zone_set_flag(mz->zone, ZONE_CONGESTED); - free_hot_cold_page_list(&free_pages, 1); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); + mem_cgroup_uncharge_end(); *ret_nr_dirty += nr_dirty; + *ret_nr_congested += nr_congested; + *ret_nr_unqueued_dirty += nr_unqueued_dirty; *ret_nr_writeback += nr_writeback; + *ret_nr_immediate += nr_immediate; return nr_reclaimed; } +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_unmap = 1, + }; + unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; + struct page *page, *next; + LIST_HEAD(clean_pages); + + list_for_each_entry_safe(page, next, page_list, lru) { + if (page_is_file_cache(page) && !PageDirty(page) && + !isolated_balloon_page(page)) { + ClearPageActive(page); + list_move(&page->lru, &clean_pages); + } + } + + ret = shrink_page_list(&clean_pages, zone, &sc, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + list_splice(&clean_pages, page_list); + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + return ret; +} + /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being @@ -1041,35 +1172,16 @@ keep_lumpy: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) +int __isolate_lru_page(struct page *page, isolate_mode_t mode) { - bool all_lru_mode; int ret = -EINVAL; /* Only take pages on the LRU. */ if (!PageLRU(page)) return ret; - all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == - (ISOLATE_ACTIVE|ISOLATE_INACTIVE); - - /* - * When checking the active state, we need to be sure we are - * dealing with comparible boolean values. Take the logical not - * of each. - */ - if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) - return ret; - - if (!all_lru_mode && !!page_is_file_cache(page) != file) - return ret; - - /* - * When this function is being called for lumpy reclaim, we - * initially look into all LRU pages, active, inactive and - * unevictable; only give shrink_page_list evictable pages. - */ - if (PageUnevictable(page)) + /* Compaction should not handle unevictable pages but CMA can do so */ + if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) return ret; ret = -EBUSY; @@ -1135,54 +1247,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) * Appropriate locks must be held before calling this function. * * @nr_to_scan: The number of pages to look through on the list. - * @mz: The mem_cgroup_zone to pull pages from. + * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session * @mode: One of the LRU isolation modes - * @active: True [1] if isolating active pages - * @file: True [1] if isolating file [!anon] pages + * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. 
*/ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, - struct mem_cgroup_zone *mz, struct list_head *dst, + struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, - isolate_mode_t mode, int active, int file) + isolate_mode_t mode, enum lru_list lru) { - struct lruvec *lruvec; - struct list_head *src; + struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; - unsigned long nr_lumpy_taken = 0; - unsigned long nr_lumpy_dirty = 0; - unsigned long nr_lumpy_failed = 0; unsigned long scan; - int lru = LRU_BASE; - - lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup); - if (active) - lru += LRU_ACTIVE; - if (file) - lru += LRU_FILE; - src = &lruvec->lists[lru]; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { struct page *page; - unsigned long pfn; - unsigned long end_pfn; - unsigned long page_pfn; - int zone_id; + int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - VM_BUG_ON(!PageLRU(page)); + VM_BUG_ON_PAGE(!PageLRU(page), page); - switch (__isolate_lru_page(page, mode, file)) { + switch (__isolate_lru_page(page, mode)) { case 0: - mem_cgroup_lru_del(page); + nr_pages = hpage_nr_pages(page); + mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); list_move(&page->lru, dst); - nr_taken += hpage_nr_pages(page); + nr_taken += nr_pages; break; case -EBUSY: @@ -1193,93 +1290,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, default: BUG(); } - - if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) - continue; - - /* - * Attempt to take all pages in the order aligned region - * surrounding the tag page. Only take those pages of - * the same active state as that tag page. We may safely - * round the target page pfn down to the requested order - * as the mem_map is guaranteed valid out to MAX_ORDER, - * where that page is in a different zone we will detect - * it from its zone id and abort this block scan. - */ - zone_id = page_zone_id(page); - page_pfn = page_to_pfn(page); - pfn = page_pfn & ~((1 << sc->order) - 1); - end_pfn = pfn + (1 << sc->order); - for (; pfn < end_pfn; pfn++) { - struct page *cursor_page; - - /* The target page is in the block, ignore it. */ - if (unlikely(pfn == page_pfn)) - continue; - - /* Avoid holes within the zone. */ - if (unlikely(!pfn_valid_within(pfn))) - break; - - cursor_page = pfn_to_page(pfn); - - /* Check that we have not crossed a zone boundary. */ - if (unlikely(page_zone_id(cursor_page) != zone_id)) - break; - - /* - * If we don't have enough swap space, reclaiming of - * anon page which don't already have a swap slot is - * pointless. - */ - if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && - !PageSwapCache(cursor_page)) - break; - - if (__isolate_lru_page(cursor_page, mode, file) == 0) { - unsigned int isolated_pages; - - mem_cgroup_lru_del(cursor_page); - list_move(&cursor_page->lru, dst); - isolated_pages = hpage_nr_pages(cursor_page); - nr_taken += isolated_pages; - nr_lumpy_taken += isolated_pages; - if (PageDirty(cursor_page)) - nr_lumpy_dirty += isolated_pages; - scan++; - pfn += isolated_pages - 1; - } else { - /* - * Check if the page is freed already. - * - * We can't use page_count() as that - * requires compound_head and we don't - * have a pin on the page here. If a - * page is tail, we may or may not - * have isolated the head, so assume - * it's not free, it'd be tricky to - * track the head status without a - * page pin. 
- */ - if (!PageTail(cursor_page) && - !atomic_read(&cursor_page->_count)) - continue; - break; - } - } - - /* If we break out of the loop above, lumpy reclaim failed */ - if (pfn < end_pfn) - nr_lumpy_failed++; } *nr_scanned = scan; - - trace_mm_vmscan_lru_isolate(sc->order, - nr_to_scan, scan, - nr_taken, - nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, - mode, file); + trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, + nr_taken, mode, is_file_lru(lru)); return nr_taken; } @@ -1312,19 +1327,20 @@ int isolate_lru_page(struct page *page) { int ret = -EBUSY; - VM_BUG_ON(!page_count(page)); + VM_BUG_ON_PAGE(!page_count(page), page); if (PageLRU(page)) { struct zone *zone = page_zone(page); + struct lruvec *lruvec; spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); if (PageLRU(page)) { int lru = page_lru(page); - ret = 0; get_page(page); ClearPageLRU(page); - - del_page_from_lru_list(zone, page, lru); + del_page_from_lru_list(page, lruvec, lru); + ret = 0; } spin_unlock_irq(&zone->lru_lock); } @@ -1332,7 +1348,11 @@ int isolate_lru_page(struct page *page) } /* - * Are there way too many processes in the direct reclaim path already? + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get rescheduled. When there is a massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will go small and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. */ static int too_many_isolated(struct zone *zone, int file, struct scan_control *sc) @@ -1353,15 +1373,22 @@ static int too_many_isolated(struct zone *zone, int file, isolated = zone_page_state(zone, NR_ISOLATED_ANON); } + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock.
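A stand-alone sketch of the throttle test above (note the inversion: callers whose gfp_mask allows both IO and FS see the reduced inactive target, so they block first and leave headroom for GFP_NOIO/GFP_NOFS callers):

    #include <stdio.h>
    #include <stdbool.h>

    /* Sketch of too_many_isolated(): back off once isolated pages outnumber
     * the inactive pages left. Callers whose gfp_mask allows IO and FS get
     * inactive >>= 3, so they throttle first; GFP_NOIO/GFP_NOFS callers keep
     * the full target and cannot deadlock behind them. */
    static bool too_many_isolated(unsigned long inactive, unsigned long isolated,
                                  bool gfp_allows_io_and_fs)
    {
            if (gfp_allows_io_and_fs)
                    inactive >>= 3;
            return isolated > inactive;
    }

    int main(void)
    {
            /* same counters, different GFP context */
            printf("GFP_KERNEL-like: %d\n", too_many_isolated(1024, 200, true));
            printf("GFP_NOFS-like:   %d\n", too_many_isolated(1024, 200, false));
            return 0;
    }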
+ */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + return isolated > inactive; } static noinline_for_stack void -putback_inactive_pages(struct mem_cgroup_zone *mz, - struct list_head *page_list) +putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) { - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); - struct zone *zone = mz->zone; + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + struct zone *zone = lruvec_zone(lruvec); LIST_HEAD(pages_to_free); /* @@ -1371,17 +1398,21 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, struct page *page = lru_to_page(page_list); int lru; - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { spin_unlock_irq(&zone->lru_lock); putback_lru_page(page); spin_lock_irq(&zone->lru_lock); continue; } + + lruvec = mem_cgroup_page_lruvec(page, zone); + SetPageLRU(page); lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); + add_page_to_lru_list(page, lruvec, lru); + if (is_active_lru(lru)) { int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); @@ -1390,7 +1421,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, if (put_page_testzero(page)) { __ClearPageLRU(page); __ClearPageActive(page); - del_page_from_lru_list(zone, page, lru); + del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); @@ -1407,112 +1438,27 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, list_splice(&pages_to_free, page_list); } -static noinline_for_stack void -update_isolated_counts(struct mem_cgroup_zone *mz, - struct list_head *page_list, - unsigned long *nr_anon, - unsigned long *nr_file) -{ - struct zone *zone = mz->zone; - unsigned int count[NR_LRU_LISTS] = { 0, }; - unsigned long nr_active = 0; - struct page *page; - int lru; - - /* - * Count pages and clear active flags - */ - list_for_each_entry(page, page_list, lru) { - int numpages = hpage_nr_pages(page); - lru = page_lru_base_type(page); - if (PageActive(page)) { - lru += LRU_ACTIVE; - ClearPageActive(page); - nr_active += numpages; - } - count[lru] += numpages; - } - - preempt_disable(); - __count_vm_events(PGDEACTIVATE, nr_active); - - __mod_zone_page_state(zone, NR_ACTIVE_FILE, - -count[LRU_ACTIVE_FILE]); - __mod_zone_page_state(zone, NR_INACTIVE_FILE, - -count[LRU_INACTIVE_FILE]); - __mod_zone_page_state(zone, NR_ACTIVE_ANON, - -count[LRU_ACTIVE_ANON]); - __mod_zone_page_state(zone, NR_INACTIVE_ANON, - -count[LRU_INACTIVE_ANON]); - - *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; - *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - - __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); - preempt_enable(); -} - -/* - * Returns true if a direct reclaim should wait on pages under writeback. - * - * If we are direct reclaiming for contiguous pages and we do not reclaim - * everything in the list, try again and wait for writeback IO to complete. - * This will stall high-order allocations noticeably. Only do that when really - * need to free the pages under high memory pressure. 
- */ -static inline bool should_reclaim_stall(unsigned long nr_taken, - unsigned long nr_freed, - int priority, - struct scan_control *sc) -{ - int lumpy_stall_priority; - - /* kswapd should not stall on sync IO */ - if (current_is_kswapd()) - return false; - - /* Only stall on lumpy reclaim */ - if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) - return false; - - /* If we have reclaimed everything on the isolated list, no stall */ - if (nr_freed == nr_taken) - return false; - - /* - * For high-order allocations, there are two stall thresholds. - * High-cost allocations stall immediately where as lower - * order allocations such as stacks require the scanning - * priority to be much higher before stalling. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - lumpy_stall_priority = DEF_PRIORITY; - else - lumpy_stall_priority = DEF_PRIORITY / 3; - - return priority <= lumpy_stall_priority; -} - /* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ static noinline_for_stack unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, - struct scan_control *sc, int priority, int file) +shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru) { LIST_HEAD(page_list); unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_anon; - unsigned long nr_file; unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_writeback = 0; - isolate_mode_t isolate_mode = ISOLATE_INACTIVE; - struct zone *zone = mz->zone; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + unsigned long nr_immediate = 0; + isolate_mode_t isolate_mode = 0; + int file = is_file_lru(lru); + struct zone *zone = lruvec_zone(lruvec); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1522,10 +1468,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, return SWAP_CLUSTER_MAX; } - set_reclaim_mode(priority, sc, false); - if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) - isolate_mode |= ISOLATE_ACTIVE; - lru_add_drain(); if (!sc->may_unmap) @@ -1535,38 +1477,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, - sc, isolate_mode, 0, file); + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, + &nr_scanned, sc, isolate_mode, lru); + + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + if (global_reclaim(sc)) { zone->pages_scanned += nr_scanned; if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, - nr_scanned); + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else - __count_zone_vm_events(PGSCAN_DIRECT, zone, - nr_scanned); + __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); } spin_unlock_irq(&zone->lru_lock); if (nr_taken == 0) return 0; - update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); - - nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, - &nr_dirty, &nr_writeback); - - /* Check if we should syncronously wait for writeback */ - if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, mz, 
sc, - priority, &nr_dirty, &nr_writeback); - } + nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + &nr_dirty, &nr_unqueued_dirty, &nr_congested, + &nr_writeback, &nr_immediate, + false); spin_lock_irq(&zone->lru_lock); - reclaim_stat->recent_scanned[0] += nr_anon; - reclaim_stat->recent_scanned[1] += nr_file; + reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { if (current_is_kswapd()) @@ -1577,10 +1513,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, nr_reclaimed); } - putback_inactive_pages(mz, &page_list); + putback_inactive_pages(lruvec, &page_list); - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); @@ -1596,27 +1531,58 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, * as there is no guarantee the dirtying process is throttled in the * same way balance_dirty_pages() manages. * - * This scales the number of dirty pages that must be under writeback - * before throttling depending on priority. It is a simple backoff - * function that has the most effect in the range DEF_PRIORITY to - * DEF_PRIORITY-2 which is the priority reclaim is considered to be - * in trouble and reclaim is considered to be in trouble. - * - * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle - * DEF_PRIORITY-1 50% must be PageWriteback - * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble - * ... - * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any - * isolated page is PageWriteback + * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number + * of pages under writeback flagged for immediate reclaim and stall if any + * are encountered in the nr_immediate check below. + */ + if (nr_writeback && nr_writeback == nr_taken) + zone_set_flag(zone, ZONE_WRITEBACK); + + /* + * memcg will stall in page writeback so only consider forcibly + * stalling for global reclaim + */ + if (global_reclaim(sc)) { + /* + * Tag a zone as congested if all the dirty pages scanned were + * backed by a congested BDI and wait_iff_congested will stall. + */ + if (nr_dirty && nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); + + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not keeping up. In this case, flag + * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing + * pages from reclaim context. It will forcibly stall in the + * next check. + */ + if (nr_unqueued_dirty == nr_taken) + zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + + /* + * In addition, if kswapd scans pages marked for + * immediate reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU faster than + * they are written so also forcibly stall. + */ + if (nr_unqueued_dirty == nr_taken || nr_immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Stall direct reclaim for IO completions if underlying BDIs or zone + * is congested. Allow kswapd to continue until it starts encountering + * unqueued dirty pages or cycling through the LRU too quickly.
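The flag-setting and stall decisions above can be summarised as one function over the counters returned by shrink_page_list(). A sketch with a stand-in zone struct (field names echo the zone flags):

    #include <stdio.h>
    #include <stdbool.h>

    /* Stand-in for the zone flags set above; the kernel uses zone_set_flag(). */
    struct zone_state {
            bool writeback;                /* ZONE_WRITEBACK */
            bool congested;                /* ZONE_CONGESTED */
            bool tail_lru_dirty;           /* ZONE_TAIL_LRU_DIRTY */
    };

    /* Returns true when kswapd should stall (congestion_wait in the kernel). */
    static bool note_reclaim_pressure(struct zone_state *z, bool global_reclaim,
                                      unsigned long nr_taken,
                                      unsigned long nr_writeback,
                                      unsigned long nr_dirty,
                                      unsigned long nr_congested,
                                      unsigned long nr_unqueued_dirty,
                                      unsigned long nr_immediate)
    {
            bool stall = false;

            /* the whole batch was under writeback: LRU cycling faster than IO */
            if (nr_writeback && nr_writeback == nr_taken)
                    z->writeback = true;

            if (global_reclaim) {
                    /* every dirty page sat on a congested BDI */
                    if (nr_dirty && nr_dirty == nr_congested)
                            z->congested = true;
                    /* dirty but unqueued: flushers are behind, let kswapd write */
                    if (nr_unqueued_dirty == nr_taken)
                            z->tail_lru_dirty = true;
                    if (nr_unqueued_dirty == nr_taken || nr_immediate)
                            stall = true;
            }
            return stall;
    }

    int main(void)
    {
            struct zone_state z = { false, false, false };
            bool stall = note_reclaim_pressure(&z, true, 32, 0, 32, 32, 32, 0);

            printf("congested=%d tail_lru_dirty=%d stall=%d\n",
                   z.congested, z.tail_lru_dirty, stall);
            return 0;
    }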
*/ - if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) + if (!sc->hibernation_mode && !current_is_kswapd()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), nr_scanned, nr_reclaimed, - priority, - trace_shrink_flags(file, sc->reclaim_mode)); + sc->priority, + trace_shrink_flags(file)); return nr_reclaimed; } @@ -1638,30 +1604,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, * But we had to alter page->flags anyway. */ -static void move_active_pages_to_lru(struct zone *zone, +static void move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *list, struct list_head *pages_to_free, enum lru_list lru) { + struct zone *zone = lruvec_zone(lruvec); unsigned long pgmoved = 0; struct page *page; + int nr_pages; while (!list_empty(list)) { - struct lruvec *lruvec; - page = lru_to_page(list); + lruvec = mem_cgroup_page_lruvec(page, zone); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); - lruvec = mem_cgroup_lru_add_list(zone, page, lru); + nr_pages = hpage_nr_pages(page); + mem_cgroup_update_lru_size(lruvec, lru, nr_pages); list_move(&page->lru, &lruvec->lists[lru]); - pgmoved += hpage_nr_pages(page); + pgmoved += nr_pages; if (put_page_testzero(page)) { __ClearPageLRU(page); __ClearPageActive(page); - del_page_from_lru_list(zone, page, lru); + del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); @@ -1677,9 +1645,9 @@ static void move_active_pages_to_lru(struct zone *zone, } static void shrink_active_list(unsigned long nr_to_scan, - struct mem_cgroup_zone *mz, + struct lruvec *lruvec, struct scan_control *sc, - int priority, int file) + enum lru_list lru) { unsigned long nr_taken; unsigned long nr_scanned; @@ -1688,15 +1656,14 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned long nr_rotated = 0; - isolate_mode_t isolate_mode = ISOLATE_ACTIVE; - struct zone *zone = mz->zone; + isolate_mode_t isolate_mode = 0; + int file = is_file_lru(lru); + struct zone *zone = lruvec_zone(lruvec); lru_add_drain(); - reset_reclaim_mode(sc); - if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; if (!sc->may_writepage) @@ -1704,18 +1671,15 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_lock_irq(&zone->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, - isolate_mode, 1, file); + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, + &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) zone->pages_scanned += nr_scanned; reclaim_stat->recent_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, nr_scanned); - if (file) - __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); - else - __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); spin_unlock_irq(&zone->lru_lock); @@ -1724,7 +1688,7 @@ static void shrink_active_list(unsigned long nr_to_scan, page = lru_to_page(&l_hold); list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { + if (unlikely(!page_evictable(page))) { putback_lru_page(page); continue; } @@ -1737,7 +1701,8 @@ static void 
shrink_active_list(unsigned long nr_to_scan, } } - if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { + if (page_referenced(page, 0, sc->target_mem_cgroup, + &vm_flags)) { nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and @@ -1770,10 +1735,8 @@ static void shrink_active_list(unsigned long nr_to_scan, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(zone, &l_active, &l_hold, - LRU_ACTIVE + file * LRU_FILE); - move_active_pages_to_lru(zone, &l_inactive, &l_hold, - LRU_BASE + file * LRU_FILE); + move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); + move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); @@ -1796,13 +1759,12 @@ static int inactive_anon_is_low_global(struct zone *zone) /** * inactive_anon_is_low - check if anonymous pages need to be deactivated - * @zone: zone to check - * @sc: scan control of this context + * @lruvec: LRU vector to check * * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct mem_cgroup_zone *mz) +static int inactive_anon_is_low(struct lruvec *lruvec) { /* * If we don't have swap space, anonymous page deactivation @@ -1811,32 +1773,21 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz) if (!total_swap_pages) return 0; - if (!scanning_global_lru(mz)) - return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, - mz->zone); + if (!mem_cgroup_disabled()) + return mem_cgroup_inactive_anon_is_low(lruvec); - return inactive_anon_is_low_global(mz->zone); + return inactive_anon_is_low_global(lruvec_zone(lruvec)); } #else -static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) +static inline int inactive_anon_is_low(struct lruvec *lruvec) { return 0; } #endif -static int inactive_file_is_low_global(struct zone *zone) -{ - unsigned long active, inactive; - - active = zone_page_state(zone, NR_ACTIVE_FILE); - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - - return (active > inactive); -} - /** * inactive_file_is_low - check if file pages need to be deactivated - * @mz: memory cgroup and zone to check + * @lruvec: LRU vector to check * * When the system is doing streaming IO, memory pressure here * ensures that active file pages get deactivated, until more @@ -1848,65 +1799,73 @@ static int inactive_file_is_low_global(struct zone *zone) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. 
*/ -static int inactive_file_is_low(struct mem_cgroup_zone *mz) +static int inactive_file_is_low(struct lruvec *lruvec) { - if (!scanning_global_lru(mz)) - return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, - mz->zone); + unsigned long inactive; + unsigned long active; - return inactive_file_is_low_global(mz->zone); + inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); + active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + + return active > inactive; } -static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) +static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) { - if (file) - return inactive_file_is_low(mz); + if (is_file_lru(lru)) + return inactive_file_is_low(lruvec); else - return inactive_anon_is_low(mz); + return inactive_anon_is_low(lruvec); } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct mem_cgroup_zone *mz, - struct scan_control *sc, int priority) + struct lruvec *lruvec, struct scan_control *sc) { - int file = is_file_lru(lru); - if (is_active_lru(lru)) { - if (inactive_list_is_low(mz, file)) - shrink_active_list(nr_to_scan, mz, sc, priority, file); + if (inactive_list_is_low(lruvec, lru)) + shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } - return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } -static int vmscan_swappiness(struct mem_cgroup_zone *mz, - struct scan_control *sc) +static int vmscan_swappiness(struct scan_control *sc) { if (global_reclaim(sc)) return vm_swappiness; - return mem_cgroup_swappiness(mz->mem_cgroup); + return mem_cgroup_swappiness(sc->target_mem_cgroup); } +enum scan_balance { + SCAN_EQUAL, + SCAN_FRACT, + SCAN_ANON, + SCAN_FILE, +}; + /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined * by looking at the fraction of the pages scanned we did rotate back * onto the active list instead of evict. * - * nr[0] = anon pages to scan; nr[1] = file pages to scan + * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan + * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, - unsigned long *nr, int priority) +static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + unsigned long *nr) { - unsigned long anon, file, free; + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + u64 fraction[2]; + u64 denominator = 0; /* gcc */ + struct zone *zone = lruvec_zone(lruvec); unsigned long anon_prio, file_prio; + enum scan_balance scan_balance; + unsigned long anon, file; + bool force_scan = false; unsigned long ap, fp; - struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); - u64 fraction[2], denominator; enum lru_list lru; - int noswap = 0; - bool force_scan = false; /* * If the zone or memcg is small, nr[l] can be 0. This @@ -1918,43 +1877,79 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && mz->zone->all_unreclaimable) + if (current_is_kswapd() && !zone_reclaimable(zone)) force_scan = true; if (!global_reclaim(sc)) force_scan = true; /* If we have no swap space, do not bother scanning anon pages. 
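The file-side check above is a single comparison; a sketch with plain counters standing in for the lruvec sizes:

    #include <stdio.h>
    #include <stdbool.h>

    /* The use-once rule above: deactivate file pages only once the active
     * file list outgrows the inactive one. Plain counters stand in for
     * get_lru_size(lruvec, ...). */
    static bool inactive_file_is_low(unsigned long inactive, unsigned long active)
    {
            return active > inactive;
    }

    int main(void)
    {
            /* streaming IO keeps the inactive list big: no deactivation */
            printf("%d\n", inactive_file_is_low(8192, 1024));
            /* active has outgrown inactive: start deactivating */
            printf("%d\n", inactive_file_is_low(1024, 8192));
            return 0;
    }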
*/ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - noswap = 1; - fraction[0] = 0; - fraction[1] = 1; - denominator = 1; + if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Global reclaim will swap to prevent OOM even with no + * swappiness, but memcg users want to use this knob to + * disable swapping for individual groups completely when + * using the memory controller's swap limit feature would be + * too expensive. + */ + if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally + * (unless the swappiness setting disagrees with swapping). + */ + if (!sc->priority && vmscan_swappiness(sc)) { + scan_balance = SCAN_EQUAL; goto out; } - anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ if (global_reclaim(sc)) { - free = zone_page_state(mz->zone, NR_FREE_PAGES); - /* If we have very few page cache pages, - force-scan anon pages. */ - if (unlikely(file + free <= high_wmark_pages(mz->zone))) { - fraction[0] = 1; - fraction[1] = 0; - denominator = 1; + unsigned long free = zone_page_state(zone, NR_FREE_PAGES); + + if (unlikely(file + free <= high_wmark_pages(zone))) { + scan_balance = SCAN_ANON; goto out; } } /* + * There is enough inactive page cache, do not reclaim + * anything from the anonymous working set right now. + */ + if (!inactive_file_is_low(lruvec)) { + scan_balance = SCAN_FILE; + goto out; + } + + scan_balance = SCAN_FRACT; + + /* * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(mz, sc); - file_prio = 200 - vmscan_swappiness(mz, sc); + anon_prio = vmscan_swappiness(sc); + file_prio = 200 - anon_prio; /* * OK, so we have swap space and a fair amount of page cache @@ -1967,7 +1962,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * * anon in [0], file in [1] */ - spin_lock_irq(&mz->zone->lru_lock); + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; @@ -1983,12 +1978,12 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. 
*/ - ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); + ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); ap /= reclaim_stat->recent_rotated[0] + 1; - fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); + fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&mz->zone->lru_lock); + spin_unlock_irq(&zone->lru_lock); fraction[0] = ap; fraction[1] = fp; @@ -1996,27 +1991,162 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, out: for_each_evictable_lru(lru) { int file = is_file_lru(lru); + unsigned long size; unsigned long scan; - scan = zone_nr_lru_pages(mz, lru); - if (priority || noswap) { - scan >>= priority; - if (!scan && force_scan) - scan = SWAP_CLUSTER_MAX; + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; + + if (!scan && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ scan = div64_u64(scan * fraction[file], denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); } nr[lru] = scan; } } /* - * Reclaim/compaction depends on a number of pages being freed. To avoid - * disruption to the system, a small number of order-0 pages continue to be - * rotated and reclaimed in the normal fashion. However, by the time we get - * back to the allocator and call try_to_compact_zone(), we ensure that - * there are enough free pages for it to be likely successful + * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct blk_plug plug; + bool scan_adjusted = false; + + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + + blk_start_plug(&plug); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + + for_each_evictable_lru(lru) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + + nr_reclaimed += shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + + /* + * For global direct reclaim, reclaim only the number of pages + * requested. Less care is taken to scan proportionally as it + * is more important to minimise direct reclaim stall latency + * than it is to properly age the LRU lists. + */ + if (global_reclaim(sc) && !current_is_kswapd()) + break; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs shrink + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. 
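A stand-alone sketch of the SCAN_FRACT arithmetic above, with illustrative numbers for swappiness and the recent_scanned/recent_rotated statistics:

    #include <stdio.h>

    /* The SCAN_FRACT split: swappiness-weighted pressure divided by how much
     * of each list's recent scan was rotated back, then applied to the base
     * target size >> priority. All numbers below are made up. */
    int main(void)
    {
            unsigned long anon_prio = 60;                 /* swappiness */
            unsigned long file_prio = 200 - anon_prio;
            unsigned long recent_scanned[2] = { 5000, 20000 };  /* [0] anon, [1] file */
            unsigned long recent_rotated[2] = { 4000, 2000 };
            unsigned long ap, fp, denominator;
            unsigned long size = 1048576, priority = 12;
            unsigned long scan;

            ap = anon_prio * (recent_scanned[0] + 1) / (recent_rotated[0] + 1);
            fp = file_prio * (recent_scanned[1] + 1) / (recent_rotated[1] + 1);
            denominator = ap + fp + 1;

            scan = (size >> priority) * fp / denominator;  /* file share */
            printf("ap=%lu fp=%lu file scan target=%lu\n", ap, fp, scan);
            return 0;
    }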
+ */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ + if (inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + + throttle_vm_writeout(sc->gfp_mask); +} + +/* Use reclaim/compaction for costly allocs or under memory pressure */ +static bool in_reclaim_compaction(struct scan_control *sc) +{ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + (sc->order > PAGE_ALLOC_COSTLY_ORDER || + sc->priority < DEF_PRIORITY - 2)) + return true; + + return false; +} + +/* + * Reclaim/compaction is used for high-order allocation requests. It reclaims + * order-0 pages before compacting the zone. should_continue_reclaim() returns + * true if more pages should be reclaimed such that when the page allocator + * calls try_to_compact_zone() that it will have enough free pages to succeed. + * It will give up earlier than that if there is difficulty reclaiming pages. 
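The proportional cut-back above can be traced with concrete numbers. A sketch using a four-entry array in place of nr[NR_LRU_LISTS] (anon inactive/active, then file inactive/active):

    #include <stdio.h>

    /* Traces the cut-back above: the smaller LRU type stops, the other is
     * trimmed to the unscanned share of (100 - percentage) percent of its
     * original target. Index order: anon inactive/active, file inactive/active. */
    int main(void)
    {
            unsigned long targets[4] = { 100, 50, 400, 200 };
            unsigned long nr[4]      = {  80, 40, 350, 170 };  /* still to scan */
            unsigned long nr_anon = nr[0] + nr[1];
            unsigned long nr_file = nr[2] + nr[3];
            unsigned long scan_target, percentage;
            int stop, keep, i;

            if (nr_file > nr_anon) {
                    scan_target = targets[0] + targets[1] + 1;
                    percentage = nr_anon * 100 / scan_target;
                    stop = 0;               /* anon stops */
            } else {
                    scan_target = targets[2] + targets[3] + 1;
                    percentage = nr_file * 100 / scan_target;
                    stop = 2;               /* file stops */
            }
            nr[stop] = nr[stop + 1] = 0;

            keep = stop == 0 ? 2 : 0;
            for (i = keep; i <= keep + 1; i++) {
                    unsigned long scanned = targets[i] - nr[i];
                    unsigned long target = targets[i] * (100 - percentage) / 100;

                    nr[i] = target > scanned ? target - scanned : 0;
            }
            printf("nr = { %lu, %lu, %lu, %lu }\n", nr[0], nr[1], nr[2], nr[3]);
            return 0;
    }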
+ */ +static inline bool should_continue_reclaim(struct zone *zone, unsigned long nr_reclaimed, unsigned long nr_scanned, struct scan_control *sc) @@ -2025,7 +2155,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, unsigned long inactive_lru_pages; /* If not in reclaim/compaction mode, stop */ - if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) + if (!in_reclaim_compaction(sc)) return false; /* Consider stopping depending on scan and reclaim activity */ @@ -2056,15 +2186,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); - if (nr_swap_pages > 0) - inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); + inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); + if (get_nr_swap_pages() > 0) + inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(mz->zone, sc->order)) { + switch (compaction_suitable(zone, sc->order)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -2073,100 +2203,53 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, } } -/* - * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. - */ -static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, - struct scan_control *sc) +static void shrink_zone(struct zone *zone, struct scan_control *sc) { - unsigned long nr[NR_LRU_LISTS]; - unsigned long nr_to_scan; - enum lru_list lru; unsigned long nr_reclaimed, nr_scanned; - unsigned long nr_to_reclaim = sc->nr_to_reclaim; - struct blk_plug plug; -restart: - nr_reclaimed = 0; - nr_scanned = sc->nr_scanned; - get_scan_count(mz, sc, nr, priority); + do { + struct mem_cgroup *root = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = sc->priority, + }; + struct mem_cgroup *memcg; - blk_start_plug(&plug); - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || - nr[LRU_INACTIVE_FILE]) { - for_each_evictable_lru(lru) { - if (nr[lru]) { - nr_to_scan = min_t(unsigned long, - nr[lru], SWAP_CLUSTER_MAX); - nr[lru] -= nr_to_scan; + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; - nr_reclaimed += shrink_list(lru, nr_to_scan, - mz, sc, priority); - } - } - /* - * On large memory systems, scan >> priority can become - * really large. This is fine for the starting priority; - * we want to put equal scanning pressure on each zone. - * However, if the VM has a harder time of freeing pages, - * with multiple processes reclaiming pages, the total - * freeing target can get unreasonably large. - */ - if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) - break; - } - blk_finish_plug(&plug); - sc->nr_reclaimed += nr_reclaimed; - - /* - * Even if we did not try to evict anon pages at all, we want to - * rebalance the anon lru active/inactive ratio. 
- */ - if (inactive_anon_is_low(mz)) - shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + struct lruvec *lruvec; - /* reclaim/compaction might need reclaim to continue */ - if (should_continue_reclaim(mz, nr_reclaimed, - sc->nr_scanned - nr_scanned, sc)) - goto restart; + lruvec = mem_cgroup_zone_lruvec(zone, memcg); - throttle_vm_writeout(sc->gfp_mask); -} + shrink_lruvec(lruvec, sc); -static void shrink_zone(int priority, struct zone *zone, - struct scan_control *sc) -{ - struct mem_cgroup *root = sc->target_mem_cgroup; - struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, - .priority = priority, - }; - struct mem_cgroup *memcg; + /* + * Direct reclaim and kswapd have to scan all memory + * cgroups to fulfill the overall scan target for the + * zone. + * + * Limit reclaim, on the other hand, only cares about + * nr_to_reclaim pages to be reclaimed and it will + * retry with decreasing priority if one round over the + * whole hierarchy is not sufficient. + */ + if (!global_reclaim(sc) && + sc->nr_reclaimed >= sc->nr_to_reclaim) { + mem_cgroup_iter_break(root, memcg); + break; + } + memcg = mem_cgroup_iter(root, memcg, &reclaim); + } while (memcg); - memcg = mem_cgroup_iter(root, NULL, &reclaim); - do { - struct mem_cgroup_zone mz = { - .mem_cgroup = memcg, - .zone = zone, - }; + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); - shrink_mem_cgroup_zone(priority, &mz, sc); - /* - * Limit reclaim has historically picked one memcg and - * scanned it with decreasing priority levels until - * nr_to_reclaim had been reclaimed. This priority - * cycle is thus over after a single memcg. - * - * Direct reclaim and kswapd, on the other hand, have - * to scan all memory cgroups to fulfill the overall - * scan target for the zone. - */ - if (!global_reclaim(sc)) { - mem_cgroup_iter_break(root, memcg); - break; - } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)); } /* Returns true if compaction should go ahead for a high-order request */ @@ -2186,7 +2269,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * a reasonable chance of completing and allocating the page */ balance_gap = min(low_wmark_pages(zone), - (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / KSWAPD_ZONE_BALANCE_GAP_RATIO); watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); @@ -2226,23 +2309,32 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * the caller that it should consider retrying the allocation instead of * further reclaim. 
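The hierarchy walk above differs for global and limit reclaim only in the early exit. A sketch with an array standing in for mem_cgroup_iter()'s tree walk and made-up reclaim amounts:

    #include <stdio.h>
    #include <stdbool.h>

    struct toy_memcg {                     /* stand-in for a memcg in the tree */
            const char *name;
            unsigned long reclaimable;
    };

    /* Global reclaim visits every group; limit (memcg) reclaim breaks out as
     * soon as nr_to_reclaim is met, like mem_cgroup_iter_break() above. */
    static unsigned long walk(const struct toy_memcg *tree, int n,
                              bool global_reclaim, unsigned long nr_to_reclaim)
    {
            unsigned long nr_reclaimed = 0;
            int i;

            for (i = 0; i < n; i++) {
                    nr_reclaimed += tree[i].reclaimable;   /* shrink_lruvec() */
                    printf("shrank %s\n", tree[i].name);
                    if (!global_reclaim && nr_reclaimed >= nr_to_reclaim)
                            break;
            }
            return nr_reclaimed;
    }

    int main(void)
    {
            const struct toy_memcg tree[] = {
                    { "root", 8 }, { "A", 40 }, { "A/1", 64 },
            };

            walk(tree, 3, true, 32);       /* kswapd/direct: all three */
            walk(tree, 3, false, 32);      /* limit reclaim: stops after A */
            return 0;
    }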
 */
-static bool shrink_zones(int priority, struct zonelist *zonelist,
-			 struct scan_control *sc)
+static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
+	unsigned long lru_pages = 0;
 	bool aborted_reclaim = false;
+	struct reclaim_state *reclaim_state = current->reclaim_state;
+	gfp_t orig_mask;
+	struct shrink_control shrink = {
+		.gfp_mask = sc->gfp_mask,
+	};
+	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
 
 	/*
 	 * If the number of buffer_heads in the machine exceeds the maximum
 	 * allowed level, force direct reclaim to scan the highmem zone as
 	 * highmem pages could be pinning lowmem pages storing buffer_heads
 	 */
+	orig_mask = sc->gfp_mask;
 	if (buffer_heads_over_limit)
 		sc->gfp_mask |= __GFP_HIGHMEM;
 
+	nodes_clear(shrink.nodes_to_scan);
+
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 					gfp_zone(sc->gfp_mask), sc->nodemask) {
 		if (!populated_zone(zone))
@@ -2254,9 +2346,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 		if (global_reclaim(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+
+			lru_pages += zone_reclaimable_pages(zone);
+			node_set(zone_to_nid(zone), shrink.nodes_to_scan);
+
+			if (sc->priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;	/* Let kswapd poll it */
-			if (COMPACTION_BUILD) {
+			if (IS_ENABLED(CONFIG_COMPACTION)) {
 				/*
 				 * If we already have plenty of memory free for
 				 * compaction in this zone, don't free any more.
@@ -2266,7 +2363,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 				 * noticeable problem, like transparent huge
 				 * page allocations.
 				 */
-				if (compaction_ready(zone, sc)) {
+				if ((zonelist_zone_idx(z) <= requested_highidx)
+				    && compaction_ready(zone, sc)) {
 					aborted_reclaim = true;
 					continue;
 				}
@@ -2286,15 +2384,30 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 			/* need some check for avoid more shrink_zone() */
 		}
 
-		shrink_zone(priority, zone, sc);
+		shrink_zone(zone, sc);
 	}
 
-	return aborted_reclaim;
-}
+	/*
+	 * Don't shrink slabs when reclaiming memory from over limit cgroups
+	 * but do shrink slab at least once when aborting reclaim for
+	 * compaction to avoid unevenly scanning file/anon LRU pages over slab
+	 * pages.
+	 */
+	if (global_reclaim(sc)) {
+		shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+		if (reclaim_state) {
+			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+			reclaim_state->reclaimed_slab = 0;
+		}
+	}
 
-static bool zone_reclaimable(struct zone *zone)
-{
-	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+	/*
+	 * Restore to original mask to avoid the impact on the caller if we
+	 * promoted it to __GFP_HIGHMEM.
+	 */
+	sc->gfp_mask = orig_mask;
+
+	return aborted_reclaim;
 }
 
 /* All zones in zonelist are unreclaimable? */
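
Note the orig_mask bookkeeping added above. The save/promote/restore pattern in isolation, as a sketch (do_scan() stands in for the zonelist walk):

	gfp_t orig_mask = sc->gfp_mask;

	if (buffer_heads_over_limit)
		sc->gfp_mask |= __GFP_HIGHMEM;	/* temporarily widen the scan */

	do_scan(sc);				/* placeholder for the zone loop */

	sc->gfp_mask = orig_mask;		/* callers never see the promotion */

Without the restore, a caller that reuses the scan_control would keep reclaiming highmem even after buffer_heads_over_limit clears.
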
@@ -2310,7 +2423,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
 			continue;
 		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
-		if (!zone->all_unreclaimable)
+		if (zone_reclaimable(zone))
 			return false;
 	}
 
@@ -2334,14 +2447,9 @@ static bool all_unreclaimable(struct zonelist *zonelist,
  * else, the number of pages reclaimed
  */
 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
-					  struct scan_control *sc,
-					  struct shrink_control *shrink)
+					  struct scan_control *sc)
 {
-	int priority;
 	unsigned long total_scanned = 0;
-	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct zoneref *z;
-	struct zone *zone;
 	unsigned long writeback_threshold;
 	bool aborted_reclaim;
 
@@ -2350,37 +2458,24 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	if (global_reclaim(sc))
 		count_vm_event(ALLOCSTALL);
 
-	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+	do {
+		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+				sc->priority);
 		sc->nr_scanned = 0;
-		if (!priority)
-			disable_swap_token(sc->target_mem_cgroup);
-		aborted_reclaim = shrink_zones(priority, zonelist, sc);
+		aborted_reclaim = shrink_zones(zonelist, sc);
 
-		/*
-		 * Don't shrink slabs when reclaiming memory from
-		 * over limit cgroups
-		 */
-		if (global_reclaim(sc)) {
-			unsigned long lru_pages = 0;
-			for_each_zone_zonelist(zone, z, zonelist,
-					gfp_zone(sc->gfp_mask)) {
-				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-					continue;
-
-				lru_pages += zone_reclaimable_pages(zone);
-			}
-
-			shrink_slab(shrink, sc->nr_scanned, lru_pages);
-			if (reclaim_state) {
-				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
-				reclaim_state->reclaimed_slab = 0;
-			}
-		}
 		total_scanned += sc->nr_scanned;
 		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
 			goto out;
 
 		/*
+		 * If we're having trouble reclaiming, start doing
+		 * writepage even in laptop mode.
+		 */
+		if (sc->priority < DEF_PRIORITY - 2)
+			sc->may_writepage = 1;
+
+		/*
 		 * Try to write back as many pages as we just scanned. This
 		 * tends to cause slow streaming writers to write data to the
 		 * disk smoothly, at the dirtying rate, which is nice. But
@@ -2393,18 +2488,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 					WB_REASON_TRY_TO_FREE_PAGES);
 			sc->may_writepage = 1;
 		}
-
-		/* Take a nap, wait for some writeback to complete */
-		if (!sc->hibernation_mode && sc->nr_scanned &&
-		    priority < DEF_PRIORITY - 2) {
-			struct zone *preferred_zone;
-
-			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
-						&cpuset_current_mems_allowed,
-						&preferred_zone);
-			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
-		}
-	}
+	} while (--sc->priority >= 0 && !aborted_reclaim);
 
 out:
 	delayacct_freepages_end();
@@ -2431,36 +2515,137 @@ out:
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+	bool wmark_ok;
+
+	for (i = 0; i <= ZONE_NORMAL; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+	/* kswapd must be awake if processes are being throttled */
+	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+		pgdat->classzone_idx = min(pgdat->classzone_idx,
+					   (enum zone_type)ZONE_NORMAL);
+		wake_up_interruptible(&pgdat->kswapd_wait);
+	}
+
+	return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached.
+ *
+ * Returns true if a fatal signal was delivered during throttling. If this
+ * happens, the page allocator should not consider triggering the OOM killer.
+ */
+static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	pg_data_t *pgdat;
+
+	/*
+	 * Kernel threads should not be throttled as they may be indirectly
+	 * responsible for cleaning pages necessary for reclaim to make forward
+	 * progress. kjournald for example may enter direct reclaim while
+	 * committing a transaction where throttling it could force other
+	 * processes to block on log_wait_commit().
+	 */
+	if (current->flags & PF_KTHREAD)
+		goto out;
+
+	/*
+	 * If a fatal signal is pending, this process should not throttle.
+	 * It should return quickly so it can exit and free its memory
+	 */
+	if (fatal_signal_pending(current))
+		goto out;
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	pgdat = zone->zone_pgdat;
+	if (pfmemalloc_watermark_ok(pgdat))
+		goto out;
+
+	/* Account for the throttling */
+	count_vm_event(PGSCAN_DIRECT_THROTTLE);
+
+	/*
+	 * If the caller cannot enter the filesystem, it's possible that it
+	 * is due to the caller holding an FS lock or performing a journal
+	 * transaction in the case of a filesystem like ext[3|4]. In this case,
+	 * it is not safe to block on pfmemalloc_wait as kswapd could be
+	 * blocked waiting on the same lock. Instead, throttle for up to a
+	 * second before continuing.
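
To put numbers on the pfmemalloc_watermark_ok() test above, with hypothetical watermarks: if ZONE_DMA's min watermark is 128 pages and ZONE_NORMAL's is 896, the reserve is 1024 pages, so direct reclaimers start to be throttled once the node's lowmem zones hold 512 free pages or fewer. The check, reduced to its core:

	unsigned long reserve = 0, free = 0;
	int i;

	for (i = 0; i <= ZONE_NORMAL; i++) {	/* lowmem zones only */
		reserve += min_wmark_pages(&pgdat->node_zones[i]);
		free += zone_page_state(&pgdat->node_zones[i], NR_FREE_PAGES);
	}
	wmark_ok = free > reserve / 2;		/* throttle when this is false */
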
+	 */
+	if (!(gfp_mask & __GFP_FS)) {
+		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+			pfmemalloc_watermark_ok(pgdat), HZ);
+
+		goto check_pending;
+	}
+
+	/* Throttle until kswapd wakes the process */
+	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(pgdat));
+
+check_pending:
+	if (fatal_signal_pending(current))
+		return true;
+
+out:
+	return false;
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
 	unsigned long nr_reclaimed;
 	struct scan_control sc = {
-		.gfp_mask = gfp_mask,
+		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
 		.may_writepage = !laptop_mode,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.may_unmap = 1,
 		.may_swap = 1,
 		.order = order,
+		.priority = DEF_PRIORITY,
 		.target_mem_cgroup = NULL,
 		.nodemask = nodemask,
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
+
+	/*
+	 * Do not enter reclaim if fatal signal was delivered while throttled.
+	 * 1 is returned so that the page allocator does not OOM kill at this
+	 * point.
+	 */
+	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
+		return 1;
 
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
 
-	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
 
 	return nr_reclaimed;
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 						gfp_t gfp_mask, bool noswap,
@@ -2474,17 +2659,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.may_swap = !noswap,
 		.order = 0,
+		.priority = 0,
 		.target_mem_cgroup = memcg,
 	};
-	struct mem_cgroup_zone mz = {
-		.mem_cgroup = memcg,
-		.zone = zone,
-	};
+	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 
-	trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
 						      sc.may_writepage,
 						      sc.gfp_mask);
 
@@ -2495,7 +2678,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 	 * will pick up pages from other mem cgroup's as well. We hack
 	 * the priority and make it zero.
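
The gfp_mask blend used for memcg reclaim above deserves a gloss: reclaim-behaviour bits (can we do I/O, enter the FS, and so on) come from the caller, while placement bits are taken wholesale from GFP_HIGHUSER_MOVABLE. Written out as a sketch:

	/* caller decides *how* reclaim may behave,
	 * GFP_HIGHUSER_MOVABLE decides *where* pages may come from */
	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
		      (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);

This keeps memcg reclaim free to scan any zone regardless of the zone restrictions the triggering allocation carried.
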
 	 */
-	shrink_mem_cgroup_zone(0, &mz, &sc);
+	shrink_lruvec(lruvec, &sc);
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
@@ -2516,14 +2699,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_swap = !noswap,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.order = 0,
+		.priority = DEF_PRIORITY,
 		.target_mem_cgroup = memcg,
 		.nodemask = NULL, /* we don't care the placement */
 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
 
 	/*
 	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
@@ -2538,7 +2719,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					    sc.may_writepage,
 					    sc.gfp_mask);
 
-	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
@@ -2546,8 +2727,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 }
 #endif
 
-static void age_active_anon(struct zone *zone, struct scan_control *sc,
-			    int priority)
+static void age_active_anon(struct zone *zone, struct scan_control *sc)
 {
 	struct mem_cgroup *memcg;
 
@@ -2556,26 +2736,41 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
-		struct mem_cgroup_zone mz = {
-			.mem_cgroup = memcg,
-			.zone = zone,
-		};
+		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-		if (inactive_anon_is_low(&mz))
-			shrink_active_list(SWAP_CLUSTER_MAX, &mz,
-					   sc, priority, 0);
+		if (inactive_anon_is_low(lruvec))
+			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+					   sc, LRU_ACTIVE_ANON);
 
 		memcg = mem_cgroup_iter(NULL, memcg, NULL);
 	} while (memcg);
 }
 
+static bool zone_balanced(struct zone *zone, int order,
+			  unsigned long balance_gap, int classzone_idx)
+{
+	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
+				    balance_gap, classzone_idx, 0))
+		return false;
+
+	if (IS_ENABLED(CONFIG_COMPACTION) && order &&
+	    !compaction_suitable(zone, order))
+		return false;
+
+	return true;
+}
+
 /*
- * pgdat_balanced is used when checking if a node is balanced for high-order
- * allocations. Only zones that meet watermarks and are in a zone allowed
- * by the callers classzone_idx are added to balanced_pages. The total of
- * balanced pages must be at least 25% of the zones allowed by classzone_idx
- * for the node to be considered balanced. Forcing all zones to be balanced
- * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * pgdat_balanced() is used when checking if a node is balanced.
+ *
+ * For order-0, all zones must be balanced!
+ *
+ * For high-order allocations only zones that meet watermarks and are in a
+ * zone allowed by the callers classzone_idx are added to balanced_pages. The
+ * total of balanced pages must be at least 25% of the zones allowed by
+ * classzone_idx for the node to be considered balanced. Forcing all zones to
+ * be balanced for high orders can cause excessive reclaim when there are
+ * imbalanced zones.
 * The choice of 25% is due to
 *   o a 16M DMA zone that is balanced will not balance a zone on any
 *     reasonable sized machine
@@ -2585,31 +2780,12 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
 * Similarly, on x86-64 the Normal zone would need to be at least 1G
 * to balance a node on its own. These seemed like reasonable ratios.
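
As a worked example of the 25% rule that pgdat_balanced() implements below, with made-up numbers: on a node with 4,194,304 managed pages in the allowed zones (16GB of 4KB pages), a high-order request considers the node balanced once the zones meeting their watermarks account for at least 4194304 >> 2 = 1,048,576 of those pages. The high-order branch reduces to:

	/* high-order: a quarter of the managed pages must sit in balanced zones */
	if (order)
		return balanced_pages >= (managed_pages >> 2);

Order-0 takes the strict path instead: a single populated, reclaimable zone failing its watermark makes the whole node unbalanced.
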
 */
-static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
-						int classzone_idx)
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 {
-	unsigned long present_pages = 0;
+	unsigned long managed_pages = 0;
+	unsigned long balanced_pages = 0;
 	int i;
 
-	for (i = 0; i <= classzone_idx; i++)
-		present_pages += pgdat->node_zones[i].present_pages;
-
-	/* A special case here: if zone has no page, we think it's balanced */
-	return balanced_pages >= (present_pages >> 2);
-}
-
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
-					int classzone_idx)
-{
-	int i;
-	unsigned long balanced = 0;
-	bool all_zones_ok = true;
-
-	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
-	if (remaining)
-		return true;
-
 	/* Check the watermark levels */
 	for (i = 0; i <= classzone_idx; i++) {
 		struct zone *zone = pgdat->node_zones + i;
@@ -2617,33 +2793,143 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 		if (!populated_zone(zone))
 			continue;
 
+		managed_pages += zone->managed_pages;
+
 		/*
+		 * A special case here:
+		 *
 		 * balance_pgdat() skips over all_unreclaimable after
 		 * DEF_PRIORITY. Effectively, it considers them balanced so
-		 * they must be considered balanced here as well if kswapd
-		 * is to sleep
+		 * they must be considered balanced here as well!
 		 */
-		if (zone->all_unreclaimable) {
-			balanced += zone->present_pages;
+		if (!zone_reclaimable(zone)) {
+			balanced_pages += zone->managed_pages;
 			continue;
 		}
 
-		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
-							i, 0))
-			all_zones_ok = false;
-		else
-			balanced += zone->present_pages;
+		if (zone_balanced(zone, order, 0, i))
+			balanced_pages += zone->managed_pages;
+		else if (!order)
+			return false;
 	}
 
-	/*
-	 * For high-order requests, the balanced zones must contain at least
-	 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
-	 * must be balanced
-	 */
 	if (order)
-		return !pgdat_balanced(pgdat, balanced, classzone_idx);
+		return balanced_pages >= (managed_pages >> 2);
 	else
-		return !all_zones_ok;
+		return true;
+}
+
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
+					int classzone_idx)
+{
+	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
+	if (remaining)
+		return false;
+
+	/*
+	 * There is a potential race between when kswapd checks its watermarks
+	 * and a process gets throttled. There is also a potential race if
+	 * processes get throttled, kswapd wakes, a large process exits,
+	 * thereby balancing the zones, which causes kswapd to miss a wakeup.
+	 * If kswapd is going to sleep, no process should be sleeping on
+	 * pfmemalloc_wait so wake them now if necessary. If necessary,
+	 * processes will wake kswapd and get throttled again
+	 */
+	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+		wake_up(&pgdat->pfmemalloc_wait);
+		return false;
+	}
+
+	return pgdat_balanced(pgdat, order, classzone_idx);
+}
+
+/*
+ * kswapd shrinks the zone by the number of pages required to reach
+ * the high watermark.
+ *
+ * Returns true if kswapd scanned at least the requested number of pages to
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
+ */
+static bool kswapd_shrink_zone(struct zone *zone,
+			       int classzone_idx,
+			       struct scan_control *sc,
+			       unsigned long lru_pages,
+			       unsigned long *nr_attempted)
+{
+	int testorder = sc->order;
+	unsigned long balance_gap;
+	struct reclaim_state *reclaim_state = current->reclaim_state;
+	struct shrink_control shrink = {
+		.gfp_mask = sc->gfp_mask,
+	};
+	bool lowmem_pressure;
+
+	/* Reclaim above the high watermark. */
+	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+
+	/*
+	 * Kswapd reclaims only single pages with compaction enabled. Trying
+	 * too hard to reclaim until contiguous free pages have become
+	 * available can hurt performance by evicting too much useful data
+	 * from memory. Do not reclaim more than needed for compaction.
+	 */
+	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+			compaction_suitable(zone, sc->order) !=
+				COMPACT_SKIPPED)
+		testorder = 0;
+
+	/*
+	 * We put equal pressure on every zone, unless one zone has way too
+	 * many pages free already. The "too many pages" is defined as the
+	 * high wmark plus a "gap" where the gap is either the low
+	 * watermark or 1% of the zone, whichever is smaller.
+	 */
+	balance_gap = min(low_wmark_pages(zone),
+		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+		KSWAPD_ZONE_BALANCE_GAP_RATIO);
+
+	/*
+	 * If there is no low memory pressure or the zone is balanced then no
+	 * reclaim is necessary
+	 */
+	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
+	if (!lowmem_pressure && zone_balanced(zone, testorder,
+						balance_gap, classzone_idx))
+		return true;
+
+	shrink_zone(zone, sc);
+	nodes_clear(shrink.nodes_to_scan);
+	node_set(zone_to_nid(zone), shrink.nodes_to_scan);
+
+	reclaim_state->reclaimed_slab = 0;
+	shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+	sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+
+	/* Account for the number of pages attempted to reclaim */
+	*nr_attempted += sc->nr_to_reclaim;
+
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
+	/*
+	 * If a zone reaches its high watermark, consider it to be no longer
+	 * congested. It's possible there are dirty pages backed by congested
+	 * BDIs but as pressure is relieved, speculatively avoid congestion
+	 * waits.
+	 */
+	if (zone_reclaimable(zone) &&
+	    zone_balanced(zone, testorder, 0, classzone_idx)) {
+		zone_clear_flag(zone, ZONE_CONGESTED);
+		zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
+	}
+
+	return sc->nr_scanned >= sc->nr_to_reclaim;
+}
 
 /*
@@ -2670,46 +2956,28 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 							int *classzone_idx)
 {
-	int all_zones_ok;
-	unsigned long balanced;
-	int priority;
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
-	unsigned long total_scanned;
-	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
 		.may_swap = 1,
-		/*
-		 * kswapd doesn't want to be bailed out while reclaim. because
-		 * we want to put equal scanning pressure on each zone.
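
The balance_gap arithmetic above, with concrete numbers and assuming KSWAPD_ZONE_BALANCE_GAP_RATIO is 100 (its value at the time): a zone with 1,000,000 managed pages yields (1000000 + 99) / 100 = 10,000 pages, i.e. a 1% gap rounded up, and the gap actually used is that value or the low watermark, whichever is smaller:

	/* gap = min(low watermark, ~1% of the zone), rounding the 1% up */
	balance_gap = min(low_wmark_pages(zone),
			  (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
			  KSWAPD_ZONE_BALANCE_GAP_RATIO);

The min() keeps small zones from being asked to hold a gap larger than their own low watermark.
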
-		 */
-		.nr_to_reclaim = ULONG_MAX,
+		.may_writepage = !laptop_mode,
 		.order = order,
 		.target_mem_cgroup = NULL,
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
-loop_again:
-	total_scanned = 0;
-	sc.nr_reclaimed = 0;
-	sc.may_writepage = !laptop_mode;
 
 	count_vm_event(PAGEOUTRUN);
 
-	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+	do {
 		unsigned long lru_pages = 0;
-		int has_under_min_watermark_zone = 0;
+		unsigned long nr_attempted = 0;
+		bool raise_priority = true;
+		bool pgdat_needs_compaction = (order > 0);
 
-		/* The swap token gets in the way of swapout... */
-		if (!priority)
-			disable_swap_token(NULL);
-
-		all_zones_ok = 1;
-		balanced = 0;
+		sc.nr_reclaimed = 0;
 
 		/*
 		 * Scan in the highmem->dma direction for the highest
@@ -2721,14 +2989,15 @@ loop_again:
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;
 
 			/*
 			 * Do some background aging of the anon list, to give
 			 * pages a chance to be referenced before reclaiming.
 			 */
-			age_active_anon(zone, &sc, priority);
+			age_active_anon(zone, &sc);
 
 			/*
 			 * If the number of buffer_heads in the machine
@@ -2741,25 +3010,50 @@ loop_again:
 				break;
 			}
 
-			if (!zone_watermark_ok_safe(zone, order,
-					high_wmark_pages(zone), 0, 0)) {
+			if (!zone_balanced(zone, order, 0, 0)) {
 				end_zone = i;
 				break;
 			} else {
-				/* If balanced, clear the congested flag */
+				/*
+				 * If balanced, clear the dirty and congested
+				 * flags
+				 */
 				zone_clear_flag(zone, ZONE_CONGESTED);
+				zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
 			}
 		}
+
 		if (i < 0)
 			goto out;
 
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
+			if (!populated_zone(zone))
+				continue;
+
 			lru_pages += zone_reclaimable_pages(zone);
+
+			/*
+			 * If any zone is currently balanced then kswapd will
+			 * not call compaction as it is expected that the
+			 * necessary pages are already available.
+			 */
+			if (pgdat_needs_compaction &&
+					zone_watermark_ok(zone, order,
+						low_wmark_pages(zone),
+						*classzone_idx, 0))
+				pgdat_needs_compaction = false;
 		}
 
 		/*
+		 * If we're having trouble reclaiming, start doing writepage
+		 * even in laptop mode.
+		 */
+		if (sc.priority < DEF_PRIORITY - 2)
+			sc.may_writepage = 1;
+
+		/*
 		 * Now scan the zone in the dma->highmem direction, stopping
 		 * at the last zone which needs scanning.
 		 *
@@ -2770,13 +3064,12 @@ loop_again:
 		 */
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
-			int nr_slab, testorder;
-			unsigned long balance_gap;
 
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;
 
 			sc.nr_scanned = 0;
@@ -2789,189 +3082,61 @@ loop_again:
 							order, sc.gfp_mask,
 							&nr_soft_scanned);
 			sc.nr_reclaimed += nr_soft_reclaimed;
-			total_scanned += nr_soft_scanned;
 
-			/*
-			 * We put equal pressure on every zone, unless
-			 * one zone has way too many pages free
-			 * already. The "too many pages" is defined
-			 * as the high wmark plus a "gap" where the
-			 * gap is either the low watermark or 1%
-			 * of the zone, whichever is smaller.
-			 */
-			balance_gap = min(low_wmark_pages(zone),
-				(zone->present_pages +
-					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
-				KSWAPD_ZONE_BALANCE_GAP_RATIO);
-			/*
-			 * Kswapd reclaims only single pages with compaction
-			 * enabled. Trying too hard to reclaim until contiguous
-			 * free pages have become available can hurt performance
-			 * by evicting too much useful data from memory.
-			 * Do not reclaim more than needed for compaction.
-			 */
-			testorder = order;
-			if (COMPACTION_BUILD && order &&
-				compaction_suitable(zone, order) !=
-					COMPACT_SKIPPED)
-				testorder = 0;
-
-			if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
-			    !zone_watermark_ok_safe(zone, testorder,
-					high_wmark_pages(zone) + balance_gap,
-					end_zone, 0)) {
-				shrink_zone(priority, zone, &sc);
-
-				reclaim_state->reclaimed_slab = 0;
-				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
-				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-				total_scanned += sc.nr_scanned;
-
-				if (nr_slab == 0 && !zone_reclaimable(zone))
-					zone->all_unreclaimable = 1;
-			}
 
 			/*
-			 * If we've done a decent amount of scanning and
-			 * the reclaim ratio is low, start doing writepage
-			 * even in laptop mode
+			 * There should be no need to raise the scanning
+			 * priority if enough pages are already being scanned
+			 * that the high watermark would be met at 100%
+			 * efficiency.
 			 */
-			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
-				sc.may_writepage = 1;
-
-			if (zone->all_unreclaimable) {
-				if (end_zone && end_zone == i)
-					end_zone--;
-				continue;
-			}
-
-			if (!zone_watermark_ok_safe(zone, testorder,
-					high_wmark_pages(zone), end_zone, 0)) {
-				all_zones_ok = 0;
-				/*
-				 * We are still under min water mark. This
-				 * means that we have a GFP_ATOMIC allocation
-				 * failure risk. Hurry up!
-				 */
-				if (!zone_watermark_ok_safe(zone, order,
-					    min_wmark_pages(zone), end_zone, 0))
-					has_under_min_watermark_zone = 1;
-			} else {
-				/*
-				 * If a zone reaches its high watermark,
-				 * consider it to be no longer congested. It's
-				 * possible there are dirty pages backed by
-				 * congested BDIs but as pressure is relieved,
-				 * spectulatively avoid congestion waits
-				 */
-				zone_clear_flag(zone, ZONE_CONGESTED);
-				if (i <= *classzone_idx)
-					balanced += zone->present_pages;
-			}
-
-		}
-		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
-			break;		/* kswapd: all done */
-		/*
-		 * OK, kswapd is getting into trouble. Take a nap, then take
-		 * another pass across the zones.
-		 */
-		if (total_scanned && (priority < DEF_PRIORITY - 2)) {
-			if (has_under_min_watermark_zone)
-				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
-			else
-				congestion_wait(BLK_RW_ASYNC, HZ/10);
+			if (kswapd_shrink_zone(zone, end_zone, &sc,
+					lru_pages, &nr_attempted))
+				raise_priority = false;
 		}
 
 		/*
-		 * We do this so kswapd doesn't build up large priorities for
-		 * example when it is freeing in parallel with allocators. It
-		 * matches the direct reclaim path behaviour in terms of impact
-		 * on zone->*_priority.
+		 * If the low watermark is met there is no need for processes
+		 * to be throttled on pfmemalloc_wait as they should not be
+		 * able to safely make forward progress. Wake them
 		 */
-		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
-			break;
-	}
-out:
-
-	/*
-	 * order-0: All zones must meet high watermark for a balanced node
-	 * high-order: Balanced zones must make up at least 25% of the node
-	 *             for the node to be balanced
-	 */
-	if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
-		cond_resched();
-
-		try_to_freeze();
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat))
+			wake_up(&pgdat->pfmemalloc_wait);
 
 		/*
-		 * Fragmentation may mean that the system cannot be
-		 * rebalanced for high-order allocations in all zones.
-		 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
-		 * it means the zones have been fully scanned and are still
-		 * not balanced.  For high-order allocations, there is
-		 * little point trying all over again as kswapd may
-		 * infinite loop.
-		 *
-		 * Instead, recheck all watermarks at order-0 as they
-		 * are the most important. If watermarks are ok, kswapd will go
-		 * back to sleep. High-order users can still perform direct
-		 * reclaim if they wish.
+		 * Fragmentation may mean that the system cannot be rebalanced
+		 * for high-order allocations in all zones. If twice the
+		 * allocation size has been reclaimed and the zones are still
+		 * not balanced then recheck the watermarks at order-0 to
+		 * prevent kswapd reclaiming excessively. Assume that a
+		 * process that requested a high-order allocation can perform
+		 * direct reclaim/compaction itself.
 		 */
-		if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
+		if (order && sc.nr_reclaimed >= 2UL << order)
 			order = sc.order = 0;
 
-		goto loop_again;
-	}
-
-	/*
-	 * If kswapd was reclaiming at a higher order, it has the option of
-	 * sleeping without all zones being balanced. Before it does, it must
-	 * ensure that the watermarks for order-0 on *all* zones are met and
-	 * that the congestion flags are cleared. The congestion flag must
-	 * be cleared as kswapd is the only mechanism that clears the flag
-	 * and it is potentially going to sleep here.
-	 */
-	if (order) {
-		int zones_need_compaction = 1;
-
-		for (i = 0; i <= end_zone; i++) {
-			struct zone *zone = pgdat->node_zones + i;
-
-			if (!populated_zone(zone))
-				continue;
-
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
-				continue;
-
-			/* Would compaction fail due to lack of free memory? */
-			if (COMPACTION_BUILD &&
-			    compaction_suitable(zone, order) == COMPACT_SKIPPED)
-				goto loop_again;
-
-			/* Confirm the zone is balanced for order-0 */
-			if (!zone_watermark_ok(zone, 0,
-					high_wmark_pages(zone), 0, 0)) {
-				order = sc.order = 0;
-				goto loop_again;
-			}
-
-			/* Check if the memory needs to be defragmented. */
-			if (zone_watermark_ok(zone, order,
-				low_wmark_pages(zone), *classzone_idx, 0))
-				zones_need_compaction = 0;
-
-			/* If balanced, clear the congested flag */
-			zone_clear_flag(zone, ZONE_CONGESTED);
-		}
+		/* Check if kswapd should be suspending */
+		if (try_to_freeze() || kthread_should_stop())
+			break;
 
-		if (zones_need_compaction)
+		/*
+		 * Compact if necessary and kswapd is reclaiming at least the
+		 * high watermark number of pages as requested
+		 */
+		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
 			compact_pgdat(pgdat, order);
-	}
 
+		/*
+		 * Raise priority if scanning rate is too low or there was no
+		 * progress in reclaiming pages
+		 */
+		if (raise_priority || !sc.nr_reclaimed)
+			sc.priority--;
+	} while (sc.priority >= 1 &&
+		 !pgdat_balanced(pgdat, order, *classzone_idx));
+
+out:
 	/*
-	 * Return the order we were reclaiming at so sleeping_prematurely()
+	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,
 	 * if another caller entered the allocator slow path while kswapd
 	 * was awake, order will remain at the higher level
@@ -2991,7 +3156,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3001,7 +3166,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -3013,7 +3178,18 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		 * them before going back to sleep.
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
-		schedule();
+
+		/*
+		 * Compaction records what page blocks it recently failed to
+		 * isolate pages from and skips them in the future scanning.
+		 * When kswapd is going to sleep, it is reasonable to assume
+		 * that page isolation and compaction may succeed, so reset
+		 * the cache.
+		 */
+		reset_isolation_suitable(pgdat);
+
+		if (!kthread_should_stop())
+			schedule();
+
 		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
 	} else {
 		if (remaining)
@@ -3077,7 +3253,7 @@ static int kswapd(void *p)
 	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
 	balanced_classzone_idx = classzone_idx;
 	for ( ; ; ) {
-		int ret;
+		bool ret;
 
 		/*
 		 * If the last balance_pgdat was unsuccessful it's unlikely a
@@ -3125,6 +3301,8 @@ static int kswapd(void *p)
 						&balanced_classzone_idx);
 		}
 	}
+
+	current->reclaim_state = NULL;
 	return 0;
 }
 
@@ -3147,48 +3325,13 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	}
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
-	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+	if (zone_balanced(zone, order, 0, 0))
 		return;
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
-/*
- * The reclaimable count would be mostly accurate.
- * The less reclaimable pages may be
- * - mlocked pages, which will be moved to unevictable list when encountered
- * - mapped pages, which may require several travels to be reclaimed
- * - dirty pages, which is not "instantly" reclaimable
- */
-unsigned long global_reclaimable_pages(void)
-{
-	int nr;
-
-	nr = global_page_state(NR_ACTIVE_FILE) +
-	     global_page_state(NR_INACTIVE_FILE);
-
-	if (nr_swap_pages > 0)
-		nr += global_page_state(NR_ACTIVE_ANON) +
-		      global_page_state(NR_INACTIVE_ANON);
-
-	return nr;
-}
-
-unsigned long zone_reclaimable_pages(struct zone *zone)
-{
-	int nr;
-
-	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
-	     zone_page_state(zone, NR_INACTIVE_FILE);
-
-	if (nr_swap_pages > 0)
-		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
-		      zone_page_state(zone, NR_INACTIVE_ANON);
-
-	return nr;
-}
-
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3209,9 +3352,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.nr_to_reclaim = nr_to_reclaim,
 		.hibernation_mode = 1,
 		.order = 0,
-	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
+		.priority = DEF_PRIORITY,
 	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 	struct task_struct *p = current;
@@ -3222,7 +3363,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	p->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
@@ -3236,13 +3377,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int __devinit cpu_callback(struct notifier_block *nfb,
-				  unsigned long action, void *hcpu)
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+			void *hcpu)
 {
 	int nid;
 
 	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
-		for_each_node_state(nid, N_HIGH_MEMORY) {
+		for_each_node_state(nid, N_MEMORY) {
 			pg_data_t *pgdat = NODE_DATA(nid);
 			const struct cpumask *mask;
 
@@ -3272,21 +3413,25 @@ int kswapd_run(int nid)
 	if (IS_ERR(pgdat->kswapd)) {
 		/* failure at boot is fatal */
 		BUG_ON(system_state == SYSTEM_BOOTING);
-		printk("Failed to start kswapd on node %d\n",nid);
-		ret = -1;
+		pr_err("Failed to start kswapd on node %d\n", nid);
+		ret = PTR_ERR(pgdat->kswapd);
+		pgdat->kswapd = NULL;
	}
 	return ret;
 }
 
 /*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold lock_memory_hotplug().
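
With the error handling fixed above, kswapd_run() now reports the real kthread_run() error instead of a bare -1, and leaves pgdat->kswapd NULL on failure. A hypothetical hotplug-side caller (the surrounding code is invented for illustration) would then see the actual error code:

	/* sketch: onlining a node surfaces the kthread error, if any */
	int err = kswapd_run(nid);
	if (err)
		pr_warn("node %d onlined without kswapd: %d\n", nid, err);
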
 */
 void kswapd_stop(int nid)
 {
 	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
 
-	if (kswapd)
+	if (kswapd) {
 		kthread_stop(kswapd);
+		NODE_DATA(nid)->kswapd = NULL;
+	}
 }
 
 static int __init kswapd_init(void)
@@ -3294,7 +3439,7 @@ static int __init kswapd_init(void)
 	int nid;
 
 	swap_setup();
-	for_each_node_state(nid, N_HIGH_MEMORY)
+	for_each_node_state(nid, N_MEMORY)
 		kswapd_run(nid);
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
@@ -3386,15 +3531,14 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	int priority;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 		.may_swap = 1,
-		.nr_to_reclaim = max_t(unsigned long, nr_pages,
-				       SWAP_CLUSTER_MAX),
-		.gfp_mask = gfp_mask,
+		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
 		.order = order,
+		.priority = ZONE_RECLAIM_PRIORITY,
 	};
 	struct shrink_control shrink = {
 		.gfp_mask = sc.gfp_mask,
@@ -3417,11 +3561,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
 		 */
-		priority = ZONE_RECLAIM_PRIORITY;
 		do {
-			shrink_zone(priority, zone, &sc);
-			priority--;
-		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+			shrink_zone(zone, &sc);
+		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}
 
 	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -3432,10 +3574,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * number of slab pages and shake the slab until it is reduced
 		 * by the same nr_pages that we used for reclaiming unmapped
 		 * pages.
-		 *
-		 * Note that shrink_slab will free memory on all zones and may
-		 * take a long time.
 		 */
+		nodes_clear(shrink.nodes_to_scan);
+		node_set(zone_to_nid(zone), shrink.nodes_to_scan);
 		for (;;) {
 			unsigned long lru_pages = zone_reclaimable_pages(zone);
 
@@ -3484,7 +3625,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
 		return ZONE_RECLAIM_FULL;
 
-	if (zone->all_unreclaimable)
+	if (!zone_reclaimable(zone))
 		return ZONE_RECLAIM_FULL;
 
 	/*
@@ -3519,27 +3660,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 /*
  * page_evictable - test whether a page is evictable
  * @page: the page to test
- * @vma: the VMA in which the page is or will be mapped, may be NULL
  *
  * Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list.  The vma argument is !NULL when called from the
- * fault path to determine how to instantate a new page.
+ * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 *
 */
-int page_evictable(struct page *page, struct vm_area_struct *vma)
+int page_evictable(struct page *page)
 {
-
-	if (mapping_unevictable(page_mapping(page)))
-		return 0;
-
-	if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
-		return 0;
-
-	return 1;
+	return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
 }
 
 #ifdef CONFIG_SHMEM
@@ -3572,20 +3704,18 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
+		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		if (!PageLRU(page) || !PageUnevictable(page))
 			continue;
 
-		if (page_evictable(page, NULL)) {
+		if (page_evictable(page)) {
 			enum lru_list lru = page_lru_base_type(page);
 
-			VM_BUG_ON(PageActive(page));
+			VM_BUG_ON_PAGE(PageActive(page), page);
 			ClearPageUnevictable(page);
-			__dec_zone_state(zone, NR_UNEVICTABLE);
-			lruvec = mem_cgroup_lru_move_lists(zone, page,
-						LRU_UNEVICTABLE, lru);
-			list_move(&page->lru, &lruvec->lists[lru]);
-			__inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
+			add_page_to_lru_list(page, lruvec, lru);
 			pgrescued++;
 		}
 	}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7db1b9bab492..302dd076b8bf 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -19,6 +19,9 @@
 #include <linux/math64.h>
 #include <linux/writeback.h>
 #include <linux/compaction.h>
+#include <linux/mm_inline.h>
+
+#include "internal.h"
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -52,7 +55,6 @@ void all_vm_events(unsigned long *ret)
 }
 EXPORT_SYMBOL_GPL(all_vm_events);
 
-#ifdef CONFIG_HOTPLUG
 /*
  * Fold the foreign cpu events into our own.
  *
@@ -69,7 +71,6 @@ void vm_events_fold_cpu(int cpu)
 		fold_state->event[i] = 0;
 	}
 }
-#endif /* CONFIG_HOTPLUG */
 
 #endif /* CONFIG_VM_EVENT_COUNTERS */
 
@@ -142,7 +143,7 @@ int calculate_normal_threshold(struct zone *zone)
 * 125		1024		10	16-32 GB	9
 */
 
-	mem = zone->present_pages >> (27 - PAGE_SHIFT);
+	mem = zone->managed_pages >> (27 - PAGE_SHIFT);
 
 	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
 
@@ -416,12 +417,17 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 EXPORT_SYMBOL(dec_zone_page_state);
 #endif
 
+static inline void fold_diff(int *diff)
+{
+	int i;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (diff[i])
+			atomic_long_add(diff[i], &vm_stat[i]);
+}
+
 /*
- * Update the zone counters for one cpu.
- *
- * The cpu specified must be either the current cpu or a processor that
- * is not online. If it is the current cpu then the execution thread must
- * be pinned to the current cpu.
+ * Update the zone counters for the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
@@ -434,33 +440,29 @@ EXPORT_SYMBOL(dec_zone_page_state);
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
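
The rewrite that follows replaces the irq-save/restore dance with this_cpu_xchg(). The essential take-and-zero pattern, extracted as a sketch from the code below:

	/* atomically grab and clear this cpu's delta, then fold it globally */
	v = this_cpu_xchg(p->vm_stat_diff[i], 0);
	if (v) {
		atomic_long_add(v, &zone->vm_stat[i]);
		global_diff[i] += v;	/* summed into vm_stat[] by fold_diff() */
	}

Because the exchange is atomic with respect to this cpu, no interrupt disabling is needed: a counter update that races with the drain either lands before the xchg (and is drained) or after it (and is drained next time).
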
 */
-void refresh_cpu_vm_stats(int cpu)
+static void refresh_cpu_vm_stats(void)
 {
 	struct zone *zone;
 	int i;
 	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 
 	for_each_populated_zone(zone) {
-		struct per_cpu_pageset *p;
+		struct per_cpu_pageset __percpu *p = zone->pageset;
 
-		p = per_cpu_ptr(zone->pageset, cpu);
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+			int v;
 
-		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-			if (p->vm_stat_diff[i]) {
-				unsigned long flags;
-				int v;
+			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
+			if (v) {
 
-				local_irq_save(flags);
-				v = p->vm_stat_diff[i];
-				p->vm_stat_diff[i] = 0;
-				local_irq_restore(flags);
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_diff[i] += v;
 #ifdef CONFIG_NUMA
 				/* 3 seconds idle till flush */
-				p->expire = 3;
+				__this_cpu_write(p->expire, 3);
 #endif
 			}
+		}
 		cond_resched();
 #ifdef CONFIG_NUMA
 		/*
@@ -470,31 +472,75 @@ void refresh_cpu_vm_stats(int cpu)
 		 * Check if there are pages remaining in this pageset
 		 * if not then there is nothing to expire.
 		 */
-		if (!p->expire || !p->pcp.count)
+		if (!__this_cpu_read(p->expire) ||
+			       !__this_cpu_read(p->pcp.count))
 			continue;
 
 		/*
 		 * We never drain zones local to this processor.
 		 */
 		if (zone_to_nid(zone) == numa_node_id()) {
-			p->expire = 0;
+			__this_cpu_write(p->expire, 0);
 			continue;
 		}
 
-		p->expire--;
-		if (p->expire)
+
+		if (__this_cpu_dec_return(p->expire))
 			continue;
 
-		if (p->pcp.count)
-			drain_zone_pages(zone, &p->pcp);
+		if (__this_cpu_read(p->pcp.count))
+			drain_zone_pages(zone, __this_cpu_ptr(&p->pcp));
 #endif
 	}
+	fold_diff(global_diff);
+}
 
-	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		if (global_diff[i])
-			atomic_long_add(global_diff[i], &vm_stat[i]);
+/*
+ * Fold the data for an offline cpu into the global array.
+ * There cannot be any access by the offline cpu and therefore
+ * synchronization is simplified.
+ */
+void cpu_vm_stats_fold(int cpu)
+{
+	struct zone *zone;
+	int i;
+	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+
+	for_each_populated_zone(zone) {
+		struct per_cpu_pageset *p;
+
+		p = per_cpu_ptr(zone->pageset, cpu);
+
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+			if (p->vm_stat_diff[i]) {
+				int v;
+
+				v = p->vm_stat_diff[i];
+				p->vm_stat_diff[i] = 0;
+				atomic_long_add(v, &zone->vm_stat[i]);
+				global_diff[i] += v;
+			}
+	}
+
+	fold_diff(global_diff);
 }
 
+/*
+ * This is only called if !populated_zone(zone), which implies no other users of
+ * pset->vm_stat_diff[] exist.
+ */
+void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+{
+	int i;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (pset->vm_stat_diff[i]) {
+			int v = pset->vm_stat_diff[i];
+			pset->vm_stat_diff[i] = 0;
+			atomic_long_add(v, &zone->vm_stat[i]);
+			atomic_long_add(v, &vm_stat[i]);
+		}
+}
 #endif
 
 #ifdef CONFIG_NUMA
@@ -613,7 +659,12 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
 	"Reclaimable",
 	"Movable",
 	"Reserve",
+#ifdef CONFIG_CMA
+	"CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
 	"Isolate",
+#endif
 };
 
 static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -684,6 +735,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
 const char * const vmstat_text[] = {
 	/* Zoned VM counters */
 	"nr_free_pages",
+	"nr_alloc_batch",
 	"nr_inactive_anon",
 	"nr_active_anon",
 	"nr_inactive_file",
@@ -718,7 +770,11 @@ const char * const vmstat_text[] = {
 	"numa_local",
 	"numa_other",
 #endif
+	"workingset_refault",
+	"workingset_activate",
+	"workingset_nodereclaim",
 	"nr_anon_transparent_hugepages",
+	"nr_free_cma",
 	"nr_dirty_threshold",
 	"nr_dirty_background_threshold",
 
@@ -742,6 +798,7 @@ const char * const vmstat_text[] = {
 	TEXTS_FOR_ZONES("pgsteal_direct")
 	TEXTS_FOR_ZONES("pgscan_kswapd")
 	TEXTS_FOR_ZONES("pgscan_direct")
+	"pgscan_direct_throttle",
 
 #ifdef CONFIG_NUMA
 	"zone_reclaim_failed",
@@ -751,16 +808,29 @@ const char * const vmstat_text[] = {
 	"kswapd_inodesteal",
 	"kswapd_low_wmark_hit_quickly",
 	"kswapd_high_wmark_hit_quickly",
-	"kswapd_skip_congestion_wait",
 	"pageoutrun",
 	"allocstall",
 
 	"pgrotated",
 
+	"drop_pagecache",
+	"drop_slab",
+
+#ifdef CONFIG_NUMA_BALANCING
+	"numa_pte_updates",
+	"numa_huge_pte_updates",
+	"numa_hint_faults",
+	"numa_hint_faults_local",
+	"numa_pages_migrated",
+#endif
+#ifdef CONFIG_MIGRATION
+	"pgmigrate_success",
+	"pgmigrate_fail",
+#endif
 #ifdef CONFIG_COMPACTION
-	"compact_blocks_moved",
-	"compact_pages_moved",
-	"compact_pagemigrate_failed",
+	"compact_migrate_scanned",
+	"compact_free_scanned",
+	"compact_isolated",
 	"compact_stall",
 	"compact_fail",
 	"compact_success",
@@ -777,7 +847,6 @@ const char * const vmstat_text[] = {
 	"unevictable_pgs_munlocked",
 	"unevictable_pgs_cleared",
 	"unevictable_pgs_stranded",
-	"unevictable_pgs_mlockfreed",
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	"thp_fault_alloc",
@@ -785,7 +854,17 @@ const char * const vmstat_text[] = {
 	"thp_collapse_alloc",
 	"thp_collapse_alloc_failed",
 	"thp_split",
+	"thp_zero_page_alloc",
+	"thp_zero_page_alloc_failed",
 #endif
+#ifdef CONFIG_DEBUG_TLBFLUSH
+#ifdef CONFIG_SMP
+	"nr_tlb_remote_flush",
+	"nr_tlb_remote_flush_received",
+#endif /* CONFIG_SMP */
+	"nr_tlb_local_flush_all",
+	"nr_tlb_local_flush_one",
+#endif /* CONFIG_DEBUG_TLBFLUSH */
 
 #endif /* CONFIG_VM_EVENTS_COUNTERS */
 };
@@ -862,7 +941,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
 	int mtype;
 	unsigned long pfn;
 	unsigned long start_pfn = zone->zone_start_pfn;
-	unsigned long end_pfn = start_pfn + zone->spanned_pages;
+	unsigned long end_pfn = zone_end_pfn(zone);
 	unsigned long count[MIGRATE_TYPES] = { 0, };
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
@@ -914,7 +993,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
 	pg_data_t *pgdat = (pg_data_t *)arg;
 
 	/* check memoryless node */
-	if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
+	if (!node_state(pgdat->node_id, N_MEMORY))
 		return 0;
 
 	seq_printf(m, "Page block order: %d\n", pageblock_order);
@@ -976,14 +1055,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        high     %lu"
"\n scanned %lu" "\n spanned %lu" - "\n present %lu", + "\n present %lu" + "\n managed %lu", zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), zone->pages_scanned, zone->spanned_pages, - zone->present_pages); + zone->present_pages, + zone->managed_pages); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", vmstat_text[i], @@ -1019,7 +1100,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n all_unreclaimable: %u" "\n start_pfn: %lu" "\n inactive_ratio: %u", - zone->all_unreclaimable, + !zone_reclaimable(zone), zone->zone_start_pfn, zone->inactive_ratio); seq_putc(m, '\n'); @@ -1144,24 +1225,38 @@ int sysctl_stat_interval __read_mostly = HZ; static void vmstat_update(struct work_struct *w) { - refresh_cpu_vm_stats(smp_processor_id()); + refresh_cpu_vm_stats(); schedule_delayed_work(&__get_cpu_var(vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } -static void __cpuinit start_cpu_timer(int cpu) +static void start_cpu_timer(int cpu) { struct delayed_work *work = &per_cpu(vmstat_work, cpu); - INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); + INIT_DEFERRABLE_WORK(work, vmstat_update); schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } +static void vmstat_cpu_dead(int node) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (cpu_to_node(cpu) == node) + goto end; + + node_clear_state(node, N_CPU); +end: + put_online_cpus(); +} + /* * Use the cpu notifier to insure that the thresholds are recalculated * when necessary. */ -static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, +static int vmstat_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1186,6 +1281,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_DEAD: case CPU_DEAD_FROZEN: refresh_zone_stat_thresholds(); + vmstat_cpu_dead(cpu_to_node(cpu)); break; default: break; @@ -1193,7 +1289,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata vmstat_notifier = +static struct notifier_block vmstat_notifier = { &vmstat_cpuup_callback, NULL, 0 }; #endif @@ -1202,10 +1298,14 @@ static int __init setup_vmstat(void) #ifdef CONFIG_SMP int cpu; - register_cpu_notifier(&vmstat_notifier); + cpu_notifier_register_begin(); + __register_cpu_notifier(&vmstat_notifier); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { start_cpu_timer(cpu); + node_set_state(cpu_to_node(cpu), N_CPU); + } + cpu_notifier_register_done(); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); @@ -1220,7 +1320,6 @@ module_init(setup_vmstat) #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) #include <linux/debugfs.h> -static struct dentry *extfrag_debug_root; /* * Return an index indicating how much of the available free memory is @@ -1277,7 +1376,7 @@ static int unusable_show(struct seq_file *m, void *arg) pg_data_t *pgdat = (pg_data_t *)arg; /* check memoryless node */ - if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) + if (!node_state(pgdat->node_id, N_MEMORY)) return 0; walk_zones_in_node(m, pgdat, unusable_show_print); @@ -1358,19 +1457,24 @@ static const struct file_operations extfrag_file_ops = { static int __init extfrag_debug_init(void) { + struct dentry *extfrag_debug_root; + extfrag_debug_root = debugfs_create_dir("extfrag", NULL); if (!extfrag_debug_root) return -ENOMEM; if 
(!debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, &unusable_file_ops)) - return -ENOMEM; + goto fail; if (!debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, &extfrag_file_ops)) - return -ENOMEM; + goto fail; return 0; +fail: + debugfs_remove_recursive(extfrag_debug_root); + return -ENOMEM; } module_init(extfrag_debug_init); diff --git a/mm/workingset.c b/mm/workingset.c new file mode 100644 index 000000000000..f7216fa7da27 --- /dev/null +++ b/mm/workingset.c @@ -0,0 +1,414 @@ +/* + * Workingset detection + * + * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner + */ + +#include <linux/memcontrol.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> +#include <linux/atomic.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/fs.h> +#include <linux/mm.h> + +/* + * Double CLOCK lists + * + * Per zone, two clock lists are maintained for file pages: the + * inactive and the active list. Freshly faulted pages start out at + * the head of the inactive list and page reclaim scans pages from the + * tail. Pages that are accessed multiple times on the inactive list + * are promoted to the active list, to protect them from reclaim, + * whereas active pages are demoted to the inactive list when the + * active list grows too big. + * + * fault ------------------------+ + * | + * +--------------+ | +-------------+ + * reclaim <- | inactive | <-+-- demotion | active | <--+ + * +--------------+ +-------------+ | + * | | + * +-------------- promotion ------------------+ + * + * + * Access frequency and refault distance + * + * A workload is thrashing when its pages are frequently used but they + * are evicted from the inactive list every time before another access + * would have promoted them to the active list. + * + * In cases where the average access distance between thrashing pages + * is bigger than the size of memory there is nothing that can be + * done - the thrashing set could never fit into memory under any + * circumstance. + * + * However, the average access distance could be bigger than the + * inactive list, yet smaller than the size of memory. In this case, + * the set could fit into memory if it weren't for the currently + * active pages - which may be used more, hopefully less frequently: + * + * +-memory available to cache-+ + * | | + * +-inactive------+-active----+ + * a b | c d e f g h i | J K L M N | + * +---------------+-----------+ + * + * It is prohibitively expensive to accurately track access frequency + * of pages. But a reasonable approximation can be made to measure + * thrashing on the inactive list, after which refaulting pages can be + * activated optimistically to compete with the existing active pages. + * + * Approximating inactive page access frequency - Observations: + * + * 1. When a page is accessed for the first time, it is added to the + * head of the inactive list, slides every existing inactive page + * towards the tail by one slot, and pushes the current tail page + * out of memory. + * + * 2. When a page is accessed for the second time, it is promoted to + * the active list, shrinking the inactive list by one slot. This + * also slides all inactive pages that were faulted into the cache + * more recently than the activated page towards the tail of the + * inactive list. + * + * Thus: + * + * 1. The sum of evictions and activations between any two points in + * time indicate the minimum number of inactive pages accessed in + * between. + * + * 2. 
+ * 2. Moving one inactive page N page slots towards the tail of the
+ *    list requires at least N inactive page accesses.
+ *
+ * Combining these:
+ *
+ * 1. When a page is finally evicted from memory, the number of
+ *    inactive pages accessed while the page was in cache is at least
+ *    the number of page slots on the inactive list.
+ *
+ * 2. In addition, measuring the sum of evictions and activations (E)
+ *    at the time of a page's eviction, and comparing it to another
+ *    reading (R) at the time the page faults back into memory tells
+ *    the minimum number of accesses while the page was not cached.
+ *    This is called the refault distance.
+ *
+ * Because the first access of the page was the fault and the second
+ * access the refault, we combine the in-cache distance with the
+ * out-of-cache distance to get the complete minimum access distance
+ * of this page:
+ *
+ *      NR_inactive + (R - E)
+ *
+ * And knowing the minimum access distance of a page, we can easily
+ * tell if the page would be able to stay in cache assuming all page
+ * slots in the cache were available:
+ *
+ *   NR_inactive + (R - E) <= NR_inactive + NR_active
+ *
+ * which can be further simplified to
+ *
+ *   (R - E) <= NR_active
+ *
+ * Put into words, the refault distance (out-of-cache) can be seen as
+ * a deficit in inactive list space (in-cache).  If the inactive list
+ * had (R - E) more page slots, the page would not have been evicted
+ * in between accesses, but activated instead.  And on a full system,
+ * the only thing eating into inactive list space is active pages.
+ *
+ *
+ * Activating refaulting pages
+ *
+ * All that is known about the active list is that the pages have been
+ * accessed more than once in the past.  This means that at any given
+ * time there is actually a good chance that pages on the active list
+ * are no longer in active use.
+ *
+ * So when a refault distance of (R - E) is observed and there are at
+ * least (R - E) active pages, the refaulting page is activated
+ * optimistically in the hope that (R - E) active pages are actually
+ * used less frequently than the refaulting page - or even not used at
+ * all anymore.
+ *
+ * If this is wrong and demotion kicks in, the pages which are truly
+ * used more frequently will be reactivated while the less frequently
+ * used ones will be evicted from memory.
+ *
+ * But if this is right, the stale pages will be pushed out of memory
+ * and the used pages get to stay in cache.
+ *
+ *
+ * Implementation
+ *
+ * For each zone's file LRU lists, a counter for inactive evictions
+ * and activations is maintained (zone->inactive_age).
+ *
+ * On eviction, a snapshot of this counter (along with some bits to
+ * identify the zone) is stored in the now empty page cache radix tree
+ * slot of the evicted page.  This is called a shadow entry.
+ *
+ * On cache misses for which there are shadow entries, an eligible
+ * refault distance will immediately activate the refaulting page.
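
A worked example of the rule just derived, with made-up numbers: a page is evicted when the zone's eviction/activation counter reads E = 1000 and refaults when it reads R = 1080, giving a refault distance of 80; if NR_active_file is at least 80 pages, the page is activated on refault. As a sketch of the decision:

	/* activate iff the page could have stayed resident, were the
	 * (R - E) slots currently held by active pages available */
	refault_distance = refault - eviction;	/* R - E */
	if (refault_distance <= nr_active_file)
		activate = true;
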
+ */ + +static void *pack_shadow(unsigned long eviction, struct zone *zone) +{ + eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); + eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); + eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); + + return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); +} + +static void unpack_shadow(void *shadow, + struct zone **zone, + unsigned long *distance) +{ + unsigned long entry = (unsigned long)shadow; + unsigned long eviction; + unsigned long refault; + unsigned long mask; + int zid, nid; + + entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + zid = entry & ((1UL << ZONES_SHIFT) - 1); + entry >>= ZONES_SHIFT; + nid = entry & ((1UL << NODES_SHIFT) - 1); + entry >>= NODES_SHIFT; + eviction = entry; + + *zone = NODE_DATA(nid)->node_zones + zid; + + refault = atomic_long_read(&(*zone)->inactive_age); + mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + + RADIX_TREE_EXCEPTIONAL_SHIFT); + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. + * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + *distance = (refault - eviction) & mask; +} + +/** + * workingset_eviction - note the eviction of a page from memory + * @mapping: address space the page was backing + * @page: the page being evicted + * + * Returns a shadow entry to be stored in @mapping->page_tree in place + * of the evicted @page so that a later refault can be detected. + */ +void *workingset_eviction(struct address_space *mapping, struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long eviction; + + eviction = atomic_long_inc_return(&zone->inactive_age); + return pack_shadow(eviction, zone); +} + +/** + * workingset_refault - evaluate the refault of a previously evicted page + * @shadow: shadow entry of the evicted page + * + * Calculates and evaluates the refault distance of the previously + * evicted page in the context of the zone it was allocated in. + * + * Returns %true if the page should be activated, %false otherwise. + */ +bool workingset_refault(void *shadow) +{ + unsigned long refault_distance; + struct zone *zone; + + unpack_shadow(shadow, &zone, &refault_distance); + inc_zone_state(zone, WORKINGSET_REFAULT); + + if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + inc_zone_state(zone, WORKINGSET_ACTIVATE); + return true; + } + return false; +} + +/** + * workingset_activation - note a page activation + * @page: page that is being activated + */ +void workingset_activation(struct page *page) +{ + atomic_long_inc(&page_zone(page)->inactive_age); +} + +/* + * Shadow entries reflect the share of the working set that does not + * fit into memory, so their number depends on the access pattern of + * the workload. 
In most cases, they will refault or get reclaimed + * along with the inode, but a (malicious) workload that streams + * through files with a total size several times that of available + * memory, while preventing the inodes from being reclaimed, can + * create excessive amounts of shadow nodes. To keep a lid on this, + * track shadow nodes and reclaim them when they grow way past the + * point where they would still be useful. + */ + +struct list_lru workingset_shadow_nodes; + +static unsigned long count_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long shadow_nodes; + unsigned long max_nodes; + unsigned long pages; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); + local_irq_enable(); + + pages = node_present_pages(sc->nid); + /* + * Active cache pages are limited to 50% of memory, and shadow + * entries that represent a refault distance bigger than that + * do not have any effect. Limit the number of shadow nodes + * such that shadow entries do not exceed the number of active + * cache pages, assuming a worst-case node population density + * of 1/8th on average. + * + * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * each, this will reclaim shadow entries when they consume + * ~2% of available memory: + * + * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + */ + max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + + if (shadow_nodes <= max_nodes) + return 0; + + return shadow_nodes - max_nodes; +} + +static enum lru_status shadow_lru_isolate(struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct address_space *mapping; + struct radix_tree_node *node; + unsigned int i; + int ret; + + /* + * Page cache insertions and deletions synchronously maintain + * the shadow node LRU under the mapping->tree_lock and the + * lru_lock. Because the page cache tree is emptied before + * the inode can be destroyed, holding the lru_lock pins any + * address_space that has radix tree nodes on the LRU. + * + * We can then safely transition to the mapping->tree_lock to + * pin only the address_space of the particular node we want + * to reclaim, take the node off-LRU, and drop the lru_lock. + */ + + node = container_of(item, struct radix_tree_node, private_list); + mapping = node->private_data; + + /* Coming from the list, invert the lock order */ + if (!spin_trylock(&mapping->tree_lock)) { + spin_unlock(lru_lock); + ret = LRU_RETRY; + goto out; + } + + list_del_init(item); + spin_unlock(lru_lock); + + /* + * The nodes should only contain one or more shadow entries, + * no pages, so we expect to be able to remove them all and + * delete and free the empty node afterwards. 
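+ *
+ * (A note on the counter layout relied upon below: the low bits
+ * of node->count, covered by RADIX_TREE_COUNT_MASK, count page
+ * pointers, while shadow entries are counted in increments of
+ * 1U << RADIX_TREE_COUNT_SHIFT - which is why the BUG_ON()s
+ * that follow can assert that only shadow entries remain.)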
+ */ + + BUG_ON(!node->count); + BUG_ON(node->count & RADIX_TREE_COUNT_MASK); + + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[i]) { + BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + node->slots[i] = NULL; + BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; + BUG_ON(!mapping->nrshadows); + mapping->nrshadows--; + } + } + BUG_ON(node->count); + inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); + if (!__radix_tree_delete_node(&mapping->page_tree, node)) + BUG(); + + spin_unlock(&mapping->tree_lock); + ret = LRU_REMOVED_RETRY; +out: + local_irq_enable(); + cond_resched(); + local_irq_disable(); + spin_lock(lru_lock); + return ret; +} + +static unsigned long scan_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long ret; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, + shadow_lru_isolate, NULL, &sc->nr_to_scan); + local_irq_enable(); + return ret; +} + +static struct shrinker workingset_shadow_shrinker = { + .count_objects = count_shadow_nodes, + .scan_objects = scan_shadow_nodes, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + +/* + * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe + * mapping->tree_lock. + */ +static struct lock_class_key shadow_nodes_key; + +static int __init workingset_init(void) +{ + int ret; + + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + if (ret) + goto err; + ret = register_shrinker(&workingset_shadow_shrinker); + if (ret) + goto err_list_lru; + return 0; +err_list_lru: + list_lru_destroy(&workingset_shadow_nodes); +err: + return ret; +} +module_init(workingset_init); diff --git a/mm/zbud.c b/mm/zbud.c new file mode 100644 index 000000000000..9451361e6aa7 --- /dev/null +++ b/mm/zbud.c @@ -0,0 +1,527 @@ +/* + * zbud.c + * + * Copyright (C) 2013, Seth Jennings, IBM + * + * Concepts based on zcache internal zbud allocator by Dan Magenheimer. + * + * zbud is a special-purpose allocator for storing compressed pages. Contrary + * to what its name may suggest, zbud is not a buddy allocator, but rather an + * allocator that "buddies" two compressed pages together in a single memory + * page. + * + * While this design limits storage density, it has simple and deterministic + * reclaim properties that make it preferable to a higher density approach when + * reclaim will be used. + * + * zbud works by storing compressed pages, or "zpages", together in pairs in a + * single memory page called a "zbud page". The first buddy is "left + * justified" at the beginning of the zbud page, and the last buddy is "right + * justified" at the end of the zbud page. The benefit is that if either + * buddy is freed, the freed buddy space, coalesced with whatever slack space + * existed between the buddies, results in the largest possible free region + * within the zbud page. + * + * zbud also provides an attractive lower bound on density. The ratio of zpages + * to zbud pages cannot be less than 1. This ensures that zbud can never "do + * harm" by using more pages to store zpages than the uncompressed zpages would + * have used on their own. + * + * zbud pages are divided into "chunks". The size of the chunks is fixed at + * compile time and determined by NCHUNKS_ORDER below. 
Dividing zbud pages + * into chunks allows organizing unbuddied zbud pages into a manageable number + * of unbuddied lists according to the number of free chunks available in the + * zbud page. + * + * The zbud API differs from that of conventional allocators in that the + * allocation function, zbud_alloc(), returns an opaque handle to the user, + * not a dereferenceable pointer. The user must map the handle using + * zbud_map() in order to get a usable pointer by which to access the + * allocation data and unmap the handle with zbud_unmap() when operations + * on the allocation data are complete. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/atomic.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/zbud.h> + +/***************** + * Structures +*****************/ +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64, and there + * will be 64 freelists per pool. + */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED CHUNK_SIZE + +/** + * struct zbud_pool - stores metadata for each zbud pool + * @lock: protects all pool fields and first|last_chunk fields of any + * zbud page in the pool + * @unbuddied: array of lists tracking zbud pages that only contain one buddy; + * the list a zbud page is added to depends on the size of + * its free region. + * @buddied: list tracking the zbud pages that contain two buddies; + * these zbud pages are full + * @lru: list tracking the zbud pages in LRU order by most recently + * added buddy. + * @pages_nr: number of zbud pages in the pool. + * @ops: pointer to a structure of user defined operations specified at + * pool creation time. + * + * This structure is allocated at pool creation time and maintains metadata + * pertaining to a particular zbud pool. + */ +struct zbud_pool { + spinlock_t lock; + struct list_head unbuddied[NCHUNKS]; + struct list_head buddied; + struct list_head lru; + u64 pages_nr; + struct zbud_ops *ops; +}; + +/* + * struct zbud_header - zbud page metadata occupying the first chunk of each + * zbud page. 
+ * @buddy: links the zbud page into the unbuddied/buddied lists in the pool + * @lru: links the zbud page into the lru list in the pool + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + */ +struct zbud_header { + struct list_head buddy; + struct list_head lru; + unsigned int first_chunks; + unsigned int last_chunks; + bool under_reclaim; +}; + +/***************** + * Helpers +*****************/ +/* Just to make the code easier to read */ +enum buddy { + FIRST, + LAST +}; + +/* Converts an allocation size in bytes to size in zbud chunks */ +static int size_to_chunks(int size) +{ + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +#define for_each_unbuddied_list(_iter, _begin) \ + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) + +/* Initializes the zbud header of a newly allocated zbud page */ +static struct zbud_header *init_zbud_page(struct page *page) +{ + struct zbud_header *zhdr = page_address(page); + zhdr->first_chunks = 0; + zhdr->last_chunks = 0; + INIT_LIST_HEAD(&zhdr->buddy); + INIT_LIST_HEAD(&zhdr->lru); + zhdr->under_reclaim = 0; + return zhdr; +} + +/* Resets the struct page fields and frees the page */ +static void free_zbud_page(struct zbud_header *zhdr) +{ + __free_page(virt_to_page(zhdr)); +} + +/* + * Encodes the handle of a particular buddy within a zbud page + * Pool lock should be held as this function accesses first|last_chunks + */ +static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud) +{ + unsigned long handle; + + /* + * For now, the encoded handle is actually just the pointer to the data + * but this might not always be the case. A little information hiding. + * Add CHUNK_SIZE to the handle if it is the first allocation to jump + * over the zbud header in the first chunk. + */ + handle = (unsigned long)zhdr; + if (bud == FIRST) + /* skip over zbud header */ + handle += ZHDR_SIZE_ALIGNED; + else /* bud == LAST */ + handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + return handle; +} + +/* Returns the zbud page where a given handle is stored */ +static struct zbud_header *handle_to_zbud_header(unsigned long handle) +{ + return (struct zbud_header *)(handle & PAGE_MASK); +} + +/* Returns the number of free chunks in a zbud page */ +static int num_free_chunks(struct zbud_header *zhdr) +{ + /* + * Rather than branch for different situations, just use the fact that + * free buddies have a length of zero to simplify everything. -1 at the + * end for the zbud header. + */ + return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; +} + +/***************** + * API Functions +*****************/ +/** + * zbud_create_pool() - create a new zbud pool + * @gfp: gfp flags when allocating the zbud pool structure + * @ops: user-defined operations for the zbud pool + * + * Return: pointer to the new zbud pool or NULL if the metadata allocation + * failed. + */ +struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) +{ + struct zbud_pool *pool; + int i; + + pool = kmalloc(sizeof(struct zbud_pool), gfp); + if (!pool) + return NULL; + spin_lock_init(&pool->lock); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&pool->unbuddied[i]); + INIT_LIST_HEAD(&pool->buddied); + INIT_LIST_HEAD(&pool->lru); + pool->pages_nr = 0; + pool->ops = ops; + return pool; +} + +/** + * zbud_destroy_pool() - destroys an existing zbud pool + * @pool: the zbud pool to be destroyed + * + * The pool should be emptied before this function is called. 
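+ *
+ * A minimal create/use/destroy round trip might look like this
+ * (sketch only; the evict callback shown is a hypothetical stub):
+ *
+ *    static int my_evict(struct zbud_pool *pool, unsigned long handle)
+ *    {
+ *        return -EINVAL;    /* decline to evict anything */
+ *    }
+ *    static struct zbud_ops my_ops = { .evict = my_evict };
+ *
+ *    struct zbud_pool *pool = zbud_create_pool(GFP_KERNEL, &my_ops);
+ *    ...
+ *    zbud_destroy_pool(pool);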
+ */ +void zbud_destroy_pool(struct zbud_pool *pool) +{ + kfree(pool); +} + +/** + * zbud_alloc() - allocates a region of a given size + * @pool: zbud pool from which to allocate + * @size: size in bytes of the desired allocation + * @gfp: gfp flags used if the pool needs to grow + * @handle: handle of the new allocation + * + * This function will attempt to find a free region in the pool large enough to + * satisfy the allocation request. A search of the unbuddied lists is + * performed first. If no suitable free region is found, then a new page is + * allocated and added to the pool to satisfy the request. + * + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used + * as zbud pool pages. + * + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or + * gfp arguments are invalid, -ENOSPC if the allocation cannot fit in an + * empty zbud page, or -ENOMEM if the pool was unable to allocate a new page. + */ +int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, + unsigned long *handle) +{ + int chunks, i, freechunks; + struct zbud_header *zhdr = NULL; + enum buddy bud; + struct page *page; + + if (size <= 0 || gfp & __GFP_HIGHMEM) + return -EINVAL; + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) + return -ENOSPC; + chunks = size_to_chunks(size); + spin_lock(&pool->lock); + + /* First, try to find an unbuddied zbud page. */ + zhdr = NULL; + for_each_unbuddied_list(i, chunks) { + if (!list_empty(&pool->unbuddied[i])) { + zhdr = list_first_entry(&pool->unbuddied[i], + struct zbud_header, buddy); + list_del(&zhdr->buddy); + if (zhdr->first_chunks == 0) + bud = FIRST; + else + bud = LAST; + goto found; + } + } + + /* Couldn't find unbuddied zbud page, create new one */ + spin_unlock(&pool->lock); + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + spin_lock(&pool->lock); + pool->pages_nr++; + zhdr = init_zbud_page(page); + bud = FIRST; + +found: + if (bud == FIRST) + zhdr->first_chunks = chunks; + else + zhdr->last_chunks = chunks; + + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* Add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* Add/move zbud page to beginning of LRU */ + if (!list_empty(&zhdr->lru)) + list_del(&zhdr->lru); + list_add(&zhdr->lru, &pool->lru); + + *handle = encode_handle(zhdr, bud); + spin_unlock(&pool->lock); + + return 0; +} + +/** + * zbud_free() - frees the allocation associated with the given handle + * @pool: pool in which the allocation resided + * @handle: handle associated with the allocation returned by zbud_alloc() + * + * In the case that the zbud page in which the allocation resides is under + * reclaim, as indicated by the under_reclaim flag being set, this function + * only sets the first|last_chunks to 0. The page is actually freed + * once both buddies are evicted (see zbud_reclaim_page() below). 
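+ *
+ * Typical lifetime of an allocation, as a sketch (buf and len are
+ * hypothetical):
+ *
+ *    unsigned long handle;
+ *    if (zbud_alloc(pool, len, GFP_KERNEL, &handle) == 0) {
+ *        void *addr = zbud_map(pool, handle);
+ *        memcpy(addr, buf, len);
+ *        zbud_unmap(pool, handle);
+ *        ...
+ *        zbud_free(pool, handle);
+ *    }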
+ */ +void zbud_free(struct zbud_pool *pool, unsigned long handle) +{ + struct zbud_header *zhdr; + int freechunks; + + spin_lock(&pool->lock); + zhdr = handle_to_zbud_header(handle); + + /* If first buddy, handle will be page aligned */ + if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) + zhdr->last_chunks = 0; + else + zhdr->first_chunks = 0; + + if (zhdr->under_reclaim) { + /* zbud page is under reclaim, reclaim will free */ + spin_unlock(&pool->lock); + return; + } + + /* Remove from existing buddy list */ + list_del(&zhdr->buddy); + + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* zbud page is empty, free */ + list_del(&zhdr->lru); + free_zbud_page(zhdr); + pool->pages_nr--; + } else { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + spin_unlock(&pool->lock); +} + +#define list_tail_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +/** + * zbud_reclaim_page() - evicts allocations from a pool page and frees it + * @pool: pool from which a page will attempt to be evicted + * @retries: number of pages on the LRU list for which eviction will + * be attempted before failing + * + * zbud reclaim is different from normal system reclaim in that the reclaim is + * done from the bottom, up. This is because only the bottom layer, zbud, has + * information on how the allocations are organized within each zbud page. This + * has the potential to create interesting locking situations between zbud and + * the user, however. + * + * To avoid these, this is how zbud_reclaim_page() should be called: + * + * The user detects a page should be reclaimed and calls zbud_reclaim_page(). + * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call + * the user-defined eviction handler with the pool and handle as arguments. + * + * If the handle cannot be evicted, the eviction handler should return + * non-zero. zbud_reclaim_page() will add the zbud page back to the + * appropriate list and try the next zbud page on the LRU up to + * a user-defined number of retries. + * + * If the handle is successfully evicted, the eviction handler should + * return 0 _and_ should have called zbud_free() on the handle. zbud_free() + * contains logic to delay freeing the page if the page is under reclaim, + * as indicated by the under_reclaim flag on the zbud page. + * + * If all buddies in the zbud page are successfully evicted, then the + * zbud page can be freed. + * + * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are + * no pages to evict or an eviction handler is not registered, or -EAGAIN if + * the retry limit was hit. 
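+ *
+ * An eviction handler honoring this contract might be sketched as
+ * follows (write_back_somewhere() is hypothetical):
+ *
+ *    static int my_evict(struct zbud_pool *pool, unsigned long handle)
+ *    {
+ *        if (write_back_somewhere(pool, handle) != 0)
+ *            return -EAGAIN;    /* let reclaim try another page */
+ *        zbud_free(pool, handle);    /* required on success */
+ *        return 0;
+ *    }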
+ */ +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) +{ + int i, ret, freechunks; + struct zbud_header *zhdr; + unsigned long first_handle = 0, last_handle = 0; + + spin_lock(&pool->lock); + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || + retries == 0) { + spin_unlock(&pool->lock); + return -EINVAL; + } + for (i = 0; i < retries; i++) { + zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru); + list_del(&zhdr->lru); + list_del(&zhdr->buddy); + /* Protect zbud page against free */ + zhdr->under_reclaim = true; + /* + * We need to encode the handles before unlocking, since we can + * race with free that will set (first|last)_chunks to 0 + */ + first_handle = 0; + last_handle = 0; + if (zhdr->first_chunks) + first_handle = encode_handle(zhdr, FIRST); + if (zhdr->last_chunks) + last_handle = encode_handle(zhdr, LAST); + spin_unlock(&pool->lock); + + /* Issue the eviction callback(s) */ + if (first_handle) { + ret = pool->ops->evict(pool, first_handle); + if (ret) + goto next; + } + if (last_handle) { + ret = pool->ops->evict(pool, last_handle); + if (ret) + goto next; + } +next: + spin_lock(&pool->lock); + zhdr->under_reclaim = false; + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* + * Both buddies are now free, free the zbud page and + * return success. + */ + free_zbud_page(zhdr); + pool->pages_nr--; + spin_unlock(&pool->lock); + return 0; + } else if (zhdr->first_chunks == 0 || + zhdr->last_chunks == 0) { + /* add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* add to beginning of LRU */ + list_add(&zhdr->lru, &pool->lru); + } + spin_unlock(&pool->lock); + return -EAGAIN; +} + +/** + * zbud_map() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be mapped + * + * While trivial for zbud, the mapping functions for other allocators + * implementing this allocation API could have more complex information encoded + * in the handle and could create temporary mappings to make the data + * accessible to the user. + * + * Returns: a pointer to the mapped allocation + */ +void *zbud_map(struct zbud_pool *pool, unsigned long handle) +{ + return (void *)(handle); +} + +/** + * zbud_unmap() - unmaps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be unmapped + */ +void zbud_unmap(struct zbud_pool *pool, unsigned long handle) +{ +} + +/** + * zbud_get_pool_size() - gets the zbud pool size in pages + * @pool: pool whose size is being queried + * + * Returns: size in pages of the given pool. The pool lock need not be + * taken to access pages_nr. 
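+ *
+ * For example, a user enforcing a size cap might poll this value
+ * (sketch; limit_pages is hypothetical):
+ *
+ *    if (zbud_get_pool_size(pool) > limit_pages)
+ *        zbud_reclaim_page(pool, 8);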
+ */ +u64 zbud_get_pool_size(struct zbud_pool *pool) +{ + return pool->pages_nr; +} + +static int __init init_zbud(void) +{ + /* Make sure the zbud header will fit in one chunk */ + BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zbud(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zbud); +module_exit(exit_zbud); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>"); +MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c new file mode 100644 index 000000000000..36b4591a7a2d --- /dev/null +++ b/mm/zsmalloc.c @@ -0,0 +1,1117 @@ +/* + * zsmalloc memory allocator + * + * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim + * + * This code is released using a dual license strategy: BSD/GPL + * You can choose the license that better fits your requirements. + * + * Released under the terms of 3-clause BSD License + * Released under the terms of GNU General Public License Version 2.0 + */ + +/* + * This allocator is designed for use with zram. Thus, the allocator is + * supposed to work well under low memory conditions. In particular, it + * never attempts higher order page allocation which is very likely to + * fail under memory pressure. On the other hand, if we just use single + * (0-order) pages, it would suffer from very high fragmentation -- + * any object of size PAGE_SIZE/2 or larger would occupy an entire page. + * This was one of the major issues with its predecessor (xvmalloc). + * + * To overcome these issues, zsmalloc allocates a bunch of 0-order pages + * and links them together using various 'struct page' fields. These linked + * pages act as a single higher-order page, i.e. an object can span 0-order + * page boundaries. The code refers to these linked pages as a single entity + * called zspage. + * + * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE + * since this satisfies the requirements of all its current users (in the + * worst case, the page is incompressible and is thus stored "as-is" i.e. in + * uncompressed form). For allocation requests larger than this size, failure + * is returned (see zs_malloc). + * + * Additionally, zs_malloc() does not return a dereferenceable pointer. + * Instead, it returns an opaque handle (unsigned long) which encodes the actual + * location of the allocated object. The reason for this indirection is that + * zsmalloc does not keep zspages permanently mapped since that would cause + * issues on 32-bit systems where the VA region for kernel space mappings + * is very small. So, before using the allocated memory, the object has to + * be mapped using zs_map_object() to get a usable pointer and subsequently + * unmapped using zs_unmap_object(). + * + * Following is how we use various fields and flags of underlying + * struct page(s) to form a zspage. + * + * Usage of struct page fields: + * page->first_page: points to the first component (0-order) page + * page->index (union with page->freelist): offset of the first object + * starting in this page. For the first page, this is + * always 0, so we use this field (aka freelist) to point + * to the first free object in zspage. 
+ * page->lru: links together all component pages (except the first page) + * of a zspage + * + * For _first_ page only: + * + * page->private (union with page->first_page): refers to the + * component page after the first page + * page->freelist: points to the first free object in zspage. + * Free objects are linked together using in-place + * metadata. + * page->objects: maximum number of objects we can store in this + * zspage (class->zspage_order * PAGE_SIZE / class->size) + * page->lru: links together first pages of various zspages. + * Basically forming a list of zspages in a fullness group. + * page->mapping: class index and fullness group of the zspage + * + * Usage of struct page flags: + * PG_private: identifies the first component page + * PG_private2: identifies the last component page + * + */ + +#ifdef CONFIG_ZSMALLOC_DEBUG +#define DEBUG +#endif + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/highmem.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <asm/tlbflush.h> +#include <asm/pgtable.h> +#include <linux/cpumask.h> +#include <linux/cpu.h> +#include <linux/vmalloc.h> +#include <linux/hardirq.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/zsmalloc.h> + +/* + * This must be a power of 2 and greater than or equal to sizeof(link_free). + * These two conditions ensure that any 'struct link_free' itself doesn't + * span more than 1 page, which avoids the complex case of mapping 2 pages simply + * to restore link_free pointer values. + */ +#define ZS_ALIGN 8 + +/* + * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) + * pages. ZS_MAX_ZSPAGE_ORDER defines the upper limit on N. + */ +#define ZS_MAX_ZSPAGE_ORDER 2 +#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) + +/* + * Object location (<PFN>, <obj_idx>) is encoded as + * a single (unsigned long) handle value. + * + * Note that object index <obj_idx> is relative to system + * page <PFN> it is stored in, so for each sub-page belonging + * to a zspage, obj_idx starts with 0. + * + * This is made more complicated by various memory models and PAE. + */ + +#ifndef MAX_PHYSMEM_BITS +#ifdef CONFIG_HIGHMEM64G +#define MAX_PHYSMEM_BITS 36 +#else /* !CONFIG_HIGHMEM64G */ +/* + * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just + * be PAGE_SHIFT + */ +#define MAX_PHYSMEM_BITS BITS_PER_LONG +#endif +#endif +#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) +#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) + +#define MAX(a, b) ((a) >= (b) ? (a) : (b)) +/* ZS_MIN_ALLOC_SIZE must be a multiple of ZS_ALIGN */ +#define ZS_MIN_ALLOC_SIZE \ + MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) +#define ZS_MAX_ALLOC_SIZE PAGE_SIZE + +/* + * On systems with 4K page size, this gives 254 size classes! There is a + * trade-off here: + * - A large number of size classes is potentially wasteful as free pages are + * spread across these classes + * - A small number of size classes causes large internal fragmentation + * - Probably it's better to use specific size classes (empirically + * determined). NOTE: all those class sizes must be set as multiple of + * ZS_ALIGN to make sure link_free itself never has to span 2 pages. 
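+ *
+ * Worked example, assuming 4K pages and ZS_MIN_ALLOC_SIZE working
+ * out to 32: ZS_SIZE_CLASS_DELTA = 4096 >> 8 = 16 bytes, so class 0
+ * holds objects up to 32 bytes, class 1 up to 48 bytes, and so on
+ * in 16-byte steps up to ZS_MAX_ALLOC_SIZE = PAGE_SIZE.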
+ * + * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiples of ZS_ALIGN + * (reason above) + */ +#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) +#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ + ZS_SIZE_CLASS_DELTA + 1) + +/* + * We do not maintain any list for completely empty or full pages + */ +enum fullness_group { + ZS_ALMOST_FULL, + ZS_ALMOST_EMPTY, + _ZS_NR_FULLNESS_GROUPS, + + ZS_EMPTY, + ZS_FULL +}; + +/* + * We assign a page to ZS_ALMOST_EMPTY fullness group when: + * n <= N / f, where + * n = number of allocated objects + * N = total number of objects zspage can store + * f = 1/fullness_threshold_frac + * + * Similarly, we assign zspage to: + * ZS_ALMOST_FULL when n > N / f + * ZS_EMPTY when n == 0 + * ZS_FULL when n == N + * + * (see: fix_fullness_group()) + */ +static const int fullness_threshold_frac = 4; + +struct size_class { + /* + * Size of objects stored in this class. Must be a multiple + * of ZS_ALIGN. + */ + int size; + unsigned int index; + + /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ + int pages_per_zspage; + + spinlock_t lock; + + /* stats */ + u64 pages_allocated; + + struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; +}; + +/* + * Placed within free objects to form a singly linked list. + * For every zspage, first_page->freelist gives head of this list. + * + * This must be a power of 2 and less than or equal to ZS_ALIGN + */ +struct link_free { + /* Handle of next free chunk (encodes <PFN, obj_idx>) */ + void *next; +}; + +struct zs_pool { + struct size_class size_class[ZS_SIZE_CLASSES]; + + gfp_t flags; /* allocation flags used when growing pool */ +}; + +/* + * A zspage's class index and fullness group + * are encoded in its (first)page->mapping + */ +#define CLASS_IDX_BITS 28 +#define FULLNESS_BITS 4 +#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) +#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) + +struct mapping_area { +#ifdef CONFIG_PGTABLE_MAPPING + struct vm_struct *vm; /* vm area for mapping objects that span pages */ +#else + char *vm_buf; /* copy buffer for objects that span pages */ +#endif + char *vm_addr; /* address of kmap_atomic()'ed pages */ + enum zs_mapmode vm_mm; /* mapping mode */ +}; + + +/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ +static DEFINE_PER_CPU(struct mapping_area, zs_map_area); + +static int is_first_page(struct page *page) +{ + return PagePrivate(page); +} + +static int is_last_page(struct page *page) +{ + return PagePrivate2(page); +} + +static void get_zspage_mapping(struct page *page, unsigned int *class_idx, + enum fullness_group *fullness) +{ + unsigned long m; + BUG_ON(!is_first_page(page)); + + m = (unsigned long)page->mapping; + *fullness = m & FULLNESS_MASK; + *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; +} + +static void set_zspage_mapping(struct page *page, unsigned int class_idx, + enum fullness_group fullness) +{ + unsigned long m; + BUG_ON(!is_first_page(page)); + + m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | + (fullness & FULLNESS_MASK); + page->mapping = (struct address_space *)m; +} + +/* + * zsmalloc divides the pool into various size classes, where each + * class maintains a list of zspages and each zspage is divided + * into equal sized chunks. Each allocation falls into one of these + * classes depending on its size. This function returns the index of the + * size class whose chunk size is big enough to hold the given size. 
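+ *
+ * E.g., with the hypothetical 4K-page values above (delta 16,
+ * minimum 32), a request for 300 bytes yields
+ * idx = DIV_ROUND_UP(300 - 32, 16) = 17, and class 17 stores
+ * chunks of 32 + 17 * 16 = 304 bytes, enough to hold it.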
+ */ +static int get_size_class_index(int size) +{ + int idx = 0; + + if (likely(size > ZS_MIN_ALLOC_SIZE)) + idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, + ZS_SIZE_CLASS_DELTA); + + return idx; +} + +/* + * For each size class, zspages are divided into different groups + * depending on how "full" they are. This was done so that we could + * easily find empty or nearly empty zspages when we try to shrink + * the pool (not yet implemented). This function returns the fullness + * status of the given page. + */ +static enum fullness_group get_fullness_group(struct page *page) +{ + int inuse, max_objects; + enum fullness_group fg; + BUG_ON(!is_first_page(page)); + + inuse = page->inuse; + max_objects = page->objects; + + if (inuse == 0) + fg = ZS_EMPTY; + else if (inuse == max_objects) + fg = ZS_FULL; + else if (inuse <= max_objects / fullness_threshold_frac) + fg = ZS_ALMOST_EMPTY; + else + fg = ZS_ALMOST_FULL; + + return fg; +} + +/* + * Each size class maintains various freelists and zspages are assigned + * to one of these freelists based on the number of live objects they + * have. This function inserts the given zspage into the freelist + * identified by <class, fullness_group>. + */ +static void insert_zspage(struct page *page, struct size_class *class, + enum fullness_group fullness) +{ + struct page **head; + + BUG_ON(!is_first_page(page)); + + if (fullness >= _ZS_NR_FULLNESS_GROUPS) + return; + + head = &class->fullness_list[fullness]; + if (*head) + list_add_tail(&page->lru, &(*head)->lru); + + *head = page; +} + +/* + * This function removes the given zspage from the freelist identified + * by <class, fullness_group>. + */ +static void remove_zspage(struct page *page, struct size_class *class, + enum fullness_group fullness) +{ + struct page **head; + + BUG_ON(!is_first_page(page)); + + if (fullness >= _ZS_NR_FULLNESS_GROUPS) + return; + + head = &class->fullness_list[fullness]; + BUG_ON(!*head); + if (list_empty(&(*head)->lru)) + *head = NULL; + else if (*head == page) + *head = (struct page *)list_entry((*head)->lru.next, + struct page, lru); + + list_del_init(&page->lru); +} + +/* + * Each size class maintains zspages in different fullness groups depending + * on the number of live objects they contain. When allocating or freeing + * objects, the fullness status of the page can change, say, from ALMOST_FULL + * to ALMOST_EMPTY when freeing an object. This function checks if such + * a status change has occurred for the given page and accordingly moves the + * page from the freelist of the old fullness group to that of the new + * fullness group. + */ +static enum fullness_group fix_fullness_group(struct zs_pool *pool, + struct page *page) +{ + int class_idx; + struct size_class *class; + enum fullness_group currfg, newfg; + + BUG_ON(!is_first_page(page)); + + get_zspage_mapping(page, &class_idx, &currfg); + newfg = get_fullness_group(page); + if (newfg == currfg) + goto out; + + class = &pool->size_class[class_idx]; + remove_zspage(page, class, currfg); + insert_zspage(page, class, newfg); + set_zspage_mapping(page, class_idx, newfg); + +out: + return newfg; +} + +/* + * We have to decide on how many pages to link together + * to form a zspage for each size class. This is important + * to reduce wastage due to unusable space left at the end of + * each zspage which is given as: + * wastage = Zp % class_size + * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... 
+ * + * For example, for size class of 3/8 * PAGE_SIZE, we should + * link together 3 PAGE_SIZE sized pages to form a zspage + * since then we can perfectly fit in 8 such objects. + */ +static int get_pages_per_zspage(int class_size) +{ + int i, max_usedpc = 0; + /* zspage order which gives maximum used size per KB */ + int max_usedpc_order = 1; + + for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { + int zspage_size; + int waste, usedpc; + + zspage_size = i * PAGE_SIZE; + waste = zspage_size % class_size; + usedpc = (zspage_size - waste) * 100 / zspage_size; + + if (usedpc > max_usedpc) { + max_usedpc = usedpc; + max_usedpc_order = i; + } + } + + return max_usedpc_order; +} + +/* + * A single 'zspage' is composed of many system pages which are + * linked together using fields in struct page. This function finds + * the first/head page, given any component page of a zspage. + */ +static struct page *get_first_page(struct page *page) +{ + if (is_first_page(page)) + return page; + else + return page->first_page; +} + +static struct page *get_next_page(struct page *page) +{ + struct page *next; + + if (is_last_page(page)) + next = NULL; + else if (is_first_page(page)) + next = (struct page *)page_private(page); + else + next = list_entry(page->lru.next, struct page, lru); + + return next; +} + +/* + * Encode <page, obj_idx> as a single handle value. + * On hardware platforms with physical memory starting at 0x0 the pfn + * could be 0 so we ensure that the handle will never be 0 by adjusting the + * encoded obj_idx value before encoding. + */ +static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) +{ + unsigned long handle; + + if (!page) { + BUG_ON(obj_idx); + return NULL; + } + + handle = page_to_pfn(page) << OBJ_INDEX_BITS; + handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); + + return (void *)handle; +} + +/* + * Decode <page, obj_idx> pair from the given object handle. We adjust the + * decoded obj_idx back to its original value since it was adjusted in + * obj_location_to_handle(). 
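+ *
+ * E.g. (hypothetical values): a page with pfn 0x1234 and obj_idx 5
+ * encodes to (0x1234 << OBJ_INDEX_BITS) | 6; decoding shifts the
+ * pfn back out and subtracts 1 to recover obj_idx = 5. The +1/-1
+ * adjustment keeps a valid handle from ever being 0, even for
+ * pfn 0, obj_idx 0.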
+ */ +static void obj_handle_to_location(unsigned long handle, struct page **page, + unsigned long *obj_idx) +{ + *page = pfn_to_page(handle >> OBJ_INDEX_BITS); + *obj_idx = (handle & OBJ_INDEX_MASK) - 1; +} + +static unsigned long obj_idx_to_offset(struct page *page, + unsigned long obj_idx, int class_size) +{ + unsigned long off = 0; + + if (!is_first_page(page)) + off = page->index; + + return off + obj_idx * class_size; +} + +static void reset_page(struct page *page) +{ + clear_bit(PG_private, &page->flags); + clear_bit(PG_private_2, &page->flags); + set_page_private(page, 0); + page->mapping = NULL; + page->freelist = NULL; + page_mapcount_reset(page); +} + +static void free_zspage(struct page *first_page) +{ + struct page *nextp, *tmp, *head_extra; + + BUG_ON(!is_first_page(first_page)); + BUG_ON(first_page->inuse); + + head_extra = (struct page *)page_private(first_page); + + reset_page(first_page); + __free_page(first_page); + + /* zspage with only 1 system page */ + if (!head_extra) + return; + + list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { + list_del(&nextp->lru); + reset_page(nextp); + __free_page(nextp); + } + reset_page(head_extra); + __free_page(head_extra); +} + +/* Initialize a newly allocated zspage */ +static void init_zspage(struct page *first_page, struct size_class *class) +{ + unsigned long off = 0; + struct page *page = first_page; + + BUG_ON(!is_first_page(first_page)); + while (page) { + struct page *next_page; + struct link_free *link; + unsigned int i, objs_on_page; + + /* + * page->index stores offset of first object starting + * in the page. For the first page, this is always 0, + * so we use first_page->index (aka ->freelist) to store + * head of corresponding zspage's freelist. + */ + if (page != first_page) + page->index = off; + + link = (struct link_free *)kmap_atomic(page) + + off / sizeof(*link); + objs_on_page = (PAGE_SIZE - off) / class->size; + + for (i = 1; i <= objs_on_page; i++) { + off += class->size; + if (off < PAGE_SIZE) { + link->next = obj_location_to_handle(page, i); + link += class->size / sizeof(*link); + } + } + + /* + * We now come to the last (full or partial) object on this + * page, which must point to the first object on the next + * page (if present) + */ + next_page = get_next_page(page); + link->next = obj_location_to_handle(next_page, 0); + kunmap_atomic(link); + page = next_page; + off = (off + class->size) % PAGE_SIZE; + } +} + +/* + * Allocate a zspage for the given size class + */ +static struct page *alloc_zspage(struct size_class *class, gfp_t flags) +{ + int i, error; + struct page *first_page = NULL, *uninitialized_var(prev_page); + + /* + * Allocate individual pages and link them together as: + * 1. first page->private = first sub-page + * 2. all sub-pages are linked together using page->lru + * 3. each sub-page is linked to the first page using page->first_page + * + * For each size class, First/Head pages are linked together using + * page->lru. Also, we set PG_private to identify the first page + * (i.e. no other sub-page has this flag set) and PG_private_2 to + * identify the last page. 
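+ *
+ * For a hypothetical 3-page zspage this works out to:
+ *    page0: PG_private set, page_private() points to page1,
+ *           head of the zspage
+ *    page1: first_page = page0, linked to page2 via lru
+ *    page2: first_page = page0, PG_private_2 set (last page)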
+ */ + error = -ENOMEM; + for (i = 0; i < class->pages_per_zspage; i++) { + struct page *page; + + page = alloc_page(flags); + if (!page) + goto cleanup; + + INIT_LIST_HEAD(&page->lru); + if (i == 0) { /* first page */ + SetPagePrivate(page); + set_page_private(page, 0); + first_page = page; + first_page->inuse = 0; + } + if (i == 1) + set_page_private(first_page, (unsigned long)page); + if (i >= 1) + page->first_page = first_page; + if (i >= 2) + list_add(&page->lru, &prev_page->lru); + if (i == class->pages_per_zspage - 1) /* last page */ + SetPagePrivate2(page); + prev_page = page; + } + + init_zspage(first_page, class); + + first_page->freelist = obj_location_to_handle(first_page, 0); + /* Maximum number of objects we can store in this zspage */ + first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; + + error = 0; /* Success */ + +cleanup: + if (unlikely(error) && first_page) { + free_zspage(first_page); + first_page = NULL; + } + + return first_page; +} + +static struct page *find_get_zspage(struct size_class *class) +{ + int i; + struct page *page; + + for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { + page = class->fullness_list[i]; + if (page) + break; + } + + return page; +} + +#ifdef CONFIG_PGTABLE_MAPPING +static inline int __zs_cpu_up(struct mapping_area *area) +{ + /* + * Make sure we don't leak memory if a cpu UP notification + * and zs_init() race and both call zs_cpu_up() on the same cpu + */ + if (area->vm) + return 0; + area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL); + if (!area->vm) + return -ENOMEM; + return 0; +} + +static inline void __zs_cpu_down(struct mapping_area *area) +{ + if (area->vm) + free_vm_area(area->vm); + area->vm = NULL; +} + +static inline void *__zs_map_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); + area->vm_addr = area->vm->addr; + return area->vm_addr + off; +} + +static inline void __zs_unmap_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + unsigned long addr = (unsigned long)area->vm_addr; + + unmap_kernel_range(addr, PAGE_SIZE * 2); +} + +#else /* CONFIG_PGTABLE_MAPPING */ + +static inline int __zs_cpu_up(struct mapping_area *area) +{ + /* + * Make sure we don't leak memory if a cpu UP notification + * and zs_init() race and both call zs_cpu_up() on the same cpu + */ + if (area->vm_buf) + return 0; + area->vm_buf = (char *)__get_free_page(GFP_KERNEL); + if (!area->vm_buf) + return -ENOMEM; + return 0; +} + +static inline void __zs_cpu_down(struct mapping_area *area) +{ + if (area->vm_buf) + free_page((unsigned long)area->vm_buf); + area->vm_buf = NULL; +} + +static void *__zs_map_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + int sizes[2]; + void *addr; + char *buf = area->vm_buf; + + /* disable page faults to match kmap_atomic() return conditions */ + pagefault_disable(); + + /* no read fastpath */ + if (area->vm_mm == ZS_MM_WO) + goto out; + + sizes[0] = PAGE_SIZE - off; + sizes[1] = size - sizes[0]; + + /* copy object to per-cpu buffer */ + addr = kmap_atomic(pages[0]); + memcpy(buf, addr + off, sizes[0]); + kunmap_atomic(addr); + addr = kmap_atomic(pages[1]); + memcpy(buf + sizes[0], addr, sizes[1]); + kunmap_atomic(addr); +out: + return area->vm_buf; +} + +static void __zs_unmap_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + int sizes[2]; + void *addr; + char *buf = area->vm_buf; + + /* no write fastpath */ + if (area->vm_mm == 
ZS_MM_RO) + goto out; + + sizes[0] = PAGE_SIZE - off; + sizes[1] = size - sizes[0]; + + /* copy per-cpu buffer to object */ + addr = kmap_atomic(pages[0]); + memcpy(addr + off, buf, sizes[0]); + kunmap_atomic(addr); + addr = kmap_atomic(pages[1]); + memcpy(addr, buf + sizes[0], sizes[1]); + kunmap_atomic(addr); + +out: + /* enable page faults to match kunmap_atomic() return conditions */ + pagefault_enable(); +} + +#endif /* CONFIG_PGTABLE_MAPPING */ + +static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action, + void *pcpu) +{ + int ret, cpu = (long)pcpu; + struct mapping_area *area; + + switch (action) { + case CPU_UP_PREPARE: + area = &per_cpu(zs_map_area, cpu); + ret = __zs_cpu_up(area); + if (ret) + return notifier_from_errno(ret); + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + area = &per_cpu(zs_map_area, cpu); + __zs_cpu_down(area); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block zs_cpu_nb = { + .notifier_call = zs_cpu_notifier +}; + +static void zs_exit(void) +{ + int cpu; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) + zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); + __unregister_cpu_notifier(&zs_cpu_nb); + + cpu_notifier_register_done(); +} + +static int zs_init(void) +{ + int cpu, ret; + + cpu_notifier_register_begin(); + + __register_cpu_notifier(&zs_cpu_nb); + for_each_online_cpu(cpu) { + ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + if (notifier_to_errno(ret)) { + cpu_notifier_register_done(); + goto fail; + } + } + + cpu_notifier_register_done(); + + return 0; +fail: + zs_exit(); + return notifier_to_errno(ret); +} + +/** + * zs_create_pool - Creates an allocation pool to work from. + * @flags: allocation flags used to allocate pool metadata + * + * This function must be called before anything when using + * the zsmalloc allocator. + * + * On success, a pointer to the newly created pool is returned, + * otherwise NULL. + */ +struct zs_pool *zs_create_pool(gfp_t flags) +{ + int i, ovhd_size; + struct zs_pool *pool; + + ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); + pool = kzalloc(ovhd_size, GFP_KERNEL); + if (!pool) + return NULL; + + for (i = 0; i < ZS_SIZE_CLASSES; i++) { + int size; + struct size_class *class; + + size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; + if (size > ZS_MAX_ALLOC_SIZE) + size = ZS_MAX_ALLOC_SIZE; + + class = &pool->size_class[i]; + class->size = size; + class->index = i; + spin_lock_init(&class->lock); + class->pages_per_zspage = get_pages_per_zspage(size); + + } + + pool->flags = flags; + + return pool; +} +EXPORT_SYMBOL_GPL(zs_create_pool); + +void zs_destroy_pool(struct zs_pool *pool) +{ + int i; + + for (i = 0; i < ZS_SIZE_CLASSES; i++) { + int fg; + struct size_class *class = &pool->size_class[i]; + + for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { + if (class->fullness_list[fg]) { + pr_info("Freeing non-empty class with size %db, fullness group %d\n", + class->size, fg); + } + } + } + kfree(pool); +} +EXPORT_SYMBOL_GPL(zs_destroy_pool); + +/** + * zs_malloc - Allocate block of given size from pool. + * @pool: pool to allocate from + * @size: size of block to allocate + * + * On success, handle to the allocated object is returned, + * otherwise 0. + * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 
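+ *
+ * A typical round trip, as a sketch (src and len are hypothetical):
+ *
+ *    unsigned long handle = zs_malloc(pool, len);
+ *    if (handle) {
+ *        void *dst = zs_map_object(pool, handle, ZS_MM_WO);
+ *        memcpy(dst, src, len);
+ *        zs_unmap_object(pool, handle);
+ *    }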
+ */ +unsigned long zs_malloc(struct zs_pool *pool, size_t size) +{ + unsigned long obj; + struct link_free *link; + int class_idx; + struct size_class *class; + + struct page *first_page, *m_page; + unsigned long m_objidx, m_offset; + + if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) + return 0; + + class_idx = get_size_class_index(size); + class = &pool->size_class[class_idx]; + BUG_ON(class_idx != class->index); + + spin_lock(&class->lock); + first_page = find_get_zspage(class); + + if (!first_page) { + spin_unlock(&class->lock); + first_page = alloc_zspage(class, pool->flags); + if (unlikely(!first_page)) + return 0; + + set_zspage_mapping(first_page, class->index, ZS_EMPTY); + spin_lock(&class->lock); + class->pages_allocated += class->pages_per_zspage; + } + + obj = (unsigned long)first_page->freelist; + obj_handle_to_location(obj, &m_page, &m_objidx); + m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); + + link = (struct link_free *)kmap_atomic(m_page) + + m_offset / sizeof(*link); + first_page->freelist = link->next; + memset(link, POISON_INUSE, sizeof(*link)); + kunmap_atomic(link); + + first_page->inuse++; + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(pool, first_page); + spin_unlock(&class->lock); + + return obj; +} +EXPORT_SYMBOL_GPL(zs_malloc); + +void zs_free(struct zs_pool *pool, unsigned long obj) +{ + struct link_free *link; + struct page *first_page, *f_page; + unsigned long f_objidx, f_offset; + + int class_idx; + struct size_class *class; + enum fullness_group fullness; + + if (unlikely(!obj)) + return; + + obj_handle_to_location(obj, &f_page, &f_objidx); + first_page = get_first_page(f_page); + + get_zspage_mapping(first_page, &class_idx, &fullness); + class = &pool->size_class[class_idx]; + f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); + + spin_lock(&class->lock); + + /* Insert this object into the containing zspage's freelist */ + link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) + + f_offset); + link->next = first_page->freelist; + kunmap_atomic(link); + first_page->freelist = (void *)obj; + + first_page->inuse--; + fullness = fix_fullness_group(pool, first_page); + + if (fullness == ZS_EMPTY) + class->pages_allocated -= class->pages_per_zspage; + + spin_unlock(&class->lock); + + if (fullness == ZS_EMPTY) + free_zspage(first_page); +} +EXPORT_SYMBOL_GPL(zs_free); + +/** + * zs_map_object - get address of allocated object from handle. + * @pool: pool from which the object was allocated + * @handle: handle returned from zs_malloc + * + * Before using an object allocated from zs_malloc, it must be mapped using + * this function. When done with the object, it must be unmapped using + * zs_unmap_object. + * + * Only one object can be mapped per cpu at a time. There is no protection + * against nested mappings. + * + * This function returns with preemption and page faults disabled. + */ +void *zs_map_object(struct zs_pool *pool, unsigned long handle, + enum zs_mapmode mm) +{ + struct page *page; + unsigned long obj_idx, off; + + unsigned int class_idx; + enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + struct page *pages[2]; + + BUG_ON(!handle); + + /* + * Because we use per-cpu mapping areas shared among the + * pools/users, we can't allow mapping in interrupt context + * because it can corrupt another user's mappings. 
+ */ + BUG_ON(in_interrupt()); + + obj_handle_to_location(handle, &page, &obj_idx); + get_zspage_mapping(get_first_page(page), &class_idx, &fg); + class = &pool->size_class[class_idx]; + off = obj_idx_to_offset(page, obj_idx, class->size); + + area = &get_cpu_var(zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ + area->vm_addr = kmap_atomic(page); + return area->vm_addr + off; + } + + /* this object spans two pages */ + pages[0] = page; + pages[1] = get_next_page(page); + BUG_ON(!pages[1]); + + return __zs_map_object(area, pages, off, class->size); +} +EXPORT_SYMBOL_GPL(zs_map_object); + +void zs_unmap_object(struct zs_pool *pool, unsigned long handle) +{ + struct page *page; + unsigned long obj_idx, off; + + unsigned int class_idx; + enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + + BUG_ON(!handle); + + obj_handle_to_location(handle, &page, &obj_idx); + get_zspage_mapping(get_first_page(page), &class_idx, &fg); + class = &pool->size_class[class_idx]; + off = obj_idx_to_offset(page, obj_idx, class->size); + + area = &__get_cpu_var(zs_map_area); + if (off + class->size <= PAGE_SIZE) + kunmap_atomic(area->vm_addr); + else { + struct page *pages[2]; + + pages[0] = page; + pages[1] = get_next_page(page); + BUG_ON(!pages[1]); + + __zs_unmap_object(area, pages, off, class->size); + } + put_cpu_var(zs_map_area); +} +EXPORT_SYMBOL_GPL(zs_unmap_object); + +u64 zs_get_total_size_bytes(struct zs_pool *pool) +{ + int i; + u64 npages = 0; + + for (i = 0; i < ZS_SIZE_CLASSES; i++) + npages += pool->size_class[i].pages_allocated; + + return npages << PAGE_SHIFT; +} +EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); + +module_init(zs_init); +module_exit(zs_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>"); diff --git a/mm/zswap.c b/mm/zswap.c new file mode 100644 index 000000000000..aeaef0fb5624 --- /dev/null +++ b/mm/zswap.c @@ -0,0 +1,940 @@ +/* + * zswap.c - zswap driver file + * + * zswap is a backend for frontswap that takes pages that are in the process + * of being swapped out and attempts to compress and store them in a + * RAM-based memory pool. This can result in a significant I/O reduction on + * the swap device and, in the case where decompressing from RAM is faster + * than reading from the swap device, can also improve workload performance. + * + * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/frontswap.h> +#include <linux/rbtree.h> +#include <linux/swap.h> +#include <linux/crypto.h> +#include <linux/mempool.h> +#include <linux/zbud.h> + +#include <linux/mm_types.h> +#include <linux/page-flags.h> +#include <linux/swapops.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> + +/********************************* +* statistics +**********************************/ +/* Number of memory pages used by the compressed pool */ +static u64 zswap_pool_pages; +/* The number of compressed pages currently stored in zswap */ +static atomic_t zswap_stored_pages = ATOMIC_INIT(0); + +/* + * The statistics below are not protected from concurrent access for + * performance reasons so they may not be 100% accurate. However, + * they do provide useful information on roughly how many times a + * certain event is occurring. +*/ + +/* Pool limit was hit (see zswap_max_pool_percent) */ +static u64 zswap_pool_limit_hit; +/* Pages written back when pool limit was reached */ +static u64 zswap_written_back_pages; +/* Store failed due to a reclaim failure after pool limit was reached */ +static u64 zswap_reject_reclaim_fail; +/* Compressed page was too big for the allocator to (optimally) store */ +static u64 zswap_reject_compress_poor; +/* Store failed because underlying allocator could not get memory */ +static u64 zswap_reject_alloc_fail; +/* Store failed because the entry metadata could not be allocated (rare) */ +static u64 zswap_reject_kmemcache_fail; +/* Duplicate store was encountered (rare) */ +static u64 zswap_duplicate_entry; + +/********************************* +* tunables +**********************************/ +/* Enable/disable zswap (disabled by default, fixed at boot for now) */ +static bool zswap_enabled __read_mostly; +module_param_named(enabled, zswap_enabled, bool, 0444); + +/* Compressor to be used by zswap (fixed at boot for now) */ +#define ZSWAP_COMPRESSOR_DEFAULT "lzo" +static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; +module_param_named(compressor, zswap_compressor, charp, 0444); + +/* The maximum percentage of memory that the compressed pool can occupy */ +static unsigned int zswap_max_pool_percent = 20; +module_param_named(max_pool_percent, + zswap_max_pool_percent, uint, 0644); + +/* zbud_pool is shared by all of the zswap backend */ +static struct zbud_pool *zswap_pool; + +/********************************* +* compression functions +**********************************/ +/* per-cpu compression transforms */ +static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms; + +enum comp_op { + ZSWAP_COMPOP_COMPRESS, + ZSWAP_COMPOP_DECOMPRESS +}; + +static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + struct crypto_comp *tfm; + int ret; + + tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu()); + switch (op) { + case ZSWAP_COMPOP_COMPRESS: + ret = crypto_comp_compress(tfm, src, slen, dst, dlen); + break; + case ZSWAP_COMPOP_DECOMPRESS: + ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); + break; + default: + ret = -EINVAL; + } + + put_cpu(); + return ret; +} + +static int __init zswap_comp_init(void) +{ + if (!crypto_has_comp(zswap_compressor, 0, 0)) { + pr_info("%s compressor not available\n", zswap_compressor); + /* fall back to default compressor */ + 
+		zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+		if (!crypto_has_comp(zswap_compressor, 0, 0))
+			/* can't even load the default compressor */
+			return -ENODEV;
+	}
+	pr_info("using %s compressor\n", zswap_compressor);
+
+	/* alloc percpu transforms */
+	zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
+	if (!zswap_comp_pcpu_tfms)
+		return -ENOMEM;
+	return 0;
+}
+
+static void zswap_comp_exit(void)
+{
+	/* free percpu transforms */
+	if (zswap_comp_pcpu_tfms)
+		free_percpu(zswap_comp_pcpu_tfms);
+}
+
+/*********************************
+* data structures
+**********************************/
+/*
+ * struct zswap_entry
+ *
+ * This structure contains the metadata for tracking a single compressed
+ * page within zswap.
+ *
+ * rbnode - links the entry into the red-black tree for the appropriate
+ *          swap type
+ * refcount - the number of outstanding references to the entry. This is
+ *            needed to protect against premature freeing of the entry by
+ *            concurrent calls to load, invalidate, and writeback. The lock
+ *            for the zswap_tree structure that contains the entry must
+ *            be held while changing the refcount. Since the lock must
+ *            be held, there is no reason to also make refcount atomic.
+ * offset - the swap offset for the entry. Index into the red-black tree.
+ * handle - zbud allocation handle that stores the compressed page data
+ * length - the length in bytes of the compressed page data. Needed during
+ *          decompression
+ */
+struct zswap_entry {
+	struct rb_node rbnode;
+	pgoff_t offset;
+	int refcount;
+	unsigned int length;
+	unsigned long handle;
+};
+
+struct zswap_header {
+	swp_entry_t swpentry;
+};
+
+/*
+ * The tree lock in the zswap_tree struct protects a few things:
+ * - the rbtree
+ * - the refcount field of each entry in the tree
+ */
+struct zswap_tree {
+	struct rb_root rbroot;
+	spinlock_t lock;
+};
+
+static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+
+/*********************************
+* zswap entry functions
+**********************************/
+static struct kmem_cache *zswap_entry_cache;
+
+static int zswap_entry_cache_create(void)
+{
+	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
+	return zswap_entry_cache == NULL;
+}
+
+static void zswap_entry_cache_destroy(void)
+{
+	kmem_cache_destroy(zswap_entry_cache);
+}
+
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
+{
+	struct zswap_entry *entry;
+	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
+	if (!entry)
+		return NULL;
+	entry->refcount = 1;
+	RB_CLEAR_NODE(&entry->rbnode);
+	return entry;
+}
+
+static void zswap_entry_cache_free(struct zswap_entry *entry)
+{
+	kmem_cache_free(zswap_entry_cache, entry);
+}
+
+/*********************************
+* rbtree functions
+**********************************/
+static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
+{
+	struct rb_node *node = root->rb_node;
+	struct zswap_entry *entry;
+
+	while (node) {
+		entry = rb_entry(node, struct zswap_entry, rbnode);
+		if (entry->offset > offset)
+			node = node->rb_left;
+		else if (entry->offset < offset)
+			node = node->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * In the case that an entry with the same offset is found, a pointer to
+ * the existing entry is stored in dupentry and the function returns -EEXIST
+ */
+static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
+			struct zswap_entry **dupentry)
+{
+	struct rb_node **link = &root->rb_node, *parent = NULL;
+	struct zswap_entry *myentry;
+
+	while (*link) {
+		parent = *link;
+		myentry = rb_entry(parent, struct zswap_entry, rbnode);
+		if (myentry->offset > entry->offset)
+			link = &(*link)->rb_left;
+		else if (myentry->offset < entry->offset)
+			link = &(*link)->rb_right;
+		else {
+			*dupentry = myentry;
+			return -EEXIST;
+		}
+	}
+	rb_link_node(&entry->rbnode, parent, link);
+	rb_insert_color(&entry->rbnode, root);
+	return 0;
+}
+
+static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+{
+	if (!RB_EMPTY_NODE(&entry->rbnode)) {
+		rb_erase(&entry->rbnode, root);
+		RB_CLEAR_NODE(&entry->rbnode);
+	}
+}
+
+/*
+ * Carries out the common pattern of freeing an entry's zbud allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ */
+static void zswap_free_entry(struct zswap_entry *entry)
+{
+	zbud_free(zswap_pool, entry->handle);
+	zswap_entry_cache_free(entry);
+	atomic_dec(&zswap_stored_pages);
+	zswap_pool_pages = zbud_get_pool_size(zswap_pool);
+}
+
+/* caller must hold the tree lock */
+static void zswap_entry_get(struct zswap_entry *entry)
+{
+	entry->refcount++;
+}
+
+/*
+ * caller must hold the tree lock
+ * remove from the tree and free it, if nobody references the entry
+ */
+static void zswap_entry_put(struct zswap_tree *tree,
+			struct zswap_entry *entry)
+{
+	int refcount = --entry->refcount;
+
+	BUG_ON(refcount < 0);
+	if (refcount == 0) {
+		zswap_rb_erase(&tree->rbroot, entry);
+		zswap_free_entry(entry);
+	}
+}
+
+/* caller must hold the tree lock */
+static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
+				pgoff_t offset)
+{
+	struct zswap_entry *entry = NULL;
+
+	entry = zswap_rb_search(root, offset);
+	if (entry)
+		zswap_entry_get(entry);
+
+	return entry;
+}
+
+/*********************************
+* per-cpu code
+**********************************/
+static DEFINE_PER_CPU(u8 *, zswap_dstmem);
+
+static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
+{
+	struct crypto_comp *tfm;
+	u8 *dst;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
+		if (IS_ERR(tfm)) {
+			pr_err("can't allocate compressor transform\n");
+			return NOTIFY_BAD;
+		}
+		*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
+		dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
+		if (!dst) {
+			pr_err("can't allocate compressor buffer\n");
+			crypto_free_comp(tfm);
+			*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
+			return NOTIFY_BAD;
+		}
+		per_cpu(zswap_dstmem, cpu) = dst;
+		break;
+	case CPU_DEAD:
+	case CPU_UP_CANCELED:
+		tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
+		if (tfm) {
+			crypto_free_comp(tfm);
+			*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
+		}
+		dst = per_cpu(zswap_dstmem, cpu);
+		kfree(dst);
+		per_cpu(zswap_dstmem, cpu) = NULL;
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static int zswap_cpu_notifier(struct notifier_block *nb,
+				unsigned long action, void *pcpu)
+{
+	unsigned long cpu = (unsigned long)pcpu;
+	return __zswap_cpu_notifier(action, cpu);
+}
+
+static struct notifier_block zswap_cpu_notifier_block = {
+	.notifier_call = zswap_cpu_notifier
+};
+
+static int zswap_cpu_init(void)
+{
+	unsigned long cpu;
+
+	cpu_notifier_register_begin();
+	for_each_online_cpu(cpu)
+		if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
+			goto cleanup;
+	__register_cpu_notifier(&zswap_cpu_notifier_block);
+	cpu_notifier_register_done();
+	return 0;
+
+cleanup:
+	for_each_online_cpu(cpu)
+		__zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
+	cpu_notifier_register_done();
+	return -ENOMEM;
+}
+
+/*********************************
+* helpers
+**********************************/
+static bool zswap_is_full(void)
+{
+	return totalram_pages * zswap_max_pool_percent / 100 <
+		zswap_pool_pages;
+}
+
+/*********************************
+* writeback code
+**********************************/
+/* return enum for zswap_get_swap_cache_page */
+enum zswap_get_swap_ret {
+	ZSWAP_SWAPCACHE_NEW,
+	ZSWAP_SWAPCACHE_EXIST,
+	ZSWAP_SWAPCACHE_FAIL,
+};
+
+/*
+ * zswap_get_swap_cache_page
+ *
+ * This is an adaptation of read_swap_cache_async()
+ *
+ * This function tries to find a page with the given swap entry
+ * in the swapper_space address space (the swap cache). If the page
+ * is found, it is returned in retpage. Otherwise, a page is allocated,
+ * added to the swap cache, and returned in retpage.
+ *
+ * On success, the swap cache page is returned in retpage.
+ * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
+ * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
+ * the new page is added to swapcache and locked
+ * Returns ZSWAP_SWAPCACHE_FAIL on error
+ */
+static int zswap_get_swap_cache_page(swp_entry_t entry,
+				struct page **retpage)
+{
+	struct page *found_page, *new_page = NULL;
+	struct address_space *swapper_space = swap_address_space(entry);
+	int err;
+
+	*retpage = NULL;
+	do {
+		/*
+		 * First check the swap cache. Since this is normally
+		 * called after lookup_swap_cache() failed, re-calling
+		 * that would confuse statistics.
+		 */
+		found_page = find_get_page(swapper_space, entry.val);
+		if (found_page)
+			break;
+
+		/*
+		 * Get a new page to read into from swap.
+		 */
+		if (!new_page) {
+			new_page = alloc_page(GFP_KERNEL);
+			if (!new_page)
+				break; /* Out of memory */
+		}
+
+		/*
+		 * call radix_tree_preload() while we can wait.
+		 */
+		err = radix_tree_preload(GFP_KERNEL);
+		if (err)
+			break;
+
+		/*
+		 * Swap entry may have been freed since our caller observed it.
+		 */
+		err = swapcache_prepare(entry);
+		if (err == -EEXIST) { /* seems racy */
+			radix_tree_preload_end();
+			continue;
+		}
+		if (err) { /* swp entry is obsolete ? */
+			radix_tree_preload_end();
+			break;
+		}
+
+		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
+		__set_page_locked(new_page);
+		SetPageSwapBacked(new_page);
+		err = __add_to_swap_cache(new_page, entry);
+		if (likely(!err)) {
+			radix_tree_preload_end();
+			lru_cache_add_anon(new_page);
+			*retpage = new_page;
+			return ZSWAP_SWAPCACHE_NEW;
+		}
+		radix_tree_preload_end();
+		ClearPageSwapBacked(new_page);
+		__clear_page_locked(new_page);
+		/*
+		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
+		 * clear SWAP_HAS_CACHE flag.
+		 */
+		swapcache_free(entry, NULL);
+	} while (err != -ENOMEM);
+
+	if (new_page)
+		page_cache_release(new_page);
+	if (!found_page)
+		return ZSWAP_SWAPCACHE_FAIL;
+	*retpage = found_page;
+	return ZSWAP_SWAPCACHE_EXIST;
+}
+
+/*
+ * Attempts to free an entry by adding a page to the swap cache,
+ * decompressing the entry data into the page, and issuing a
+ * bio write to write the page back to the swap device.
+ *
+ * This can be thought of as a "resumed writeback" of the page
+ * to the swap device. We are basically resuming the same swap
+ * writeback path that was intercepted with the frontswap_store()
+ * in the first place. After the page has been decompressed into
+ * the swap cache, the compressed version stored by zswap can be
+ * freed.
+ */
+static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
+{
+	struct zswap_header *zhdr;
+	swp_entry_t swpentry;
+	struct zswap_tree *tree;
+	pgoff_t offset;
+	struct zswap_entry *entry;
+	struct page *page;
+	u8 *src, *dst;
+	unsigned int dlen;
+	int ret;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+	};
+
+	/* extract swpentry from data */
+	zhdr = zbud_map(pool, handle);
+	swpentry = zhdr->swpentry; /* here */
+	zbud_unmap(pool, handle);
+	tree = zswap_trees[swp_type(swpentry)];
+	offset = swp_offset(swpentry);
+
+	/* find and ref zswap entry */
+	spin_lock(&tree->lock);
+	entry = zswap_entry_find_get(&tree->rbroot, offset);
+	if (!entry) {
+		/* entry was invalidated */
+		spin_unlock(&tree->lock);
+		return 0;
+	}
+	spin_unlock(&tree->lock);
+	BUG_ON(offset != entry->offset);
+
+	/* try to allocate swap cache page */
+	switch (zswap_get_swap_cache_page(swpentry, &page)) {
+	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
+		ret = -ENOMEM;
+		goto fail;
+
+	case ZSWAP_SWAPCACHE_EXIST:
+		/* page is already in the swap cache, ignore for now */
+		page_cache_release(page);
+		ret = -EEXIST;
+		goto fail;
+
+	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
+		/* decompress */
+		dlen = PAGE_SIZE;
+		src = (u8 *)zbud_map(zswap_pool, entry->handle) +
+			sizeof(struct zswap_header);
+		dst = kmap_atomic(page);
+		ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
+				entry->length, dst, &dlen);
+		kunmap_atomic(dst);
+		zbud_unmap(zswap_pool, entry->handle);
+		BUG_ON(ret);
+		BUG_ON(dlen != PAGE_SIZE);
+
+		/* page is up to date */
+		SetPageUptodate(page);
+	}
+
+	/* move it to the tail of the inactive list after end_writeback */
+	SetPageReclaim(page);
+
+	/* start writeback */
+	__swap_writepage(page, &wbc, end_swap_bio_write);
+	page_cache_release(page);
+	zswap_written_back_pages++;
+
+	spin_lock(&tree->lock);
+	/* drop local reference */
+	zswap_entry_put(tree, entry);
+
+	/*
+	 * There are two possible situations for the entry here:
+	 * (1) refcount is 1 (normal case): the entry is valid and on the tree
+	 * (2) refcount is 0: the entry was freed and removed from the tree
+	 *     because an invalidate happened during writeback.
+	 * Search the tree; if the entry is still there, drop the tree's
+	 * reference and free it.
+	 */
+	if (entry == zswap_rb_search(&tree->rbroot, offset))
+		zswap_entry_put(tree, entry);
+	spin_unlock(&tree->lock);
+
+	goto end;
+
+	/*
+	 * If we get here due to ZSWAP_SWAPCACHE_EXIST, a load may be
+	 * happening concurrently, so it is safe and okay not to free the
+	 * entry. If the following put does free it, it is also okay to
+	 * return !0.
+	 */
+fail:
+	spin_lock(&tree->lock);
+	zswap_entry_put(tree, entry);
+	spin_unlock(&tree->lock);
+
+end:
+	return ret;
+}
+
+/*********************************
+* frontswap hooks
+**********************************/
+/* attempts to compress and store a single page */
+static int zswap_frontswap_store(unsigned type, pgoff_t offset,
+				struct page *page)
+{
+	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_entry *entry, *dupentry;
+	int ret;
+	unsigned int dlen = PAGE_SIZE, len;
+	unsigned long handle;
+	char *buf;
+	u8 *src, *dst;
+	struct zswap_header *zhdr;
+
+	if (!tree) {
+		ret = -ENODEV;
+		goto reject;
+	}
+
+	/* reclaim space if needed */
+	if (zswap_is_full()) {
+		zswap_pool_limit_hit++;
+		if (zbud_reclaim_page(zswap_pool, 8)) {
+			zswap_reject_reclaim_fail++;
+			ret = -ENOMEM;
+			goto reject;
+		}
+	}
+
+	/* allocate entry */
+	entry = zswap_entry_cache_alloc(GFP_KERNEL);
+	if (!entry) {
+		zswap_reject_kmemcache_fail++;
+		ret = -ENOMEM;
+		goto reject;
+	}
+
+	/* compress */
+	dst = get_cpu_var(zswap_dstmem);
+	src = kmap_atomic(page);
+	ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
+	kunmap_atomic(src);
+	if (ret) {
+		ret = -EINVAL;
+		goto freepage;
+	}
+
+	/* store */
+	len = dlen + sizeof(struct zswap_header);
+	ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
+			&handle);
+	if (ret == -ENOSPC) {
+		zswap_reject_compress_poor++;
+		goto freepage;
+	}
+	if (ret) {
+		zswap_reject_alloc_fail++;
+		goto freepage;
+	}
+	zhdr = zbud_map(zswap_pool, handle);
+	zhdr->swpentry = swp_entry(type, offset);
+	buf = (u8 *)(zhdr + 1);
+	memcpy(buf, dst, dlen);
+	zbud_unmap(zswap_pool, handle);
+	put_cpu_var(zswap_dstmem);
+
+	/* populate entry */
+	entry->offset = offset;
+	entry->handle = handle;
+	entry->length = dlen;
+
+	/* map */
+	spin_lock(&tree->lock);
+	do {
+		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
+		if (ret == -EEXIST) {
+			zswap_duplicate_entry++;
+			/* remove from rbtree */
+			zswap_rb_erase(&tree->rbroot, dupentry);
+			zswap_entry_put(tree, dupentry);
+		}
+	} while (ret == -EEXIST);
+	spin_unlock(&tree->lock);
+
+	/* update stats */
+	atomic_inc(&zswap_stored_pages);
+	zswap_pool_pages = zbud_get_pool_size(zswap_pool);
+
+	return 0;
+
+freepage:
+	put_cpu_var(zswap_dstmem);
+	zswap_entry_cache_free(entry);
+reject:
+	return ret;
+}
+
+/*
+ * Returns 0 if the page was successfully decompressed.
+ * Returns -1 if the entry was not found or on error.
+ */
+static int zswap_frontswap_load(unsigned type, pgoff_t offset,
+				struct page *page)
+{
+	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_entry *entry;
+	u8 *src, *dst;
+	unsigned int dlen;
+	int ret;
+
+	/* find */
+	spin_lock(&tree->lock);
+	entry = zswap_entry_find_get(&tree->rbroot, offset);
+	if (!entry) {
+		/* entry was written back */
+		spin_unlock(&tree->lock);
+		return -1;
+	}
+	spin_unlock(&tree->lock);
+
+	/* decompress */
+	dlen = PAGE_SIZE;
+	src = (u8 *)zbud_map(zswap_pool, entry->handle) +
+			sizeof(struct zswap_header);
+	dst = kmap_atomic(page);
+	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
+		dst, &dlen);
+	kunmap_atomic(dst);
+	zbud_unmap(zswap_pool, entry->handle);
+	BUG_ON(ret);
+
+	spin_lock(&tree->lock);
+	zswap_entry_put(tree, entry);
+	spin_unlock(&tree->lock);
+
+	return 0;
+}
+
+/* frees an entry in zswap */
+static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_entry *entry;
+
+	/* find */
+	spin_lock(&tree->lock);
+	entry = zswap_rb_search(&tree->rbroot, offset);
+	if (!entry) {
+		/* entry was written back */
+		spin_unlock(&tree->lock);
+		return;
+	}
+
+	/* remove from rbtree */
+	zswap_rb_erase(&tree->rbroot, entry);
+
+	/* drop the initial reference from entry creation */
+	zswap_entry_put(tree, entry);
+
+	spin_unlock(&tree->lock);
+}
+
+/* frees all zswap entries for the given swap type */
+static void zswap_frontswap_invalidate_area(unsigned type)
+{
+	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_entry *entry, *n;
+
+	if (!tree)
+		return;
+
+	/* walk the tree and free everything */
+	spin_lock(&tree->lock);
+	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
+		zswap_free_entry(entry);
+	tree->rbroot = RB_ROOT;
+	spin_unlock(&tree->lock);
+	kfree(tree);
+	zswap_trees[type] = NULL;
+}
+
+static struct zbud_ops zswap_zbud_ops = {
+	.evict = zswap_writeback_entry
+};
+
+static void zswap_frontswap_init(unsigned type)
+{
+	struct zswap_tree *tree;
+
+	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
+	if (!tree) {
+		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
+		return;
+	}
+
+	tree->rbroot = RB_ROOT;
+	spin_lock_init(&tree->lock);
+	zswap_trees[type] = tree;
+}
+
+static struct frontswap_ops zswap_frontswap_ops = {
+	.store = zswap_frontswap_store,
+	.load = zswap_frontswap_load,
+	.invalidate_page = zswap_frontswap_invalidate_page,
+	.invalidate_area = zswap_frontswap_invalidate_area,
+	.init = zswap_frontswap_init
+};
+
+/*********************************
+* debugfs functions
+**********************************/
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+
+static struct dentry *zswap_debugfs_root;
+
+static int __init zswap_debugfs_init(void)
+{
+	if (!debugfs_initialized())
+		return -ENODEV;
+
+	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
+	if (!zswap_debugfs_root)
+		return -ENOMEM;
+
+	debugfs_create_u64("pool_limit_hit", S_IRUGO,
+			zswap_debugfs_root, &zswap_pool_limit_hit);
+	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
+			zswap_debugfs_root, &zswap_reject_reclaim_fail);
+	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
+			zswap_debugfs_root, &zswap_reject_alloc_fail);
+	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
+			zswap_debugfs_root, &zswap_reject_kmemcache_fail);
+	debugfs_create_u64("reject_compress_poor", S_IRUGO,
+			zswap_debugfs_root, &zswap_reject_compress_poor);
+	debugfs_create_u64("written_back_pages", S_IRUGO,
+			zswap_debugfs_root, &zswap_written_back_pages);
+	debugfs_create_u64("duplicate_entry", S_IRUGO,
+			zswap_debugfs_root, &zswap_duplicate_entry);
+	debugfs_create_u64("pool_pages", S_IRUGO,
+			zswap_debugfs_root, &zswap_pool_pages);
+	debugfs_create_atomic_t("stored_pages", S_IRUGO,
+			zswap_debugfs_root, &zswap_stored_pages);
+
+	return 0;
+}
+
+static void __exit zswap_debugfs_exit(void)
+{
+	debugfs_remove_recursive(zswap_debugfs_root);
+}
+#else
+static int __init zswap_debugfs_init(void)
+{
+	return 0;
+}
+
+static void __exit zswap_debugfs_exit(void) { }
+#endif
+
+/*********************************
+* module init and exit
+**********************************/
+static int __init init_zswap(void)
+{
+	if (!zswap_enabled)
+		return 0;
+
+	pr_info("loading zswap\n");
+
+	zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
+	if (!zswap_pool) {
+		pr_err("zbud pool creation failed\n");
+		goto error;
+	}
+
+	if (zswap_entry_cache_create()) {
+		pr_err("entry cache creation failed\n");
+		goto cachefail;
+	}
+	if (zswap_comp_init()) {
+		pr_err("compressor initialization failed\n");
+		goto compfail;
+	}
+	if (zswap_cpu_init()) {
+		pr_err("per-cpu initialization failed\n");
+		goto pcpufail;
+	}
+
+	frontswap_register_ops(&zswap_frontswap_ops);
+	if (zswap_debugfs_init())
+		pr_warn("debugfs initialization failed\n");
+	return 0;
+pcpufail:
+	zswap_comp_exit();
+compfail:
+	zswap_entry_cache_destroy();
+cachefail:
+	zbud_destroy_pool(zswap_pool);
+error:
+	return -ENOMEM;
+}
+/* must be late so crypto has time to come up */
+late_initcall(init_zswap);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("Compressed cache for swap pages");
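
The zs_map_object()/zs_unmap_object() pair at the top of this hunk gives temporary access to a zsmalloc allocation that may straddle two physically discontiguous pages: a fully contained object is simply kmap'd, while a spanning object is stitched together in a per-cpu buffer by __zs_map_object(). A minimal userspace sketch of that stitch follows; PG_SIZE, map_object() and the two-array "pages" are illustrative stand-ins, not zsmalloc API.

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define PG_SIZE 4096	/* stand-in for PAGE_SIZE */

/* Return a contiguous view of an object that may span two "pages". */
static void *map_object(char *pages[2], size_t off, size_t size, char *buf)
{
	if (off + size <= PG_SIZE)
		return pages[0] + off;	/* contained in one page: no copy */

	/* spans two pages: copy both halves into one buffer */
	size_t first = PG_SIZE - off;
	memcpy(buf, pages[0] + off, first);
	memcpy(buf + first, pages[1], size - first);
	return buf;
}

int main(void)
{
	static char p0[PG_SIZE], p1[PG_SIZE], buf[PG_SIZE];
	char *pages[2] = { p0, p1 };
	size_t off = PG_SIZE - 3, size = 8;

	memcpy(p0 + off, "abc", 3);	/* tail of page 0 */
	memcpy(p1, "defgh", 5);		/* head of page 1 */
	char *obj = map_object(pages, off, size, buf);
	assert(memcmp(obj, "abcdefgh", 8) == 0);
	printf("object: %.8s\n", obj);
	return 0;
}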
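zswap_comp_op() wraps the kernel crypto API's comp transforms: a store is one compress of a PAGE_SIZE buffer and a load or writeback is one decompress back to PAGE_SIZE. A rough userspace analogue of that round trip, using zlib in place of the crypto API (compress2()/uncompress() are zlib calls, not kernel ones; build with -lz); it also shows the "compression too poor" rejection that maps to zswap_reject_compress_poor.

#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define PG 4096

int main(void)
{
	static unsigned char page[PG], comp[PG * 2], out[PG];
	uLongf clen = sizeof(comp), dlen = sizeof(out);

	memset(page, 'z', PG);	/* a highly compressible "page" */

	/* analogue of zswap_comp_op(ZSWAP_COMPOP_COMPRESS, ...) */
	if (compress2(comp, &clen, page, PG, Z_DEFAULT_COMPRESSION) != Z_OK)
		return 1;
	printf("compressed %d -> %lu bytes\n", PG, clen);

	/* an incompressible page would be rejected rather than stored */
	if (clen >= PG)
		printf("would reject: compression too poor\n");

	/* analogue of zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, ...) */
	if (uncompress(out, &dlen, comp, clen) != Z_OK)
		return 1;
	return !(dlen == PG && memcmp(out, page, PG) == 0);
}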
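zswap_is_full() is plain integer arithmetic: the pool may not exceed max_pool_percent of totalram_pages. For example, with 4 GiB of RAM (1,048,576 4 KiB pages) and the default 20%, the budget is 1,048,576 * 20 / 100 = 209,715 pool pages (about 800 MiB); the 209,716th page trips the limit and forces zbud reclaim before the store proceeds. A small check of that rounding behaviour:

#include <stdio.h>

int main(void)
{
	unsigned long totalram_pages = 1048576;	/* e.g. 4 GiB of 4 KiB pages */
	unsigned int max_pool_percent = 20;	/* zswap default */
	unsigned long pool_pages;

	for (pool_pages = 209714; pool_pages <= 209716; pool_pages++) {
		/* same comparison as zswap_is_full() */
		int full = totalram_pages * max_pool_percent / 100 < pool_pages;
		printf("pool_pages=%lu -> %s\n", pool_pages,
		       full ? "full (reclaim first)" : "ok");
	}
	return 0;
}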
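Each swap type gets its own tree keyed by swap offset, and zswap_rb_insert() reports an existing entry through dupentry instead of overwriting it; the store path then erases the duplicate and retries. A compact userspace model of that contract, where a plain unbalanced BST stands in for the kernel rbtree (the search/insert/duplicate semantics are the point, not the balancing):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	unsigned long offset;
	struct entry *left, *right;
};

static struct entry *search(struct entry *node, unsigned long offset)
{
	while (node) {
		if (node->offset > offset)
			node = node->left;
		else if (node->offset < offset)
			node = node->right;
		else
			return node;
	}
	return NULL;
}

/* Returns 0 on success; -EEXIST with *dup set if the offset is taken. */
static int insert(struct entry **root, struct entry *e, struct entry **dup)
{
	struct entry **link = root;

	while (*link) {
		if ((*link)->offset > e->offset)
			link = &(*link)->left;
		else if ((*link)->offset < e->offset)
			link = &(*link)->right;
		else {
			*dup = *link;
			return -EEXIST;
		}
	}
	*link = e;
	return 0;
}

int main(void)
{
	struct entry *root = NULL, *dup = NULL;
	struct entry a = { .offset = 42 }, b = { .offset = 42 };

	insert(&root, &a, &dup);
	if (insert(&root, &b, &dup) == -EEXIST)
		printf("duplicate store at offset %lu\n", dup->offset);
	printf("lookup 42 -> %p\n", (void *)search(root, 42));
	return 0;
}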
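The refcount protocol above: an entry is born with refcount 1 (the tree's reference from the store), load and writeback take a temporary reference under the tree lock via zswap_entry_find_get(), and whichever put drops the count to zero erases the node and frees the allocation. A small lifecycle model (userspace sketch; the names mirror but are not the kernel functions, and the tree lock is elided):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	int refcount;	/* protected by the tree lock in the kernel */
	int on_tree;
};

static void entry_get(struct entry *e) { e->refcount++; }

/* Mirrors zswap_entry_put(): the last put unlinks and frees. */
static void entry_put(struct entry *e)
{
	assert(e->refcount > 0);
	if (--e->refcount == 0) {
		e->on_tree = 0;		/* zswap_rb_erase() */
		printf("freed\n");	/* zswap_free_entry() */
		free(e);
	}
}

int main(void)
{
	struct entry *e = calloc(1, sizeof(*e));

	e->refcount = 1;	/* initial reference from the store */
	e->on_tree = 1;
	entry_get(e);		/* e.g. a concurrent load or writeback */
	entry_put(e);		/* load done: entry survives */
	entry_put(e);		/* invalidate: last reference, freed */
	return 0;
}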
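Writeback recovers which tree and which offset an evicted allocation belongs to from the swp_entry_t stashed in the zswap_header ahead of the compressed data: swp_type() selects the per-swap-device tree in zswap_trees[] and swp_offset() indexes into it. The sketch below models that packing with a made-up 6-bit type field; the real kernel encoding is architecture-specific and differs, so treat every constant here as hypothetical.

#include <assert.h>
#include <stdio.h>

typedef struct { unsigned long val; } swp_entry_model;

#define TYPE_SHIFT 58	/* hypothetical split: 6 bits of type, rest offset */

static swp_entry_model mk_entry(unsigned type, unsigned long offset)
{
	return (swp_entry_model){ ((unsigned long)type << TYPE_SHIFT) | offset };
}

static unsigned entry_type(swp_entry_model e)
{
	return e.val >> TYPE_SHIFT;	/* which swap device / tree */
}

static unsigned long entry_offset(swp_entry_model e)
{
	return e.val & ((1UL << TYPE_SHIFT) - 1);	/* index in that tree */
}

int main(void)
{
	swp_entry_model e = mk_entry(1, 12345);	/* swap device 1 */

	assert(entry_type(e) == 1 && entry_offset(e) == 12345);
	printf("type=%u offset=%lu\n", entry_type(e), entry_offset(e));
	return 0;
}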