summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2019-08-30 23:09:59 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2019-08-30 23:09:59 +1000
commit14f4bec14a4bd5857ca9587221e30d5109157981 (patch)
treebd620829eab841806c48c1aaa63d37d6d54c84bf
parentb75aecfbe531ca3e2af4a33e88bd27583e05852c (diff)
parent97637fe0837ce1bb998be55588f20ae31e38b23e (diff)
downloadlinux-14f4bec14a4bd5857ca9587221e30d5109157981.tar.gz
linux-14f4bec14a4bd5857ca9587221e30d5109157981.tar.xz
Merge branch 'akpm-current/current'
-rw-r--r--.mailmap3
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-slab13
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst20
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt2
-rw-r--r--Documentation/admin-guide/sysctl/kernel.rst20
-rw-r--r--Documentation/core-api/genalloc.rst2
-rw-r--r--Documentation/core-api/kernel-api.rst3
-rw-r--r--Documentation/core-api/memory-allocation.rst4
-rw-r--r--Documentation/filesystems/proc.txt4
-rw-r--r--Documentation/process/deprecated.rst10
-rw-r--r--arch/Kconfig11
-rw-r--r--arch/alpha/include/asm/pgalloc.h2
-rw-r--r--arch/alpha/include/asm/pgtable.h5
-rw-r--r--arch/alpha/include/uapi/asm/mman.h3
-rw-r--r--arch/arc/include/asm/pgalloc.h1
-rw-r--r--arch/arc/include/asm/pgtable.h5
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/include/asm/pgalloc.h2
-rw-r--r--arch/arm/include/asm/pgtable-nommu.h5
-rw-r--r--arch/arm/include/asm/pgtable.h2
-rw-r--r--arch/arm/include/asm/processor.h2
-rw-r--r--arch/arm/include/asm/xen/page-coherent.h3
-rw-r--r--arch/arm/kernel/process.c5
-rw-r--r--arch/arm/mm/dma-mapping.c2
-rw-r--r--arch/arm/mm/flush.c7
-rw-r--r--arch/arm/mm/mmap.c52
-rw-r--r--arch/arm64/Kconfig2
-rw-r--r--arch/arm64/include/asm/pgalloc.h2
-rw-r--r--arch/arm64/include/asm/pgtable.h2
-rw-r--r--arch/arm64/include/asm/processor.h2
-rw-r--r--arch/arm64/include/asm/xen/page-coherent.h3
-rw-r--r--arch/arm64/kernel/process.c8
-rw-r--r--arch/arm64/mm/flush.c3
-rw-r--r--arch/arm64/mm/mmap.c72
-rw-r--r--arch/arm64/mm/pgd.c2
-rw-r--r--arch/c6x/include/asm/pgtable.h5
-rw-r--r--arch/csky/include/asm/pgalloc.h2
-rw-r--r--arch/csky/include/asm/pgtable.h5
-rw-r--r--arch/h8300/include/asm/pgtable.h6
-rw-r--r--arch/hexagon/include/asm/pgalloc.h2
-rw-r--r--arch/hexagon/include/asm/pgtable.h3
-rw-r--r--arch/hexagon/mm/Makefile2
-rw-r--r--arch/hexagon/mm/pgalloc.c10
-rw-r--r--arch/ia64/Kconfig4
-rw-r--r--arch/ia64/include/asm/pgalloc.h52
-rw-r--r--arch/ia64/include/asm/pgtable.h5
-rw-r--r--arch/ia64/mm/init.c2
-rw-r--r--arch/m68k/include/asm/pgtable_mm.h7
-rw-r--r--arch/m68k/include/asm/pgtable_no.h7
-rw-r--r--arch/microblaze/include/asm/pgalloc.h122
-rw-r--r--arch/microblaze/include/asm/pgtable.h7
-rw-r--r--arch/microblaze/mm/pgtable.c4
-rw-r--r--arch/mips/Kconfig2
-rw-r--r--arch/mips/include/asm/pgalloc.h2
-rw-r--r--arch/mips/include/asm/pgtable.h5
-rw-r--r--arch/mips/include/asm/processor.h5
-rw-r--r--arch/mips/include/uapi/asm/mman.h3
-rw-r--r--arch/mips/mm/mmap.c84
-rw-r--r--arch/nds32/include/asm/pgalloc.h2
-rw-r--r--arch/nds32/include/asm/pgtable.h2
-rw-r--r--arch/nios2/include/asm/pgalloc.h2
-rw-r--r--arch/nios2/include/asm/pgtable.h2
-rw-r--r--arch/openrisc/include/asm/pgalloc.h2
-rw-r--r--arch/openrisc/include/asm/pgtable.h5
-rw-r--r--arch/parisc/include/asm/pgalloc.h2
-rw-r--r--arch/parisc/include/asm/pgtable.h2
-rw-r--r--arch/parisc/include/uapi/asm/mman.h3
-rw-r--r--arch/powerpc/include/asm/pgalloc.h2
-rw-r--r--arch/powerpc/include/asm/pgtable.h1
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c2
-rw-r--r--arch/powerpc/mm/book3s64/iommu_api.c7
-rw-r--r--arch/powerpc/mm/hugetlbpage.c2
-rw-r--r--arch/riscv/Kconfig12
-rw-r--r--arch/riscv/include/asm/pgalloc.h4
-rw-r--r--arch/riscv/include/asm/pgtable.h5
-rw-r--r--arch/s390/include/asm/pgtable.h6
-rw-r--r--arch/sh/include/asm/pgalloc.h44
-rw-r--r--arch/sh/include/asm/pgtable.h5
-rw-r--r--arch/sh/mm/Kconfig3
-rw-r--r--arch/sh/mm/nommu.c4
-rw-r--r--arch/sparc/include/asm/pgalloc_32.h2
-rw-r--r--arch/sparc/include/asm/pgalloc_64.h2
-rw-r--r--arch/sparc/include/asm/pgtable_32.h5
-rw-r--r--arch/sparc/include/asm/pgtable_64.h1
-rw-r--r--arch/sparc/mm/init_32.c1
-rw-r--r--arch/um/include/asm/pgalloc.h2
-rw-r--r--arch/um/include/asm/pgtable.h2
-rw-r--r--arch/unicore32/include/asm/pgalloc.h2
-rw-r--r--arch/unicore32/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/pgtable_32.h2
-rw-r--r--arch/x86/include/asm/pgtable_64.h3
-rw-r--r--arch/x86/mm/pat_rbtree.c19
-rw-r--r--arch/x86/mm/pgtable.c6
-rw-r--r--arch/xtensa/include/asm/pgtable.h1
-rw-r--r--arch/xtensa/include/asm/tlbflush.h3
-rw-r--r--arch/xtensa/include/uapi/asm/mman.h3
-rw-r--r--drivers/base/memory.c44
-rw-r--r--drivers/base/node.c55
-rw-r--r--drivers/block/drbd/drbd_interval.c29
-rw-r--r--drivers/crypto/chelsio/chtls/chtls_io.c5
-rw-r--r--drivers/gpu/drm/via/via_dmablit.c10
-rw-r--r--drivers/infiniband/core/umem.c5
-rw-r--r--drivers/infiniband/hw/hfi1/user_pages.c5
-rw-r--r--drivers/infiniband/hw/qib/qib_user_pages.c5
-rw-r--r--drivers/infiniband/hw/usnic/usnic_uiom.c5
-rw-r--r--drivers/infiniband/sw/siw/siw_mem.c10
-rw-r--r--drivers/misc/sram-exec.c2
-rw-r--r--drivers/staging/android/ion/ion_system_heap.c4
-rw-r--r--drivers/target/tcm_fc/tfc_io.c3
-rw-r--r--drivers/vfio/vfio_iommu_spapr_tce.c8
-rw-r--r--fs/aio.c9
-rw-r--r--fs/binfmt_elf.c20
-rw-r--r--fs/buffer.c56
-rw-r--r--fs/f2fs/data.c1
-rw-r--r--fs/fat/fat.h1
-rw-r--r--fs/fat/file.c8
-rw-r--r--fs/fat/inode.c22
-rw-r--r--fs/inode.c3
-rw-r--r--fs/io_uring.c2
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/jbd2/transaction.c12
-rw-r--r--fs/ocfs2/alloc.c20
-rw-r--r--fs/ocfs2/aops.c38
-rw-r--r--fs/ocfs2/blockcheck.c26
-rw-r--r--fs/ocfs2/cluster/heartbeat.c103
-rw-r--r--fs/ocfs2/dir.c3
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h1
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c55
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h16
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c7
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c23
-rw-r--r--fs/ocfs2/dlmglue.c20
-rw-r--r--fs/ocfs2/file.c13
-rw-r--r--fs/ocfs2/ioctl.c2
-rw-r--r--fs/ocfs2/journal.h42
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h3
-rw-r--r--fs/ocfs2/super.c10
-rw-r--r--fs/ocfs2/xattr.c56
-rw-r--r--fs/open.c8
-rw-r--r--fs/proc/meminfo.c8
-rw-r--r--fs/proc/page.c40
-rw-r--r--fs/proc/task_mmu.c6
-rw-r--r--fs/ramfs/inode.c12
-rw-r--r--fs/reiserfs/do_balan.c15
-rw-r--r--fs/reiserfs/fix_node.c6
-rw-r--r--fs/reiserfs/journal.c22
-rw-r--r--fs/reiserfs/lbalance.c3
-rw-r--r--fs/reiserfs/objectid.c3
-rw-r--r--fs/reiserfs/prints.c3
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--include/asm-generic/bug.h53
-rw-r--r--include/asm-generic/pgalloc.h5
-rw-r--r--include/asm-generic/pgtable.h7
-rw-r--r--include/linux/compaction.h22
-rw-r--r--include/linux/cpumask.h14
-rw-r--r--include/linux/fs.h32
-rw-r--r--include/linux/genalloc.h2
-rw-r--r--include/linux/huge_mm.h16
-rw-r--r--include/linux/hugetlb.h2
-rw-r--r--include/linux/interval_tree_generic.h22
-rw-r--r--include/linux/jbd2.h2
-rw-r--r--include/linux/kexec.h2
-rw-r--r--include/linux/kgdb.h2
-rw-r--r--include/linux/khugepaged.h12
-rw-r--r--include/linux/memcontrol.h49
-rw-r--r--include/linux/memory.h7
-rw-r--r--include/linux/memremap.h6
-rw-r--r--include/linux/mm.h33
-rw-r--r--include/linux/mm_types.h4
-rw-r--r--include/linux/mm_types_task.h4
-rw-r--r--include/linux/mmzone.h19
-rw-r--r--include/linux/page_ext.h1
-rw-r--r--include/linux/pagemap.h10
-rw-r--r--include/linux/printk.h22
-rw-r--r--include/linux/quicklist.h94
-rw-r--r--include/linux/rbtree_augmented.h88
-rw-r--r--include/linux/sched.h9
-rw-r--r--include/linux/sched/sysctl.h1
-rw-r--r--include/linux/shrinker.h7
-rw-r--r--include/linux/slab.h66
-rw-r--r--include/linux/string.h50
-rw-r--r--include/linux/swap.h2
-rw-r--r--include/linux/vmalloc.h20
-rw-r--r--include/linux/wait.h4
-rw-r--r--include/linux/zpool.h3
-rw-r--r--include/trace/events/vmscan.h71
-rw-r--r--include/trace/events/writeback.h38
-rw-r--r--include/uapi/asm-generic/mman-common.h3
-rw-r--r--include/uapi/linux/coff.h5
-rw-r--r--init/main.c6
-rw-r--r--ipc/mqueue.c22
-rw-r--r--ipc/msg.c18
-rw-r--r--kernel/debug/debug_core.c31
-rw-r--r--kernel/dma/remap.c2
-rw-r--r--kernel/elfcore.c1
-rw-r--r--kernel/events/uprobes.c81
-rw-r--r--kernel/fork.c16
-rw-r--r--kernel/hung_task.c94
-rw-r--r--kernel/kexec_core.c2
-rw-r--r--kernel/panic.c42
-rw-r--r--kernel/resource.c4
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/psi.c2
-rw-r--r--kernel/sysctl.c14
-rw-r--r--lib/Kconfig.debug21
-rw-r--r--lib/Kconfig.kasan8
-rw-r--r--lib/bug.c11
-rw-r--r--lib/extable.c1
-rw-r--r--lib/genalloc.c5
-rw-r--r--lib/generic-radix-tree.c4
-rw-r--r--lib/hexdump.c21
-rw-r--r--lib/iov_iter.c2
-rw-r--r--lib/math/rational.c63
-rw-r--r--lib/rbtree_test.c37
-rw-r--r--lib/show_mem.c5
-rw-r--r--lib/string.c12
-rw-r--r--lib/test_kasan.c41
-rw-r--r--mm/Kconfig16
-rw-r--r--mm/Kconfig.debug4
-rw-r--r--mm/Makefile4
-rw-r--r--mm/compaction.c50
-rw-r--r--mm/filemap.c168
-rw-r--r--mm/gup.c125
-rw-r--r--mm/huge_memory.c136
-rw-r--r--mm/hugetlb.c89
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/init-mm.c2
-rw-r--r--mm/internal.h2
-rw-r--r--mm/kasan/common.c32
-rw-r--r--mm/kasan/kasan.h14
-rw-r--r--mm/kasan/report.c44
-rw-r--r--mm/kasan/tags_report.c24
-rw-r--r--mm/khugepaged.c366
-rw-r--r--mm/kmemleak.c326
-rw-r--r--mm/ksm.c18
-rw-r--r--mm/madvise.c326
-rw-r--r--mm/memcontrol.c228
-rw-r--r--mm/memfd.c2
-rw-r--r--mm/memory.c21
-rw-r--r--mm/memory_hotplug.c101
-rw-r--r--mm/mempolicy.c4
-rw-r--r--mm/memremap.c20
-rw-r--r--mm/migrate.c13
-rw-r--r--mm/mmap.c84
-rw-r--r--mm/mmu_gather.c2
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c87
-rw-r--r--mm/page_alloc.c27
-rw-r--r--mm/page_owner.c123
-rw-r--r--mm/page_poison.c2
-rw-r--r--mm/page_vma_mapped.c3
-rw-r--r--mm/quicklist.c103
-rw-r--r--mm/rmap.c29
-rw-r--r--mm/shmem.c12
-rw-r--r--mm/slab.h64
-rw-r--r--mm/slab_common.c56
-rw-r--r--mm/slob.c64
-rw-r--r--mm/slub.c34
-rw-r--r--mm/sparse.c18
-rw-r--r--mm/swap.c58
-rw-r--r--mm/swap_state.c6
-rw-r--r--mm/swapfile.c4
-rw-r--r--mm/util.c122
-rw-r--r--mm/vmalloc.c81
-rw-r--r--mm/vmscan.c316
-rw-r--r--mm/vmstat.c2
-rw-r--r--mm/z3fold.c1
-rw-r--r--mm/zpool.c16
-rw-r--r--mm/zsmalloc.c21
-rw-r--r--mm/zswap.c8
-rw-r--r--net/xdp/xdp_umem.c9
-rw-r--r--net/xdp/xsk.c2
-rwxr-xr-xscripts/checkpatch.pl75
-rw-r--r--scripts/gdb/linux/symbols.py4
-rw-r--r--tools/include/linux/rbtree.h71
-rw-r--r--tools/include/linux/rbtree_augmented.h119
-rw-r--r--tools/lib/rbtree.c37
-rw-r--r--usr/Makefile3
279 files changed, 3883 insertions, 2997 deletions
diff --git a/.mailmap b/.mailmap
index 54fb6af3fa38..7c2e5ea26976 100644
--- a/.mailmap
+++ b/.mailmap
@@ -66,6 +66,9 @@ Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@imgtec.com>
Dengcheng Zhu <dzhu@wavecomp.com> <dczhu@mips.com>
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@gmail.com>
Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+Dmitry Safonov <0x7f454c46@gmail.com> <dsafonov@virtuozzo.com>
+Dmitry Safonov <0x7f454c46@gmail.com> <d.safonov@partner.samsung.com>
+Dmitry Safonov <0x7f454c46@gmail.com> <dima@arista.com>
Domen Puncer <domen@coderock.org>
Douglas Gilbert <dougg@torque.net>
Ed L. Cashin <ecashin@coraid.com>
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 29601d93a1c2..ed35833ad7f0 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -429,10 +429,15 @@ KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
Christoph Lameter <cl@linux-foundation.org>
Description:
- The shrink file is written when memory should be reclaimed from
- a cache. Empty partial slabs are freed and the partial list is
- sorted so the slabs with the fewest available objects are used
- first.
+ The shrink file is used to reclaim unused slab cache
+ memory from a cache. Empty per-cpu or partial slabs
+ are freed and the partial list is sorted so the slabs
+ with the fewest available objects are used first.
+ It only accepts a value of "1" on write for shrinking
+ the cache. Other input values are considered invalid.
+ Shrinking slab caches might be expensive and can
+ adversely impact other running applications. So it
+ should be used with care.
What: /sys/kernel/slab/cache/slab_size
Date: May 2007
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 3deacdc5e6d2..aa1789f269bd 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -615,8 +615,8 @@ on an IO device and is an example of this type.
Protections
-----------
-A cgroup is protected to be allocated upto the configured amount of
-the resource if the usages of all its ancestors are under their
+A cgroup is protected upto the configured amount of the resource
+as long as the usages of all its ancestors are under their
protected levels. Protections can be hard guarantees or best effort
soft boundaries. Protections can also be over-committed in which case
only upto the amount available to the parent is protected among
@@ -1062,7 +1062,10 @@ PAGE_SIZE multiple when read back.
is within its effective min boundary, the cgroup's memory
won't be reclaimed under any conditions. If there is no
unprotected reclaimable memory available, OOM killer
- is invoked.
+ is invoked. Above the effective min boundary (or
+ effective low boundary if it is higher), pages are reclaimed
+ proportionally to the overage, reducing reclaim pressure for
+ smaller overages.
Effective min boundary is limited by memory.min values of
all ancestor cgroups. If there is memory.min overcommitment
@@ -1084,7 +1087,10 @@ PAGE_SIZE multiple when read back.
Best-effort memory protection. If the memory usage of a
cgroup is within its effective low boundary, the cgroup's
memory won't be reclaimed unless memory can be reclaimed
- from unprotected cgroups.
+ from unprotected cgroups. Above the effective low boundary (or
+ effective min boundary if it is higher), pages are reclaimed
+ proportionally to the overage, reducing reclaim pressure for
+ smaller overages.
Effective low boundary is limited by memory.low values of
all ancestor cgroups. If there is memory.low overcommitment
@@ -2448,8 +2454,10 @@ system performance due to overreclaim, to the point where the feature
becomes self-defeating.
The memory.low boundary on the other hand is a top-down allocated
-reserve. A cgroup enjoys reclaim protection when it's within its low,
-which makes delegation of subtrees possible.
+reserve. A cgroup enjoys reclaim protection when it's within its
+effective low, which makes delegation of subtrees possible. It also
+enjoys having reclaim pressure proportional to its overage when
+above its effective low.
The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a878a6525321..cad6070db4f4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -809,6 +809,8 @@
enables the feature at boot time. By default, it is
disabled and the system will work mostly the same as a
kernel built without CONFIG_DEBUG_PAGEALLOC.
+ Note: to get most of debug_pagealloc error reports, it's
+ useful to also enable the page_owner functionality.
on: enable the feature
debugpat [X86] Enable PAT debugging
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 032c7cd3cede..2e36620ec1e4 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -45,6 +45,7 @@ show up in /proc/sys/kernel:
- hung_task_timeout_secs
- hung_task_check_interval_secs
- hung_task_warnings
+- hung_task_interval_warnings
- hyperv_record_panic_msg
- kexec_load_disabled
- kptr_restrict
@@ -383,14 +384,29 @@ Possible values to set are in range {0..LONG_MAX/HZ}.
hung_task_warnings:
===================
-The maximum number of warnings to report. During a check interval
-if a hung task is detected, this value is decreased by 1.
+The maximum number of warnings to report. If after timeout a hung
+task is present, this value is decreased by 1 every check interval,
+producing a warning.
When this value reaches 0, no more warnings will be reported.
This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-1: report an infinite number of warnings.
+hung_task_interval_warnings:
+===================
+
+The same as hung_task_warnings, but set the number of interval
+warnings to be issued about detected hung tasks during check
+interval. That will produce warnings *before* the timeout happens.
+If a hung task is detected during check interval, this value is
+decreased by 1. When this value reaches 0, only timeout warnings
+will be reported.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+-1: report an infinite number of check interval warnings.
+
+
hyperv_record_panic_msg:
========================
diff --git a/Documentation/core-api/genalloc.rst b/Documentation/core-api/genalloc.rst
index 6b38a39fab24..a534cc7ebd05 100644
--- a/Documentation/core-api/genalloc.rst
+++ b/Documentation/core-api/genalloc.rst
@@ -129,7 +129,7 @@ writing of special-purpose memory allocators in the future.
:functions: gen_pool_for_each_chunk
.. kernel-doc:: lib/genalloc.c
- :functions: addr_in_gen_pool
+ :functions: gen_pool_has_addr
.. kernel-doc:: lib/genalloc.c
:functions: gen_pool_avail
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 08af5caf036d..f77de49b1d51 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -42,6 +42,9 @@ String Manipulation
.. kernel-doc:: lib/string.c
:export:
+.. kernel-doc:: include/linux/string.h
+ :internal:
+
.. kernel-doc:: mm/util.c
:functions: kstrdup kstrdup_const kstrndup kmemdup kmemdup_nul memdup_user
vmemdup_user strndup_user memdup_user_nul
diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst
index 7744aa3bf2e0..27c54854b508 100644
--- a/Documentation/core-api/memory-allocation.rst
+++ b/Documentation/core-api/memory-allocation.rst
@@ -98,6 +98,10 @@ limited. The actual limit depends on the hardware and the kernel
configuration, but it is a good practice to use `kmalloc` for objects
smaller than page size.
+The address of a chunk allocated with `kmalloc` is aligned to at least
+ARCH_KMALLOC_MINALIGN bytes. For sizes of power of two bytes, the
+alignment is also guaranteed to be at least to the respective size.
+
For large allocations you can use :c:func:`vmalloc` and
:c:func:`vzalloc`, or directly request pages from the page
allocator. The memory allocated by `vmalloc` and related functions is
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 99ca040e3f90..93fc183641e6 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -968,8 +968,8 @@ ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated
with huge pages
ShmemPmdMapped: Shared memory mapped into userspace with huge pages
KReclaimable: Kernel allocations that the kernel will attempt to reclaim
- under memory pressure. Includes SReclaimable (below), and other
- direct allocations with a shrinker.
+ under memory pressure. Includes SReclaimable (below), deferred
+ split THPs, and other direct allocations with a shrinker.
Slab: in-kernel data structures cache
SReclaimable: Part of Slab, that might be reclaimed, such as caches
SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
diff --git a/Documentation/process/deprecated.rst b/Documentation/process/deprecated.rst
index 053b24a6dd38..56280e108d5a 100644
--- a/Documentation/process/deprecated.rst
+++ b/Documentation/process/deprecated.rst
@@ -84,7 +84,7 @@ buffer. This could result in linear overflows beyond the
end of the buffer, leading to all kinds of misbehaviors. While
`CONFIG_FORTIFY_SOURCE=y` and various compiler flags help reduce the
risk of using this function, there is no good reason to add new uses of
-this function. The safe replacement is :c:func:`strscpy`.
+this function. The safe replacement is stracpy() or strscpy().
strncpy() on NUL-terminated strings
-----------------------------------
@@ -93,9 +93,9 @@ will be NUL terminated. This can lead to various linear read overflows
and other misbehavior due to the missing termination. It also NUL-pads the
destination buffer if the source contents are shorter than the destination
buffer size, which may be a needless performance penalty for callers using
-only NUL-terminated strings. The safe replacement is :c:func:`strscpy`.
-(Users of :c:func:`strscpy` still needing NUL-padding will need an
-explicit :c:func:`memset` added.)
+only NUL-terminated strings. In this case, the safe replacement is
+stracpy() or strscpy(). If, however, the destination buffer still needs
+NUL-padding, the safe replacement is stracpy_pad().
If a caller is using non-NUL-terminated strings, :c:func:`strncpy()` can
still be used, but destinations should be marked with the `__nonstring
@@ -107,7 +107,7 @@ strlcpy()
:c:func:`strlcpy` reads the entire source buffer first, possibly exceeding
the given limit of bytes to copy. This is inefficient and can lead to
linear read overflows if a source string is not NUL-terminated. The
-safe replacement is :c:func:`strscpy`.
+safe replacement is stracpy() or strscpy().
Variable Length Arrays (VLAs)
-----------------------------
diff --git a/arch/Kconfig b/arch/Kconfig
index 95998f872191..6728c5fa057e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -703,6 +703,17 @@ config HAVE_ARCH_COMPAT_MMAP_BASES
and vice-versa 32-bit applications to call 64-bit mmap().
Required for applications doing different bitness syscalls.
+# This allows to use a set of generic functions to determine mmap base
+# address by giving priority to top-down scheme only if the process
+# is not in legacy mode (compat task, unlimited stack size or
+# sysctl_legacy_va_layout).
+# Architecture that selects this option can provide its own version of:
+# - STACK_RND_MASK
+config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+ bool
+ depends on MMU
+ select ARCH_HAS_ELF_RANDOMIZE
+
config HAVE_COPY_THREAD_TLS
bool
help
diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h
index 71ded3b7d82d..eb91f1e85629 100644
--- a/arch/alpha/include/asm/pgalloc.h
+++ b/arch/alpha/include/asm/pgalloc.h
@@ -53,6 +53,4 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd)
free_page((unsigned long)pmd);
}
-#define check_pgt_cache() do { } while (0)
-
#endif /* _ALPHA_PGALLOC_H */
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 89c2032f9960..065b57f408c3 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -359,11 +359,6 @@ extern void paging_init(void);
#include <asm-generic/pgtable.h>
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
/* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */
#define HAVE_ARCH_UNMAPPED_AREA
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index ac23379b7a87..a18ec7f63888 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -68,6 +68,9 @@
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
+#define MADV_COLD 20 /* deactivate these pages */
+#define MADV_PAGEOUT 21 /* reclaim these pages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h
index 9bdb8ed5b0db..4751f2251cd9 100644
--- a/arch/arc/include/asm/pgalloc.h
+++ b/arch/arc/include/asm/pgalloc.h
@@ -129,7 +129,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptep)
#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte)
-#define check_pgt_cache() do { } while (0)
#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd))
#endif /* _ASM_ARC_PGALLOC_H */
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 1d87c18a2976..7addd0301c51 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -395,11 +395,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
/* to cope with aliasing VIPT cache */
#define HAVE_ARCH_UNMAPPED_AREA
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a36c530bab92..a98c7af50bf0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -34,6 +34,7 @@ config ARM
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF
+ select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_IPC_PARSE_VERSION
select BINFMT_FLAT_ARGVP_ENVP_ON_STACK
select BUILDTIME_EXTABLE_SORT if MMU
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index a2a68b751971..069da393110c 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -15,8 +15,6 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-#define check_pgt_cache() do { } while (0)
-
#ifdef CONFIG_MMU
#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER))
diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h
index d0de24f06724..010fa1a35a68 100644
--- a/arch/arm/include/asm/pgtable-nommu.h
+++ b/arch/arm/include/asm/pgtable-nommu.h
@@ -71,11 +71,6 @@ typedef pte_t *pte_addr_t;
extern unsigned int kobjsize(const void *objp);
/*
- * No page table caches to initialise.
- */
-#define pgtable_cache_init() do { } while (0)
-
-/*
* All 32bit addresses are effectively valid for vmalloc...
* Sort of meaningless for non-VM targets.
*/
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index f2e990dc27e7..3ae120cd1715 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -368,8 +368,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#define pgtable_cache_init() do { } while (0)
-
#endif /* !__ASSEMBLY__ */
#endif /* CONFIG_MMU */
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
index 20c2f42454b8..614bf829e454 100644
--- a/arch/arm/include/asm/processor.h
+++ b/arch/arm/include/asm/processor.h
@@ -140,8 +140,6 @@ static inline void prefetchw(const void *ptr)
#endif
#endif
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
#endif
#endif /* __ASM_ARM_PROCESSOR_H */
diff --git a/arch/arm/include/asm/xen/page-coherent.h b/arch/arm/include/asm/xen/page-coherent.h
index 2c403e7c782d..ea39cb724ffa 100644
--- a/arch/arm/include/asm/xen/page-coherent.h
+++ b/arch/arm/include/asm/xen/page-coherent.h
@@ -31,8 +31,7 @@ static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
{
unsigned long page_pfn = page_to_xen_pfn(page);
unsigned long dev_pfn = XEN_PFN_DOWN(dev_addr);
- unsigned long compound_pages =
- (1<<compound_order(page)) * XEN_PFN_PER_PAGE;
+ unsigned long compound_pages = compound_nr(page) * XEN_PFN_PER_PAGE;
bool local = (page_pfn <= dev_pfn) &&
(dev_pfn - page_pfn < compound_pages);
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index f934a6739fc0..9485acc520a4 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -319,11 +319,6 @@ unsigned long get_wchan(struct task_struct *p)
return 0;
}
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- return randomize_page(mm->brk, 0x02000000);
-}
-
#ifdef CONFIG_MMU
#ifdef CONFIG_KUSER_HELPERS
/*
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index d27b12f61737..58e47c236c9e 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -544,7 +544,7 @@ static void *__alloc_from_pool(size_t size, struct page **ret_page)
static bool __in_atomic_pool(void *start, size_t size)
{
- return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+ return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
}
static int __free_from_pool(void *start, size_t size)
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 6ecbda87ee46..6d89db7895d1 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -204,18 +204,17 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page)
* coherent with the kernels mapping.
*/
if (!PageHighMem(page)) {
- size_t page_size = PAGE_SIZE << compound_order(page);
- __cpuc_flush_dcache_area(page_address(page), page_size);
+ __cpuc_flush_dcache_area(page_address(page), page_size(page));
} else {
unsigned long i;
if (cache_is_vipt_nonaliasing()) {
- for (i = 0; i < (1 << compound_order(page)); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
void *addr = kmap_atomic(page + i);
__cpuc_flush_dcache_area(addr, PAGE_SIZE);
kunmap_atomic(addr);
}
} else {
- for (i = 0; i < (1 << compound_order(page)); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
void *addr = kmap_high_get(page + i);
if (addr) {
__cpuc_flush_dcache_area(addr, PAGE_SIZE);
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index f866870db749..b8d912ac9e61 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -17,33 +17,6 @@
((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \
(((pgoff)<<PAGE_SHIFT) & (SHMLBA-1)))
-/* gap between mmap and stack */
-#define MIN_GAP (128*1024*1024UL)
-#define MAX_GAP ((TASK_SIZE)/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
- if (current->personality & ADDR_COMPAT_LAYOUT)
- return 1;
-
- if (rlim_stack->rlim_cur == RLIM_INFINITY)
- return 1;
-
- return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
- unsigned long gap = rlim_stack->rlim_cur;
-
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
-
- return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
/*
* We need to ensure that shared mappings are correctly aligned to
* avoid aliasing issues with VIPT caches. We need to ensure that
@@ -171,31 +144,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}
-unsigned long arch_mmap_rnd(void)
-{
- unsigned long rnd;
-
- rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
- return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
- unsigned long random_factor = 0UL;
-
- if (current->flags & PF_RANDOMIZE)
- random_factor = arch_mmap_rnd();
-
- if (mmap_is_legacy(rlim_stack)) {
- mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- mm->get_unmapped_area = arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- }
-}
-
/*
* You really shouldn't be using read() or write() on /dev/mem. This
* might go away in the future.
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 810e11389238..6b6362b83004 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -15,7 +15,6 @@ config ARM64
select ARCH_HAS_DMA_COHERENT_TO_PFN
select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
- select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
@@ -71,6 +70,7 @@ config ARM64
select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
+ select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
select ARCH_HAS_UBSAN_SANITIZE_ALL
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 14d0bc44d451..172d76fa0245 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -15,8 +15,6 @@
#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
-#define check_pgt_cache() do { } while (0)
-
#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
#if CONFIG_PGTABLE_LEVELS > 2
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 57427d17580e..7576df00eb50 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -861,8 +861,6 @@ extern int kern_addr_valid(unsigned long addr);
#include <asm-generic/pgtable.h>
-static inline void pgtable_cache_init(void) { }
-
/*
* On AArch64, the cache coherency is handled via the set_pte_at() function.
*/
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index c67848c55009..5623685c7d13 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -280,8 +280,6 @@ static inline void spin_lock_prefetch(const void *ptr)
"nop") : : "p" (ptr));
}
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */
extern void __init minsigstksz_setup(void);
diff --git a/arch/arm64/include/asm/xen/page-coherent.h b/arch/arm64/include/asm/xen/page-coherent.h
index d88e56b90b93..b600a8ef3349 100644
--- a/arch/arm64/include/asm/xen/page-coherent.h
+++ b/arch/arm64/include/asm/xen/page-coherent.h
@@ -45,8 +45,7 @@ static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
{
unsigned long page_pfn = page_to_xen_pfn(page);
unsigned long dev_pfn = XEN_PFN_DOWN(dev_addr);
- unsigned long compound_pages =
- (1<<compound_order(page)) * XEN_PFN_PER_PAGE;
+ unsigned long compound_pages = compound_nr(page) * XEN_PFN_PER_PAGE;
bool local = (page_pfn <= dev_pfn) &&
(dev_pfn - page_pfn < compound_pages);
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 03689c0beb34..a47462def04b 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -557,14 +557,6 @@ unsigned long arch_align_stack(unsigned long sp)
return sp & ~0xf;
}
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- if (is_compat_task())
- return randomize_page(mm->brk, SZ_32M);
- else
- return randomize_page(mm->brk, SZ_1G);
-}
-
/*
* Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY.
*/
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index dc19300309d2..ac485163a4a7 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -56,8 +56,7 @@ void __sync_icache_dcache(pte_t pte)
struct page *page = pte_page(pte);
if (!test_and_set_bit(PG_dcache_clean, &page->flags))
- sync_icache_aliases(page_address(page),
- PAGE_SIZE << compound_order(page));
+ sync_icache_aliases(page_address(page), page_size(page));
}
EXPORT_SYMBOL_GPL(__sync_icache_dcache);
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index b050641b5139..3028bacbc4e9 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -21,78 +21,6 @@
#include <asm/cputype.h>
/*
- * Leave enough space between the mmap area and the stack to honour ulimit in
- * the face of randomisation.
- */
-#define MIN_GAP (SZ_128M)
-#define MAX_GAP (STACK_TOP/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
- if (current->personality & ADDR_COMPAT_LAYOUT)
- return 1;
-
- if (rlim_stack->rlim_cur == RLIM_INFINITY)
- return 1;
-
- return sysctl_legacy_va_layout;
-}
-
-unsigned long arch_mmap_rnd(void)
-{
- unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
- if (test_thread_flag(TIF_32BIT))
- rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
- else
-#endif
- rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
- return rnd << PAGE_SHIFT;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
- unsigned long gap = rlim_stack->rlim_cur;
- unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap;
-
- /* Values close to RLIM_INFINITY can overflow. */
- if (gap + pad > gap)
- gap += pad;
-
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
-
- return PAGE_ALIGN(STACK_TOP - gap - rnd);
-}
-
-/*
- * This function, called very early during the creation of a new process VM
- * image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
- unsigned long random_factor = 0UL;
-
- if (current->flags & PF_RANDOMIZE)
- random_factor = arch_mmap_rnd();
-
- /*
- * Fall back to the standard layout if the personality bit is set, or
- * if the expected stack growth is unlimited:
- */
- if (mmap_is_legacy(rlim_stack)) {
- mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- mm->get_unmapped_area = arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- }
-}
-
-/*
* You really shouldn't be using read() or write() on /dev/mem. This might go
* away in the future.
*/
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 7548f9ca1f11..4a64089e5771 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -35,7 +35,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
-void __init pgd_cache_init(void)
+void __init pgtable_cache_init(void)
{
if (PGD_SIZE == PAGE_SIZE)
return;
diff --git a/arch/c6x/include/asm/pgtable.h b/arch/c6x/include/asm/pgtable.h
index 0bd805964ea6..0b6919c00413 100644
--- a/arch/c6x/include/asm/pgtable.h
+++ b/arch/c6x/include/asm/pgtable.h
@@ -60,11 +60,6 @@ extern unsigned long empty_zero_page;
#define swapper_pg_dir ((pgd_t *) 0)
/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
-/*
* c6x is !MMU, so define the simpliest implementation
*/
#define pgprot_writecombine pgprot_noncached
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index 98c5716708d6..d089113fe41f 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -75,8 +75,6 @@ do { \
tlb_remove_page(tlb, pte); \
} while (0)
-#define check_pgt_cache() do {} while (0)
-
extern void pagetable_init(void);
extern void pre_mmu_init(void);
extern void pre_trap_init(void);
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index fc19ba446d62..7c21985c60dc 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -306,11 +306,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
#define kern_addr_valid(addr) (1)
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do {} while (0)
-
#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
remap_pfn_range(vma, vaddr, pfn, size, prot)
diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h
index a99caa49d265..4d00152fab58 100644
--- a/arch/h8300/include/asm/pgtable.h
+++ b/arch/h8300/include/asm/pgtable.h
@@ -4,7 +4,6 @@
#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopud.h>
#include <asm-generic/pgtable.h>
-#define pgtable_cache_init() do { } while (0)
extern void paging_init(void);
#define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */
#define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */
@@ -35,11 +34,6 @@ extern unsigned int kobjsize(const void *objp);
extern int is_in_rom(unsigned long);
/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
-/*
* All 32bit addresses are effectively valid for vmalloc...
* Sort of meaningless for non-VM targets.
*/
diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h
index d6544dc71258..5a6e79e7926d 100644
--- a/arch/hexagon/include/asm/pgalloc.h
+++ b/arch/hexagon/include/asm/pgalloc.h
@@ -13,8 +13,6 @@
#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */
-#define check_pgt_cache() do {} while (0)
-
extern unsigned long long kmap_generation;
/*
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index a3ff6d24c09e..2fec20ad939e 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -431,9 +431,6 @@ static inline int pte_exec(pte_t pte)
#define __pte_offset(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-/* I think this is in case we have page table caches; needed by init/main.c */
-#define pgtable_cache_init() do { } while (0)
-
/*
* Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is
* interpreted as swap information. The remaining free bits are interpreted as
diff --git a/arch/hexagon/mm/Makefile b/arch/hexagon/mm/Makefile
index 1894263ae5bc..893838499591 100644
--- a/arch/hexagon/mm/Makefile
+++ b/arch/hexagon/mm/Makefile
@@ -3,5 +3,5 @@
# Makefile for Hexagon memory management subsystem
#
-obj-y := init.o pgalloc.o ioremap.o uaccess.o vm_fault.o cache.o
+obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o
obj-y += copy_to_user.o copy_from_user.o strnlen_user.o vm_tlb.o
diff --git a/arch/hexagon/mm/pgalloc.c b/arch/hexagon/mm/pgalloc.c
deleted file mode 100644
index 4d4316140237..000000000000
--- a/arch/hexagon/mm/pgalloc.c
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
- */
-
-#include <linux/init.h>
-
-void __init pgtable_cache_init(void)
-{
-}
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 5dbd7fd7578f..46ae956fdbcc 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -72,10 +72,6 @@ config 64BIT
config ZONE_DMA32
def_bool y
-config QUICKLIST
- bool
- default y
-
config MMU
bool
default y
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index c9e481023c25..f4c491044882 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -19,18 +19,19 @@
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/threads.h>
-#include <linux/quicklist.h>
+
+#include <asm-generic/pgalloc.h>
#include <asm/mmu_context.h>
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
- return quicklist_alloc(0, GFP_KERNEL, NULL);
+ return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
}
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- quicklist_free(0, NULL, pgd);
+ free_page((unsigned long)pgd);
}
#if CONFIG_PGTABLE_LEVELS == 4
@@ -42,12 +43,12 @@ pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud)
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
- return quicklist_alloc(0, GFP_KERNEL, NULL);
+ return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
}
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
- quicklist_free(0, NULL, pud);
+ free_page((unsigned long)pud);
}
#define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud)
#endif /* CONFIG_PGTABLE_LEVELS == 4 */
@@ -60,12 +61,12 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
- return quicklist_alloc(0, GFP_KERNEL, NULL);
+ return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
}
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
- quicklist_free(0, NULL, pmd);
+ free_page((unsigned long)pmd);
}
#define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd)
@@ -83,43 +84,6 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte)
pmd_val(*pmd_entry) = __pa(pte);
}
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
- struct page *page;
- void *pg;
-
- pg = quicklist_alloc(0, GFP_KERNEL, NULL);
- if (!pg)
- return NULL;
- page = virt_to_page(pg);
- if (!pgtable_page_ctor(page)) {
- quicklist_free(0, NULL, pg);
- return NULL;
- }
- return page;
-}
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
- return quicklist_alloc(0, GFP_KERNEL, NULL);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
- pgtable_page_dtor(pte);
- quicklist_free_page(0, NULL, pte);
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- quicklist_free(0, NULL, pte);
-}
-
-static inline void check_pgt_cache(void)
-{
- quicklist_trim(0, NULL, 25, 16);
-}
-
#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte)
#endif /* _ASM_IA64_PGALLOC_H */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index b1e7468eb65a..d602e7c622db 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -566,11 +566,6 @@ extern struct page *zero_page_memmap_ptr;
#define KERNEL_TR_PAGE_SHIFT _PAGE_SIZE_64M
#define KERNEL_TR_PAGE_SIZE (1 << KERNEL_TR_PAGE_SHIFT)
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
/* These tell get_user_pages() that the first gate page is accessible from user-level. */
#define FIXADDR_USER_START GATE_ADDR
#ifdef HAVE_BUGGY_SEGREL
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 678b98a09c85..bf9df2625bc8 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -64,7 +64,7 @@ __ia64_sync_icache_dcache (pte_t pte)
if (test_bit(PG_arch_1, &page->flags))
return; /* i-cache is already coherent with d-cache */
- flush_icache_range(addr, addr + (PAGE_SIZE << compound_order(page)));
+ flush_icache_range(addr, addr + page_size(page));
set_bit(PG_arch_1, &page->flags); /* mark page as clean */
}
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index fde4534b974f..646c174fff99 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -176,11 +176,4 @@ pgprot_t pgprot_dmacoherent(pgprot_t prot);
#include <asm-generic/pgtable.h>
#endif /* !__ASSEMBLY__ */
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
-#define check_pgt_cache() do { } while (0)
-
#endif /* _M68K_PGTABLE_H */
diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h
index fc3a96c77bd8..c18165b0d904 100644
--- a/arch/m68k/include/asm/pgtable_no.h
+++ b/arch/m68k/include/asm/pgtable_no.h
@@ -45,11 +45,6 @@ extern void paging_init(void);
#define ZERO_PAGE(vaddr) (virt_to_page(0))
/*
- * No page table caches to initialise.
- */
-#define pgtable_cache_init() do { } while (0)
-
-/*
* All 32bit addresses are effectively valid for vmalloc...
* Sort of meaningless for non-VM targets.
*/
@@ -60,6 +55,4 @@ extern void paging_init(void);
#include <asm-generic/pgtable.h>
-#define check_pgt_cache() do { } while (0)
-
#endif /* _M68KNOMMU_PGTABLE_H */
diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h
index f4cc9ffc449e..7ecb05baa601 100644
--- a/arch/microblaze/include/asm/pgalloc.h
+++ b/arch/microblaze/include/asm/pgalloc.h
@@ -21,83 +21,23 @@
#include <asm/cache.h>
#include <asm/pgtable.h>
-#define PGDIR_ORDER 0
-
-/*
- * This is handled very differently on MicroBlaze since out page tables
- * are all 0's and I want to be able to use these zero'd pages elsewhere
- * as well - it gives us quite a speedup.
- * -- Cort
- */
-extern struct pgtable_cache_struct {
- unsigned long *pgd_cache;
- unsigned long *pte_cache;
- unsigned long pgtable_cache_sz;
-} quicklists;
-
-#define pgd_quicklist (quicklists.pgd_cache)
-#define pmd_quicklist ((unsigned long *)0)
-#define pte_quicklist (quicklists.pte_cache)
-#define pgtable_cache_size (quicklists.pgtable_cache_sz)
-
-extern unsigned long *zero_cache; /* head linked list of pre-zero'd pages */
-extern atomic_t zero_sz; /* # currently pre-zero'd pages */
-extern atomic_t zeropage_hits; /* # zero'd pages request that we've done */
-extern atomic_t zeropage_calls; /* # zero'd pages request that've been made */
-extern atomic_t zerototal; /* # pages zero'd over time */
-
-#define zero_quicklist (zero_cache)
-#define zero_cache_sz (zero_sz)
-#define zero_cache_calls (zeropage_calls)
-#define zero_cache_hits (zeropage_hits)
-#define zero_cache_total (zerototal)
-
-/*
- * return a pre-zero'd page from the list,
- * return NULL if none available -- Cort
- */
-extern unsigned long get_zero_page_fast(void);
+#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
+#include <asm-generic/pgalloc.h>
extern void __bad_pte(pmd_t *pmd);
-static inline pgd_t *get_pgd_slow(void)
+static inline pgd_t *get_pgd(void)
{
- pgd_t *ret;
-
- ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER);
- if (ret != NULL)
- clear_page(ret);
- return ret;
+ return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
}
-static inline pgd_t *get_pgd_fast(void)
-{
- unsigned long *ret;
-
- ret = pgd_quicklist;
- if (ret != NULL) {
- pgd_quicklist = (unsigned long *)(*ret);
- ret[0] = 0;
- pgtable_cache_size--;
- } else
- ret = (unsigned long *)get_pgd_slow();
- return (pgd_t *)ret;
-}
-
-static inline void free_pgd_fast(pgd_t *pgd)
-{
- *(unsigned long **)pgd = pgd_quicklist;
- pgd_quicklist = (unsigned long *) pgd;
- pgtable_cache_size++;
-}
-
-static inline void free_pgd_slow(pgd_t *pgd)
+static inline void free_pgd(pgd_t *pgd)
{
free_page((unsigned long)pgd);
}
-#define pgd_free(mm, pgd) free_pgd_fast(pgd)
-#define pgd_alloc(mm) get_pgd_fast()
+#define pgd_free(mm, pgd) free_pgd(pgd)
+#define pgd_alloc(mm) get_pgd()
#define pmd_pgtable(pmd) pmd_page(pmd)
@@ -110,50 +50,6 @@ static inline void free_pgd_slow(pgd_t *pgd)
extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
-static inline struct page *pte_alloc_one(struct mm_struct *mm)
-{
- struct page *ptepage;
-
-#ifdef CONFIG_HIGHPTE
- int flags = GFP_KERNEL | __GFP_HIGHMEM;
-#else
- int flags = GFP_KERNEL;
-#endif
-
- ptepage = alloc_pages(flags, 0);
- if (!ptepage)
- return NULL;
- clear_highpage(ptepage);
- if (!pgtable_page_ctor(ptepage)) {
- __free_page(ptepage);
- return NULL;
- }
- return ptepage;
-}
-
-static inline void pte_free_fast(pte_t *pte)
-{
- *(unsigned long **)pte = pte_quicklist;
- pte_quicklist = (unsigned long *) pte;
- pgtable_cache_size++;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- free_page((unsigned long)pte);
-}
-
-static inline void pte_free_slow(struct page *ptepage)
-{
- __free_page(ptepage);
-}
-
-static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
-{
- pgtable_page_dtor(ptepage);
- __free_page(ptepage);
-}
-
#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte))
#define pmd_populate(mm, pmd, pte) \
@@ -171,10 +67,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
#define __pmd_free_tlb(tlb, x, addr) pmd_free((tlb)->mm, x)
#define pgd_populate(mm, pmd, pte) BUG()
-extern int do_check_pgt_cache(int, int);
-
#endif /* CONFIG_MMU */
-#define check_pgt_cache() do { } while (0)
-
#endif /* _ASM_MICROBLAZE_PGALLOC_H */
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index 142d3f004848..954b69af451f 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -46,8 +46,6 @@ extern int mem_init_done;
#define swapper_pg_dir ((pgd_t *) NULL)
-#define pgtable_cache_init() do {} while (0)
-
#define arch_enter_lazy_cpu_mode() do {} while (0)
#define pgprot_noncached_wc(prot) prot
@@ -526,11 +524,6 @@ extern unsigned long iopa(unsigned long addr);
/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
#define kern_addr_valid(addr) (1)
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
void do_page_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code);
diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index 8fe54fda31dc..010bb9cee2e4 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -44,10 +44,6 @@ unsigned long ioremap_base;
unsigned long ioremap_bot;
EXPORT_SYMBOL(ioremap_bot);
-#ifndef CONFIG_SMP
-struct pgtable_cache_struct quicklists;
-#endif
-
static void __iomem *__ioremap(phys_addr_t addr, unsigned long size,
unsigned long flags)
{
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 35394eda08ac..fe354d97c22e 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -5,7 +5,6 @@ config MIPS
select ARCH_32BIT_OFF_T if !64BIT
select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
select ARCH_CLOCKSOURCE_DATA
- select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_SUPPORTS_UPROBES
@@ -13,6 +12,7 @@ config MIPS
select ARCH_USE_CMPXCHG_LOCKREF if 64BIT
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_IPC_PARSE_VERSION
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index aa16b85ddffc..aa73cb187a07 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -105,8 +105,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
#endif /* __PAGETABLE_PUD_FOLDED */
-#define check_pgt_cache() do { } while (0)
-
extern void pagetable_init(void);
#endif /* _ASM_PGALLOC_H */
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index d60f47a9088c..48446e7c83b1 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -661,9 +661,4 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
#endif /* _ASM_PGTABLE_H */
diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
index aca909bd7841..fba18d4a9190 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -29,11 +29,6 @@
extern unsigned int vced_count, vcei_count;
-/*
- * MIPS does have an arch_pick_mmap_layout()
- */
-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
-
#ifdef CONFIG_32BIT
#ifdef CONFIG_KVM_GUEST
/* User space process size is limited to 1GB in KVM Guest Mode */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index c2b40969eb1f..57dc2ac4f8bd 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -95,6 +95,9 @@
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
+#define MADV_COLD 20 /* deactivate these pages */
+#define MADV_PAGEOUT 21 /* reclaim these pages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index d79f2b432318..00fe90c6db3e 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -20,33 +20,6 @@
unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */
EXPORT_SYMBOL(shm_align_mask);
-/* gap between mmap and stack */
-#define MIN_GAP (128*1024*1024UL)
-#define MAX_GAP ((TASK_SIZE)/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
- if (current->personality & ADDR_COMPAT_LAYOUT)
- return 1;
-
- if (rlim_stack->rlim_cur == RLIM_INFINITY)
- return 1;
-
- return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
- unsigned long gap = rlim_stack->rlim_cur;
-
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
-
- return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
#define COLOUR_ALIGN(addr, pgoff) \
((((addr) + shm_align_mask) & ~shm_align_mask) + \
(((pgoff) << PAGE_SHIFT) & shm_align_mask))
@@ -144,63 +117,6 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
addr0, len, pgoff, flags, DOWN);
}
-unsigned long arch_mmap_rnd(void)
-{
- unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
- if (TASK_IS_32BIT_ADDR)
- rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
- else
-#endif /* CONFIG_COMPAT */
- rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
- return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
- unsigned long random_factor = 0UL;
-
- if (current->flags & PF_RANDOMIZE)
- random_factor = arch_mmap_rnd();
-
- if (mmap_is_legacy(rlim_stack)) {
- mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- mm->get_unmapped_area = arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- }
-}
-
-static inline unsigned long brk_rnd(void)
-{
- unsigned long rnd = get_random_long();
-
- rnd = rnd << PAGE_SHIFT;
- /* 8MB for 32bit, 256MB for 64bit */
- if (TASK_IS_32BIT_ADDR)
- rnd = rnd & 0x7ffffful;
- else
- rnd = rnd & 0xffffffful;
-
- return rnd;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- unsigned long base = mm->brk;
- unsigned long ret;
-
- ret = PAGE_ALIGN(base + brk_rnd());
-
- if (ret < mm->brk)
- return mm->brk;
-
- return ret;
-}
-
bool __virt_addr_valid(const volatile void *kaddr)
{
unsigned long vaddr = (unsigned long)kaddr;
diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h
index e78b43d8389f..37125e6884d7 100644
--- a/arch/nds32/include/asm/pgalloc.h
+++ b/arch/nds32/include/asm/pgalloc.h
@@ -23,8 +23,6 @@
extern pgd_t *pgd_alloc(struct mm_struct *mm);
extern void pgd_free(struct mm_struct *mm, pgd_t * pgd);
-#define check_pgt_cache() do { } while (0)
-
static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
pgtable_t pte;
diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h
index c70cc56bec09..0588ec99725c 100644
--- a/arch/nds32/include/asm/pgtable.h
+++ b/arch/nds32/include/asm/pgtable.h
@@ -403,8 +403,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
* into virtual address `from'
*/
-#define pgtable_cache_init() do { } while (0)
-
#endif /* !__ASSEMBLY__ */
#endif /* _ASMNDS32_PGTABLE_H */
diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index 4bc8cf72067e..750d18d5980b 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -45,6 +45,4 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
tlb_remove_page((tlb), (pte)); \
} while (0)
-#define check_pgt_cache() do { } while (0)
-
#endif /* _ASM_NIOS2_PGALLOC_H */
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index 95237b7f6fc1..99985d8b7166 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -291,8 +291,6 @@ static inline void pte_clear(struct mm_struct *mm,
#include <asm-generic/pgtable.h>
-#define pgtable_cache_init() do { } while (0)
-
extern void __init paging_init(void);
extern void __init mmu_init(void);
diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h
index 3d4b397c2d06..787c1b9d2f6d 100644
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@@ -101,6 +101,4 @@ do { \
#define pmd_pgtable(pmd) pmd_page(pmd)
-#define check_pgt_cache() do { } while (0)
-
#endif
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 497fd908a4c4..8643809d0bcf 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -443,11 +443,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
#include <asm-generic/pgtable.h>
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
typedef pte_t *pte_addr_t;
#endif /* __ASSEMBLY__ */
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h
index 4f2059a50fae..d98647c29b74 100644
--- a/arch/parisc/include/asm/pgalloc.h
+++ b/arch/parisc/include/asm/pgalloc.h
@@ -124,6 +124,4 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
pmd_populate_kernel(mm, pmd, page_address(pte_page))
#define pmd_pgtable(pmd) pmd_page(pmd)
-#define check_pgt_cache() do { } while (0)
-
#endif
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 6d58c1739b42..4ac374b3a99f 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -132,8 +132,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
#define PTRS_PER_PTE (1UL << BITS_PER_PTE)
/* Definitions for 2nd level */
-#define pgtable_cache_init() do { } while (0)
-
#define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE)
#define PMD_SIZE (1UL << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index c98162f494db..6fd8871e4081 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -48,6 +48,9 @@
#define MADV_DONTFORK 10 /* don't inherit across fork */
#define MADV_DOFORK 11 /* do inherit across fork */
+#define MADV_COLD 20 /* deactivate these pages */
+#define MADV_PAGEOUT 21 /* reclaim these pages */
+
#define MADV_MERGEABLE 65 /* KSM may merge identical pages */
#define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index 2b2c60a1a66d..6dd78a2dc03a 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -64,8 +64,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
extern struct kmem_cache *pgtable_cache[];
#define PGT_CACHE(shift) pgtable_cache[shift]
-static inline void check_pgt_cache(void) { }
-
#ifdef CONFIG_PPC_BOOK3S
#include <asm/book3s/pgalloc.h>
#else
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 8b7865a2d576..4053b2ab427c 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -87,7 +87,6 @@ extern unsigned long ioremap_bot;
unsigned long vmalloc_to_phys(void *vmalloc_addr);
void pgtable_cache_add(unsigned int shift);
-void pgtable_cache_init(void);
#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32)
void mark_initmem_nx(void);
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index c3bfef08dcf8..1ce223cb19f9 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1757,7 +1757,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
/*
* IF we try to do a HUGE PTE update after a withdraw is done.
* we will find the below NULL. This happens when we do
- * split_huge_page_pmd
+ * split_huge_pmd
*/
if (!hpte_slot_array)
return;
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index b056cae3388b..56cc84520577 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -129,11 +129,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
* Allow to use larger than 64k IOMMU pages. Only do that
* if we are backed by hugetlb.
*/
- if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
- struct page *head = compound_head(page);
-
- pageshift = compound_order(head) + PAGE_SHIFT;
- }
+ if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page))
+ pageshift = page_shift(compound_head(page));
mem->pageshift = min(mem->pageshift, pageshift);
/*
* We don't need struct page reference any more, switch
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a8953f108808..73d4873fc7f8 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -667,7 +667,7 @@ void flush_dcache_icache_hugepage(struct page *page)
BUG_ON(!PageCompound(page));
- for (i = 0; i < (1UL << compound_order(page)); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
if (!PageHighMem(page)) {
__flush_dcache_icache(page_address(page+i));
} else {
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 254e27ab3c5b..abc7e3b26427 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -56,6 +56,18 @@ config RISCV
select EDAC_SUPPORT
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
+ select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
+ select HAVE_ARCH_MMAP_RND_BITS
+
+config ARCH_MMAP_RND_BITS_MIN
+ default 18 if 64BIT
+ default 8
+
+# max bits determined by the following formula:
+# VA_BITS - PAGE_SHIFT - 3
+config ARCH_MMAP_RND_BITS_MAX
+ default 24 if 64BIT # SV39 based
+ default 17
config MMU
def_bool y
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 56a67d66f72f..f66a00d8cb19 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -82,8 +82,4 @@ do { \
tlb_remove_page((tlb), pte); \
} while (0)
-static inline void check_pgt_cache(void)
-{
-}
-
#endif /* _ASM_RISCV_PGALLOC_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index c24a083b3e12..cf26a9c2e377 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -411,11 +411,6 @@ extern void *dtb_early_va;
extern void setup_bootmem(void);
extern void paging_init(void);
-static inline void pgtable_cache_init(void)
-{
- /* No page table caches to initialize */
-}
-
#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
#define VMALLOC_END (PAGE_OFFSET - 1)
#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0c4600725fc2..36c578c0ff96 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1682,12 +1682,6 @@ extern void s390_reset_cmma(struct mm_struct *mm);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-/*
- * No page table caches to initialise
- */
-static inline void pgtable_cache_init(void) { }
-static inline void check_pgt_cache(void) { }
-
#include <asm-generic/pgtable.h>
#endif /* _S390_PAGE_H */
diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h
index b56f908b1395..8c6341a4d807 100644
--- a/arch/sh/include/asm/pgalloc.h
+++ b/arch/sh/include/asm/pgalloc.h
@@ -2,10 +2,8 @@
#ifndef __ASM_SH_PGALLOC_H
#define __ASM_SH_PGALLOC_H
-#include <linux/quicklist.h>
#include <asm/page.h>
-
-#define QUICK_PT 0 /* Other page table pages that are zero on free */
+#include <asm-generic/pgalloc.h>
extern pgd_t *pgd_alloc(struct mm_struct *);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
@@ -29,41 +27,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
}
#define pmd_pgtable(pmd) pmd_page(pmd)
-/*
- * Allocate and free page tables.
- */
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
- return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
- struct page *page;
- void *pg;
-
- pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL);
- if (!pg)
- return NULL;
- page = virt_to_page(pg);
- if (!pgtable_page_ctor(page)) {
- quicklist_free(QUICK_PT, NULL, pg);
- return NULL;
- }
- return page;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- quicklist_free(QUICK_PT, NULL, pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
- pgtable_page_dtor(pte);
- quicklist_free_page(QUICK_PT, NULL, pte);
-}
-
#define __pte_free_tlb(tlb,pte,addr) \
do { \
pgtable_page_dtor(pte); \
@@ -79,9 +42,4 @@ do { \
} while (0);
#endif
-static inline void check_pgt_cache(void)
-{
- quicklist_trim(QUICK_PT, NULL, 25, 16);
-}
-
#endif /* __ASM_SH_PGALLOC_H */
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 9085d1142fa3..cbd0f3c55a0c 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -123,11 +123,6 @@ typedef pte_t *pte_addr_t;
#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT)))
-/*
- * Initialise the page table caches
- */
-extern void pgtable_cache_init(void);
-
struct vm_area_struct;
struct mm_struct;
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index 02ed2df25a54..5c8a2ebfc720 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -1,9 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
menu "Memory management options"
-config QUICKLIST
- def_bool y
-
config MMU
bool "Support for memory management hardware"
depends on !CPU_SH2
diff --git a/arch/sh/mm/nommu.c b/arch/sh/mm/nommu.c
index cc779a90d917..dca946f426c6 100644
--- a/arch/sh/mm/nommu.c
+++ b/arch/sh/mm/nommu.c
@@ -97,7 +97,3 @@ void __init page_table_range_init(unsigned long start, unsigned long end,
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
}
-
-void pgtable_cache_init(void)
-{
-}
diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h
index 282be50a4adf..10538a4d1a1e 100644
--- a/arch/sparc/include/asm/pgalloc_32.h
+++ b/arch/sparc/include/asm/pgalloc_32.h
@@ -17,8 +17,6 @@ void srmmu_free_nocache(void *addr, int size);
extern struct resource sparc_iomap;
-#define check_pgt_cache() do { } while (0)
-
pgd_t *get_pgd_fast(void);
static inline void free_pgd_fast(pgd_t *pgd)
{
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h
index 48abccba4991..9d3e5cc95bbb 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -69,8 +69,6 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage);
#define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE)
#define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD))
-#define check_pgt_cache() do { } while (0)
-
void pgtable_free(void *table, bool is_page);
#ifdef CONFIG_SMP
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 4eebed6c6781..31da44826645 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -445,9 +445,4 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
/* We provide our own get_unmapped_area to cope with VA holes for userland */
#define HAVE_ARCH_UNMAPPED_AREA
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
#endif /* !(_SPARC_PGTABLE_H) */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 1599de730532..b57f9c631eca 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1135,7 +1135,6 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long,
unsigned long);
#define HAVE_ARCH_FB_UNMAPPED_AREA
-void pgtable_cache_init(void);
void sun4v_register_fault_status(void);
void sun4v_ktsb_register(void);
void __init cheetah_ecache_flush_init(void);
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index 046ab116cc8c..906eda1158b4 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -31,7 +31,6 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/vaddrs.h>
-#include <asm/pgalloc.h> /* bug in asm-generic/tlb.h: check_pgt_cache */
#include <asm/setup.h>
#include <asm/tlb.h>
#include <asm/prom.h>
diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index d7b282e9c4d5..7104b055df5e 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -43,7 +43,5 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
#define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x))
#endif
-#define check_pgt_cache() do { } while (0)
-
#endif
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index b377df76cc28..eb202e505e8c 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -32,8 +32,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
/* zero page used for uninitialized stuff */
extern unsigned long *empty_zero_page;
-#define pgtable_cache_init() do ; while (0)
-
/* Just any arbitrary offset to the start of the vmalloc VM area: the
* current 8MB value just means that there will be a 8MB "hole" after the
* physical memory until the kernel virtual memory starts. That means that
diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h
index 3f0903bd98e9..ba1c9a79993b 100644
--- a/arch/unicore32/include/asm/pgalloc.h
+++ b/arch/unicore32/include/asm/pgalloc.h
@@ -18,8 +18,6 @@
#define __HAVE_ARCH_PTE_ALLOC_ONE
#include <asm-generic/pgalloc.h>
-#define check_pgt_cache() do { } while (0)
-
#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT)
#define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT)
diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h
index 126e961a8cb0..c8f7ba12f309 100644
--- a/arch/unicore32/include/asm/pgtable.h
+++ b/arch/unicore32/include/asm/pgtable.h
@@ -285,8 +285,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
#include <asm-generic/pgtable.h>
-#define pgtable_cache_init() do { } while (0)
-
#endif /* !__ASSEMBLY__ */
#endif /* __UNICORE_PGTABLE_H__ */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index c78da8eda8f2..0dca7f7aeff2 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -29,8 +29,6 @@ extern pgd_t swapper_pg_dir[1024];
extern pgd_t initial_page_table[1024];
extern pmd_t initial_pg_pmd[];
-static inline void pgtable_cache_init(void) { }
-static inline void check_pgt_cache(void) { }
void paging_init(void);
void sync_initial_page_table(void);
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 4990d26dfc73..0b6c4042942a 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -241,9 +241,6 @@ extern void cleanup_highmap(void);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#define pgtable_cache_init() do { } while (0)
-#define check_pgt_cache() do { } while (0)
-
#define PAGE_AGP PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index fa16036fa592..65ebe4b88f7c 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -54,23 +54,10 @@ static u64 get_subtree_max_end(struct rb_node *node)
return ret;
}
-static u64 compute_subtree_max_end(struct memtype *data)
-{
- u64 max_end = data->end, child_max_end;
-
- child_max_end = get_subtree_max_end(data->rb.rb_right);
- if (child_max_end > max_end)
- max_end = child_max_end;
-
- child_max_end = get_subtree_max_end(data->rb.rb_left);
- if (child_max_end > max_end)
- max_end = child_max_end;
-
- return max_end;
-}
+#define NODE_END(node) ((node)->end)
-RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
- u64, subtree_max_end, compute_subtree_max_end)
+RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb,
+ struct memtype, rb, u64, subtree_max_end, NODE_END)
/* Find the first (lowest start addr) overlapping range from rb tree */
static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 44816ff6411f..463940faf52f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -357,7 +357,7 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
static struct kmem_cache *pgd_cache;
-void __init pgd_cache_init(void)
+void __init pgtable_cache_init(void)
{
/*
* When PAE kernel is running as a Xen domain, it does not use
@@ -402,10 +402,6 @@ static inline void _pgd_free(pgd_t *pgd)
}
#else
-void __init pgd_cache_init(void)
-{
-}
-
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index ce3ff5e591b9..3f7fe5a8c286 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -238,7 +238,6 @@ extern void paging_init(void);
# define swapper_pg_dir NULL
static inline void paging_init(void) { }
#endif
-static inline void pgtable_cache_init(void) { }
/*
* The pmd contains the kernel virtual address of the pte page.
diff --git a/arch/xtensa/include/asm/tlbflush.h b/arch/xtensa/include/asm/tlbflush.h
index 06875feb27c2..856e2da2e397 100644
--- a/arch/xtensa/include/asm/tlbflush.h
+++ b/arch/xtensa/include/asm/tlbflush.h
@@ -160,9 +160,6 @@ static inline void invalidate_dtlb_mapping (unsigned address)
invalidate_dtlb_entry(tlb_entry);
}
-#define check_pgt_cache() do { } while (0)
-
-
/*
* DO NOT USE THESE FUNCTIONS. These instructions aren't part of the Xtensa
* ISA and exist only for test purposes..
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index ebbb48842190..e5e643752947 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -103,6 +103,9 @@
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
+#define MADV_COLD 20 /* deactivate these pages */
+#define MADV_PAGEOUT 21 /* reclaim these pages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 20c39d1bcef8..6bea4f3f8040 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -100,26 +100,9 @@ unsigned long __weak memory_block_size_bytes(void)
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);
-static unsigned long get_memory_block_size(void)
-{
- unsigned long block_sz;
-
- block_sz = memory_block_size_bytes();
-
- /* Validate blk_sz is a power of 2 and not less than section size */
- if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
- WARN_ON(1);
- block_sz = MIN_MEMORY_BLOCK_SIZE;
- }
-
- return block_sz;
-}
-
/*
- * use this as the physical section index that this memsection
- * uses.
+ * Show the first physical section index (number) of this memory block.
*/
-
static ssize_t phys_index_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -131,7 +114,10 @@ static ssize_t phys_index_show(struct device *dev,
}
/*
- * Show whether the section of memory is likely to be hot-removable
+ * Show whether the memory block is likely to be offlineable (or is already
+ * offline). Once offline, the memory block could be removed. The return
+ * value does, however, not indicate that there is a way to remove the
+ * memory block.
*/
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
char *buf)
@@ -455,12 +441,12 @@ static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);
/*
- * Block size attribute stuff
+ * Show the memory block size (shared by all memory blocks).
*/
static ssize_t block_size_bytes_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%lx\n", get_memory_block_size());
+ return sprintf(buf, "%lx\n", memory_block_size_bytes());
}
static DEVICE_ATTR_RO(block_size_bytes);
@@ -670,10 +656,10 @@ static int init_memory_block(struct memory_block **memory,
return -ENOMEM;
mem->start_section_nr = block_id * sections_per_block;
- mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
mem->state = state;
start_pfn = section_nr_to_pfn(mem->start_section_nr);
mem->phys_device = arch_get_memory_phys_device(start_pfn);
+ mem->nid = NUMA_NO_NODE;
ret = register_memory(mem);
@@ -810,19 +796,22 @@ static const struct attribute_group *memory_root_attr_groups[] = {
/*
* Initialize the sysfs support for memory devices...
*/
-int __init memory_dev_init(void)
+void __init memory_dev_init(void)
{
int ret;
int err;
unsigned long block_sz, nr;
+ /* Validate the configured memory block size */
+ block_sz = memory_block_size_bytes();
+ if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
+ panic("Memory block size not suitable: 0x%lx\n", block_sz);
+ sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+
ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
if (ret)
goto out;
- block_sz = get_memory_block_size();
- sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
-
/*
* Create entries for memory sections that were found
* during boot and have been initialized
@@ -838,8 +827,7 @@ int __init memory_dev_init(void)
out:
if (ret)
- printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
- return ret;
+ panic("%s() failed: %d\n", __func__, ret);
}
/**
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 75b7e6f6535b..296546ffed6c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -427,6 +427,8 @@ static ssize_t node_read_meminfo(struct device *dev,
"Node %d AnonHugePages: %8lu kB\n"
"Node %d ShmemHugePages: %8lu kB\n"
"Node %d ShmemPmdMapped: %8lu kB\n"
+ "Node %d FileHugePages: %8lu kB\n"
+ "Node %d FilePmdMapped: %8lu kB\n"
#endif
,
nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
@@ -452,6 +454,10 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
HPAGE_PMD_NR),
nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
+ HPAGE_PMD_NR),
+ nid, K(node_page_state(pgdat, NR_FILE_THPS) *
+ HPAGE_PMD_NR),
+ nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
HPAGE_PMD_NR)
#endif
);
@@ -756,15 +762,13 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
static int register_mem_sect_under_node(struct memory_block *mem_blk,
void *arg)
{
+ unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
+ unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
+ unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
int ret, nid = *(int *)arg;
- unsigned long pfn, sect_start_pfn, sect_end_pfn;
+ unsigned long pfn;
- mem_blk->nid = nid;
-
- sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
- sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
- sect_end_pfn += PAGES_PER_SECTION - 1;
- for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
+ for (pfn = start_pfn; pfn <= end_pfn; pfn++) {
int page_nid;
/*
@@ -789,6 +793,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
if (page_nid != nid)
continue;
}
+
+ /*
+ * If this memory block spans multiple nodes, we only indicate
+ * the last processed node.
+ */
+ mem_blk->nid = nid;
+
ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
&mem_blk->dev.kobj,
kobject_name(&mem_blk->dev.kobj));
@@ -804,32 +815,18 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
}
/*
- * Unregister memory block device under all nodes that it spans.
- * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
+ * Unregister a memory block device under the node it spans. Memory blocks
+ * with multiple nodes cannot be offlined and therefore also never be removed.
*/
void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
{
- unsigned long pfn, sect_start_pfn, sect_end_pfn;
- static nodemask_t unlinked_nodes;
-
- nodes_clear(unlinked_nodes);
- sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
- sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
- for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
- int nid;
+ if (mem_blk->nid == NUMA_NO_NODE)
+ return;
- nid = get_nid_for_pfn(pfn);
- if (nid < 0)
- continue;
- if (!node_online(nid))
- continue;
- if (node_test_and_set(nid, unlinked_nodes))
- continue;
- sysfs_remove_link(&node_devices[nid]->dev.kobj,
- kobject_name(&mem_blk->dev.kobj));
- sysfs_remove_link(&mem_blk->dev.kobj,
- kobject_name(&node_devices[nid]->dev.kobj));
- }
+ sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
+ kobject_name(&mem_blk->dev.kobj));
+ sysfs_remove_link(&mem_blk->dev.kobj,
+ kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
}
int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c
index c58986556161..651bd0236a99 100644
--- a/drivers/block/drbd/drbd_interval.c
+++ b/drivers/block/drbd/drbd_interval.c
@@ -13,33 +13,10 @@ sector_t interval_end(struct rb_node *node)
return this->end;
}
-/**
- * compute_subtree_last - compute end of @node
- *
- * The end of an interval is the highest (start + (size >> 9)) value of this
- * node and of its children. Called for @node and its parents whenever the end
- * may have changed.
- */
-static inline sector_t
-compute_subtree_last(struct drbd_interval *node)
-{
- sector_t max = node->sector + (node->size >> 9);
-
- if (node->rb.rb_left) {
- sector_t left = interval_end(node->rb.rb_left);
- if (left > max)
- max = left;
- }
- if (node->rb.rb_right) {
- sector_t right = interval_end(node->rb.rb_right);
- if (right > max)
- max = right;
- }
- return max;
-}
+#define NODE_END(node) ((node)->sector + ((node)->size >> 9))
-RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb,
- sector_t, end, compute_subtree_last);
+RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks,
+ struct drbd_interval, rb, sector_t, end, NODE_END);
/**
* drbd_insert_interval - insert a new interval into a tree
diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index c70cb5f272cf..0891ab829b1b 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -1078,7 +1078,7 @@ new_buf:
bool merge;
if (page)
- pg_size <<= compound_order(page);
+ pg_size = page_size(page);
if (off < pg_size &&
skb_can_coalesce(skb, i, page, off)) {
merge = 1;
@@ -1105,8 +1105,7 @@ new_buf:
__GFP_NORETRY,
order);
if (page)
- pg_size <<=
- compound_order(page);
+ pg_size <<= order;
}
if (!page) {
page = alloc_page(gfp);
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index feaa538026a0..3db000aacd26 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -174,7 +174,6 @@ via_map_blit_for_device(struct pci_dev *pdev,
static void
via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg)
{
- struct page *page;
int i;
switch (vsg->state) {
@@ -189,13 +188,8 @@ via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg)
kfree(vsg->desc_pages);
/* fall through */
case dr_via_pages_locked:
- for (i = 0; i < vsg->num_pages; ++i) {
- if (NULL != (page = vsg->pages[i])) {
- if (!PageReserved(page) && (DMA_FROM_DEVICE == vsg->direction))
- SetPageDirty(page);
- put_page(page);
- }
- }
+ put_user_pages_dirty_lock(vsg->pages, vsg->num_pages,
+ (vsg->direction == DMA_FROM_DEVICE));
/* fall through */
case dr_via_pages_alloc:
vfree(vsg->pages);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 41f9e268e3fb..24244a2f68cc 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -54,10 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
page = sg_page_iter_page(&sg_iter);
- if (umem->writable && dirty)
- put_user_pages_dirty_lock(&page, 1);
- else
- put_user_page(page);
+ put_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
}
sg_free_table(&umem->sg_head);
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index b89a9b9aef7a..469acb961fbd 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -118,10 +118,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np
void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
size_t npages, bool dirty)
{
- if (dirty)
- put_user_pages_dirty_lock(p, npages);
- else
- put_user_pages(p, npages);
+ put_user_pages_dirty_lock(p, npages, dirty);
if (mm) { /* during close after signal, mm can be NULL */
atomic64_sub(npages, &mm->pinned_vm);
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index bfbfbb7e0ff4..6bf764e41891 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -40,10 +40,7 @@
static void __qib_release_user_pages(struct page **p, size_t num_pages,
int dirty)
{
- if (dirty)
- put_user_pages_dirty_lock(p, num_pages);
- else
- put_user_pages(p, num_pages);
+ put_user_pages_dirty_lock(p, num_pages, dirty);
}
/**
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 0b0237d41613..62e6ffa9ad78 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -75,10 +75,7 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty)
for_each_sg(chunk->page_list, sg, chunk->nents, i) {
page = sg_page(sg);
pa = sg_phys(sg);
- if (dirty)
- put_user_pages_dirty_lock(&page, 1);
- else
- put_user_page(page);
+ put_user_pages_dirty_lock(&page, 1, dirty);
usnic_dbg("pa: %pa\n", &pa);
}
kfree(chunk);
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
index 87a56039f0ef..e99983f07663 100644
--- a/drivers/infiniband/sw/siw/siw_mem.c
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -63,15 +63,7 @@ struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
bool dirty)
{
- struct page **p = chunk->plist;
-
- while (num_pages--) {
- if (!PageDirty(*p) && dirty)
- put_user_pages_dirty_lock(p, 1);
- else
- put_user_page(*p);
- p++;
- }
+ put_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
}
void siw_umem_release(struct siw_umem *umem, bool dirty)
diff --git a/drivers/misc/sram-exec.c b/drivers/misc/sram-exec.c
index 426ad912b441..d054e2842a5f 100644
--- a/drivers/misc/sram-exec.c
+++ b/drivers/misc/sram-exec.c
@@ -96,7 +96,7 @@ void *sram_exec_copy(struct gen_pool *pool, void *dst, void *src,
if (!part)
return NULL;
- if (!addr_in_gen_pool(pool, (unsigned long)dst, size))
+ if (!gen_pool_has_addr(pool, (unsigned long)dst, size))
return NULL;
base = (unsigned long)part->base;
diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c
index aa8d8425be25..b83a1d16bd89 100644
--- a/drivers/staging/android/ion/ion_system_heap.c
+++ b/drivers/staging/android/ion/ion_system_heap.c
@@ -120,7 +120,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap,
if (!page)
goto free_pages;
list_add_tail(&page->lru, &pages);
- size_remaining -= PAGE_SIZE << compound_order(page);
+ size_remaining -= page_size(page);
max_order = compound_order(page);
i++;
}
@@ -133,7 +133,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap,
sg = table->sgl;
list_for_each_entry_safe(page, tmp_page, &pages, lru) {
- sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0);
+ sg_set_page(sg, page, page_size(page), 0);
sg = sg_next(sg);
list_del(&page->lru);
}
diff --git a/drivers/target/tcm_fc/tfc_io.c b/drivers/target/tcm_fc/tfc_io.c
index a254792d882c..1354a157e9af 100644
--- a/drivers/target/tcm_fc/tfc_io.c
+++ b/drivers/target/tcm_fc/tfc_io.c
@@ -136,8 +136,7 @@ int ft_queue_data_in(struct se_cmd *se_cmd)
page, off_in_page, tlen);
fr_len(fp) += tlen;
fp_skb(fp)->data_len += tlen;
- fp_skb(fp)->truesize +=
- PAGE_SIZE << compound_order(page);
+ fp_skb(fp)->truesize += page_size(page);
} else {
BUG_ON(!page);
from = kmap_atomic(page + (mem_off >> PAGE_SHIFT));
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index babef8b00daf..646dfdc46958 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -176,13 +176,13 @@ put_exit:
}
static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
- unsigned int page_shift)
+ unsigned int it_page_shift)
{
struct page *page;
unsigned long size = 0;
- if (mm_iommu_is_devmem(mm, hpa, page_shift, &size))
- return size == (1UL << page_shift);
+ if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
+ return size == (1UL << it_page_shift);
page = pfn_to_page(hpa >> PAGE_SHIFT);
/*
@@ -190,7 +190,7 @@ static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
* a page we just found. Otherwise the hardware can get access to
* a bigger memory chunk that it should.
*/
- return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
+ return page_shift(compound_head(page)) >= it_page_shift;
}
static inline bool tce_groups_attached(struct tce_container *container)
diff --git a/fs/aio.c b/fs/aio.c
index 01e0fb9ae45a..f9f441b59966 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1287,12 +1287,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
* the ringbuffer empty. So in practice we should be ok, but it's
* something to be aware of when touching this code.
*/
- if (until == 0)
- aio_read_events(ctx, min_nr, nr, event, &ret);
- else
- wait_event_interruptible_hrtimeout(ctx->wait,
- aio_read_events(ctx, min_nr, nr, event, &ret),
- until);
+ wait_event_interruptible_hrtimeout(ctx->wait,
+ aio_read_events(ctx, min_nr, nr, event, &ret),
+ until);
return ret;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d4e11b2e04f6..cec3b4146440 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -670,26 +670,6 @@ out:
* libraries. There is no binary dependent code anywhere else.
*/
-#ifndef STACK_RND_MASK
-#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
-#endif
-
-static unsigned long randomize_stack_top(unsigned long stack_top)
-{
- unsigned long random_variable = 0;
-
- if (current->flags & PF_RANDOMIZE) {
- random_variable = get_random_long();
- random_variable &= STACK_RND_MASK;
- random_variable <<= PAGE_SHIFT;
- }
-#ifdef CONFIG_STACK_GROWSUP
- return PAGE_ALIGN(stack_top) + random_variable;
-#else
- return PAGE_ALIGN(stack_top) - random_variable;
-#endif
-}
-
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
diff --git a/fs/buffer.c b/fs/buffer.c
index 86a38b979323..131d39ec7d31 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -956,10 +956,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -979,6 +989,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1034,6 +1047,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1045,6 +1064,24 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1L :
+ bdev->bd_super->s_blocksize, size,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1 :
+ bdev->bd_super->s_blocksize_bits,
+ IS_ERR_OR_NULL(bdev->bd_inode) ? -1 :
+ bdev->bd_inode->i_blkbits);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3223,6 +3260,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3261,11 +3303,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3289,6 +3338,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index fde74bebf0c2..ada78a0f49bf 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -14,6 +14,7 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
+#include <linux/psi.h>
#include <linux/swap.h>
#include <linux/prefetch.h>
#include <linux/uio.h>
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 922a0c6ba46c..3ed41a002ace 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -51,6 +51,7 @@ struct fat_mount_options {
tz_set:1, /* Filesystem timestamps' offset set */
rodir:1, /* allow ATTR_RO for directory */
discard:1, /* Issue discard requests on deletions */
+ barrier:1, /* Issue FLUSH command */
dos1xfloppy:1; /* Assume default BPB for DOS 1.x floppies */
};
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 4614c0ba5f1c..411069da8d7f 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -194,17 +194,21 @@ static int fat_file_release(struct inode *inode, struct file *filp)
int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *inode = filp->f_mapping->host;
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
int err;
err = __generic_file_fsync(filp, start, end, datasync);
if (err)
return err;
- err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
+ err = sync_mapping_buffers(sbi->fat_inode->i_mapping);
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (sbi->options.barrier)
+ err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+
+ return err;
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 05689198f5af..0bc2abc5d453 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1011,6 +1011,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",nfs=stale_rw");
if (opts->discard)
seq_puts(m, ",discard");
+ if (!opts->barrier)
+ seq_puts(m, ",nobarrier");
if (opts->dos1xfloppy)
seq_puts(m, ",dos1xfloppy");
@@ -1026,8 +1028,9 @@ enum {
Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
- Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
- Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy,
+ Opt_err_panic, Opt_err_ro, Opt_discard, Opt_barrier, Opt_nobarrier,
+ Opt_nfs, Opt_time_offset, Opt_nfs_stale_rw, Opt_nfs_nostale_ro,
+ Opt_err, Opt_dos1xfloppy,
};
static const match_table_t fat_tokens = {
@@ -1057,6 +1060,8 @@ static const match_table_t fat_tokens = {
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
{Opt_discard, "discard"},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
{Opt_nfs_stale_rw, "nfs"},
{Opt_nfs_stale_rw, "nfs=stale_rw"},
{Opt_nfs_nostale_ro, "nfs=nostale_ro"},
@@ -1141,6 +1146,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
opts->numtail = 1;
opts->usefree = opts->nocase = 0;
opts->tz_set = 0;
+ opts->barrier = 1;
opts->nfs = 0;
opts->errors = FAT_ERRORS_RO;
*debug = 0;
@@ -1264,6 +1270,15 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
case Opt_err_ro:
opts->errors = FAT_ERRORS_RO;
break;
+ case Opt_discard:
+ opts->discard = 1;
+ break;
+ case Opt_barrier:
+ opts->barrier = 1;
+ break;
+ case Opt_nobarrier:
+ opts->barrier = 0;
+ break;
case Opt_nfs_stale_rw:
opts->nfs = FAT_NFS_STALE_RW;
break;
@@ -1327,9 +1342,6 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
case Opt_rodir:
opts->rodir = 1;
break;
- case Opt_discard:
- opts->discard = 1;
- break;
/* obsolete mount options */
case Opt_obsolete:
diff --git a/fs/inode.c b/fs/inode.c
index 0f1e3b563c47..ef33fdf0105f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -181,6 +181,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mapping->flags = 0;
mapping->wb_err = 0;
atomic_set(&mapping->i_mmap_writable, 0);
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ atomic_set(&mapping->nr_thps, 0);
+#endif
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
mapping->writeback_index = 0;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 17dfe57c57f8..e9eb0efb81ab 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3162,7 +3162,7 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
}
page = virt_to_head_page(ptr);
- if (sz > (PAGE_SIZE << compound_order(page)))
+ if (sz > page_size(page))
return -EINVAL;
pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 953990eb70a9..1c58859aa592 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -89,8 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_inode_add_write);
-EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index afc06daee5bb..bee8498d7792 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2622,18 +2622,6 @@ done:
return 0;
}
-int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
-{
- return jbd2_journal_file_inode(handle, jinode,
- JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX);
-}
-
-int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
-{
- return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0,
- LLONG_MAX);
-}
-
int jbd2_journal_inode_ranged_write(handle_t *handle,
struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
{
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0c335b51043d..f9baefc76cf9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5993,6 +5993,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
struct buffer_head *data_alloc_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
+ struct ocfs2_journal *journal = osb->journal;
BUG_ON(inode_trylock(tl_inode));
@@ -6013,6 +6014,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out;
}
+ /* Appending truncate log(TA) and and flushing truncate log(TF) are
+ * two separated transactions. They can be both committed but not
+ * checkpointed. If crash occurs then, both two transaction will be
+ * replayed with several already released to global bitmap clusters.
+ * Then truncate log will be replayed resulting in cluster double free.
+ */
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
data_alloc_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
@@ -6792,6 +6807,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
struct page *page, int zero, u64 *phys)
{
int ret, partial = 0;
+ loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from;
+ loff_t length = to - from;
ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
if (ret)
@@ -6811,7 +6828,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
if (ret < 0)
mlog_errno(ret);
else if (ocfs2_should_order_data(inode)) {
- ret = ocfs2_jbd2_file_inode(handle, inode);
+ ret = ocfs2_jbd2_inode_add_write(handle, inode,
+ start_byte, length);
if (ret < 0)
mlog_errno(ret);
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a4c905d6b575..9cd0a6815933 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -942,7 +942,8 @@ static void ocfs2_write_failure(struct inode *inode,
if (tmppage && page_has_buffers(tmppage)) {
if (ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(wc->w_handle, inode);
+ ocfs2_jbd2_inode_add_write(wc->w_handle, inode,
+ user_pos, user_len);
block_commit_write(tmppage, from, to);
}
@@ -2023,8 +2024,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
if (page_has_buffers(tmppage)) {
- if (handle && ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(handle, inode);
+ if (handle && ocfs2_should_order_data(inode)) {
+ loff_t start_byte =
+ ((loff_t)tmppage->index << PAGE_SHIFT) +
+ from;
+ loff_t length = to - from;
+ ocfs2_jbd2_inode_add_write(handle, inode,
+ start_byte, length);
+ }
block_commit_write(tmppage, from, to);
}
}
@@ -2042,7 +2049,8 @@ out_write_size:
inode->i_mtime = inode->i_ctime = current_time(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ if (handle)
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
}
if (handle)
ocfs2_journal_dirty(handle, wc->w_di_bh);
@@ -2139,13 +2147,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_dio_write_ctxt *dwc = NULL;
struct buffer_head *di_bh = NULL;
u64 p_blkno;
- loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
unsigned len, total_len = bh_result->b_size;
int ret = 0, first_get_block = 0;
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
len = min(total_len, len);
+ /*
+ * bh_result->b_size is count in get_more_blocks according to write
+ * "pos" and "end", we need map twice to return different buffer state:
+ * 1. area in file size, not set NEW;
+ * 2. area out file size, set NEW.
+ *
+ * iblock endblk
+ * |--------|---------|---------|---------
+ * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
mlog(0, "get block of %lu at %llu:%u req %u\n",
inode->i_ino, pos, len, total_len);
@@ -2229,6 +2254,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
if (desc->c_needs_zero)
set_buffer_new(bh_result);
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
/* May sleep in end_io. It should not happen in a irq context. So defer
* it to dio work queue. */
set_buffer_defer_completion(bh_result);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 429e6a8359a5..eaf042feaf5e 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -231,14 +231,6 @@ static int blockcheck_u64_get(void *data, u64 *val)
}
DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
-static struct dentry *blockcheck_debugfs_create(const char *name,
- struct dentry *parent,
- u64 *value)
-{
- return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
- &blockcheck_fops);
-}
-
static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
{
if (stats) {
@@ -250,16 +242,20 @@ static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
struct dentry *parent)
{
- stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
+ struct dentry *dir;
+
+ dir = debugfs_create_dir("blockcheck", parent);
+ stats->b_debug_dir = dir;
+
+ debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir,
+ &stats->b_check_count, &blockcheck_fops);
- blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir,
- &stats->b_check_count);
+ debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir,
+ &stats->b_failure_count, &blockcheck_fops);
- blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir,
- &stats->b_failure_count);
+ debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir,
+ &stats->b_recover_count, &blockcheck_fops);
- blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir,
- &stats->b_recover_count);
}
#else
static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f1b613327ac8..a368350d4c27 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -225,10 +225,6 @@ struct o2hb_region {
unsigned int hr_region_num;
struct dentry *hr_debug_dir;
- struct dentry *hr_debug_livenodes;
- struct dentry *hr_debug_regnum;
- struct dentry *hr_debug_elapsed_time;
- struct dentry *hr_debug_pinned;
struct o2hb_debug_buf *hr_db_livenodes;
struct o2hb_debug_buf *hr_db_regnum;
struct o2hb_debug_buf *hr_db_elapsed_time;
@@ -1394,21 +1390,20 @@ void o2hb_exit(void)
kfree(o2hb_db_failedregions);
}
-static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
- struct o2hb_debug_buf **db, int db_len,
- int type, int size, int len, void *data)
+static void o2hb_debug_create(const char *name, struct dentry *dir,
+ struct o2hb_debug_buf **db, int db_len, int type,
+ int size, int len, void *data)
{
*db = kmalloc(db_len, GFP_KERNEL);
if (!*db)
- return NULL;
+ return;
(*db)->db_type = type;
(*db)->db_size = size;
(*db)->db_len = len;
(*db)->db_data = data;
- return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
- &o2hb_debug_fops);
+ debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops);
}
static void o2hb_debug_init(void)
@@ -1525,11 +1520,7 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slots);
- debugfs_remove(reg->hr_debug_livenodes);
- debugfs_remove(reg->hr_debug_regnum);
- debugfs_remove(reg->hr_debug_elapsed_time);
- debugfs_remove(reg->hr_debug_pinned);
- debugfs_remove(reg->hr_debug_dir);
+ debugfs_remove_recursive(reg->hr_debug_dir);
kfree(reg->hr_db_livenodes);
kfree(reg->hr_db_regnum);
kfree(reg->hr_db_elapsed_time);
@@ -1988,69 +1979,33 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
: NULL;
}
-static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+static void o2hb_debug_region_init(struct o2hb_region *reg,
+ struct dentry *parent)
{
- int ret = -ENOMEM;
+ struct dentry *dir;
- reg->hr_debug_dir =
- debugfs_create_dir(config_item_name(&reg->hr_item), dir);
- if (!reg->hr_debug_dir) {
- mlog_errno(ret);
- goto bail;
- }
+ dir = debugfs_create_dir(config_item_name(&reg->hr_item), parent);
+ reg->hr_debug_dir = dir;
- reg->hr_debug_livenodes =
- o2hb_debug_create(O2HB_DEBUG_LIVENODES,
- reg->hr_debug_dir,
- &(reg->hr_db_livenodes),
- sizeof(*(reg->hr_db_livenodes)),
- O2HB_DB_TYPE_REGION_LIVENODES,
- sizeof(reg->hr_live_node_bitmap),
- O2NM_MAX_NODES, reg);
- if (!reg->hr_debug_livenodes) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes),
+ sizeof(*(reg->hr_db_livenodes)),
+ O2HB_DB_TYPE_REGION_LIVENODES,
+ sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES,
+ reg);
- reg->hr_debug_regnum =
- o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
- reg->hr_debug_dir,
- &(reg->hr_db_regnum),
- sizeof(*(reg->hr_db_regnum)),
- O2HB_DB_TYPE_REGION_NUMBER,
- 0, O2NM_MAX_NODES, reg);
- if (!reg->hr_debug_regnum) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum),
+ sizeof(*(reg->hr_db_regnum)),
+ O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg);
- reg->hr_debug_elapsed_time =
- o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
- reg->hr_debug_dir,
- &(reg->hr_db_elapsed_time),
- sizeof(*(reg->hr_db_elapsed_time)),
- O2HB_DB_TYPE_REGION_ELAPSED_TIME,
- 0, 0, reg);
- if (!reg->hr_debug_elapsed_time) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir,
+ &(reg->hr_db_elapsed_time),
+ sizeof(*(reg->hr_db_elapsed_time)),
+ O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg);
- reg->hr_debug_pinned =
- o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
- reg->hr_debug_dir,
- &(reg->hr_db_pinned),
- sizeof(*(reg->hr_db_pinned)),
- O2HB_DB_TYPE_REGION_PINNED,
- 0, 0, reg);
- if (!reg->hr_debug_pinned) {
- mlog_errno(ret);
- goto bail;
- }
+ o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned),
+ sizeof(*(reg->hr_db_pinned)),
+ O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg);
- ret = 0;
-bail:
- return ret;
}
static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
@@ -2106,11 +2061,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
if (ret)
goto unregister_handler;
- ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
- if (ret) {
- config_item_put(&reg->hr_item);
- goto unregister_handler;
- }
+ o2hb_debug_region_init(reg, o2hb_debug_dir);
return &reg->hr_item;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 784426dee56c..bdef72c0f099 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3636,7 +3636,7 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
int i, j, num_used;
u32 major_hash;
struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
- struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+ struct ocfs2_dx_entry_list *orig_list, *tmp_list;
struct ocfs2_dx_entry *dx_entry;
tmp_list = &tmp_dx_leaf->dl_list;
@@ -3645,7 +3645,6 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
orig_list = &orig_dx_leaf->dl_list;
new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
- new_list = &new_dx_leaf->dl_list;
num_used = le16_to_cpu(orig_list->de_num_used);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 69a429b625cc..aaf24548b02a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -142,7 +142,6 @@ struct dlm_ctxt
atomic_t res_tot_count;
atomic_t res_cur_count;
- struct dlm_debug_ctxt *dlm_debug_ctxt;
struct dentry *dlm_debugfs_subroot;
/* NOTE: Next three are protected by dlm_domain_lock */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index a4b58ba99927..4d0b452012b2 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -853,67 +853,34 @@ static const struct file_operations debug_state_fops = {
/* files in subroot */
void dlm_debug_init(struct dlm_ctxt *dlm)
{
- struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
-
/* for dumping dlm_ctxt */
- dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_state_fops);
+ debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm, &debug_state_fops);
/* for dumping lockres */
- dc->debug_lockres_dentry =
- debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_lockres_fops);
+ debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops);
/* for dumping mles */
- dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_mle_fops);
+ debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops);
/* for dumping lockres on the purge list */
- dc->debug_purgelist_dentry =
- debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
- S_IFREG|S_IRUSR,
- dlm->dlm_debugfs_subroot,
- dlm, &debug_purgelist_fops);
-}
-
-void dlm_debug_shutdown(struct dlm_ctxt *dlm)
-{
- struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
-
- if (dc) {
- debugfs_remove(dc->debug_purgelist_dentry);
- debugfs_remove(dc->debug_mle_dentry);
- debugfs_remove(dc->debug_lockres_dentry);
- debugfs_remove(dc->debug_state_dentry);
- kfree(dc);
- dc = NULL;
- }
+ debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR,
+ dlm->dlm_debugfs_subroot, dlm,
+ &debug_purgelist_fops);
}
/* subroot - domain dir */
-int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
- dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
- GFP_KERNEL);
- if (!dlm->dlm_debug_ctxt) {
- mlog_errno(-ENOMEM);
- return -ENOMEM;
- }
-
dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
dlm_debugfs_root);
- return 0;
}
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
- debugfs_remove(dlm->dlm_debugfs_subroot);
+ debugfs_remove_recursive(dlm->dlm_debugfs_subroot);
}
/* debugfs root */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 7d0c7c9013ce..f8fd8680a4b6 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -14,13 +14,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle);
#ifdef CONFIG_DEBUG_FS
-struct dlm_debug_ctxt {
- struct dentry *debug_state_dentry;
- struct dentry *debug_lockres_dentry;
- struct dentry *debug_mle_dentry;
- struct dentry *debug_purgelist_dentry;
-};
-
struct debug_lockres {
int dl_len;
char *dl_buf;
@@ -29,9 +22,8 @@ struct debug_lockres {
};
void dlm_debug_init(struct dlm_ctxt *dlm);
-void dlm_debug_shutdown(struct dlm_ctxt *dlm);
-int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
void dlm_create_debugfs_root(void);
@@ -42,12 +34,8 @@ void dlm_destroy_debugfs_root(void);
static inline void dlm_debug_init(struct dlm_ctxt *dlm)
{
}
-static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
-{
-}
-static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
- return 0;
}
static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7338b5d4647c..ee6f459f9770 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -387,7 +387,6 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
{
dlm_unregister_domain_handlers(dlm);
- dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
dlm_destroy_dlm_worker(dlm);
@@ -1938,7 +1937,6 @@ bail:
if (status) {
dlm_unregister_domain_handlers(dlm);
- dlm_debug_shutdown(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
dlm_destroy_dlm_worker(dlm);
@@ -1992,9 +1990,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->key = key;
dlm->node_num = o2nm_this_node();
- ret = dlm_create_debugfs_subroot(dlm);
- if (ret < 0)
- goto leave;
+ dlm_create_debugfs_subroot(dlm);
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
@@ -2056,6 +2052,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
mlog(0, "context init: refcount %u\n",
kref_read(&dlm->dlm_refs));
+ ret = 0;
leave:
if (ret < 0 && dlm) {
if (dlm->master_hash)
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index e78657742bd8..3883633e82eb 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -90,7 +90,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
enum dlm_status status;
int actions = 0;
int in_use;
- u8 owner;
+ u8 owner;
+ int recovery_wait = 0;
mlog(0, "master_node = %d, valblk = %d\n", master_node,
flags & LKM_VALBLK);
@@ -193,9 +194,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
}
if (flags & LKM_CANCEL)
lock->cancel_pending = 0;
- else
- lock->unlock_pending = 0;
-
+ else {
+ if (!lock->unlock_pending)
+ recovery_wait = 1;
+ else
+ lock->unlock_pending = 0;
+ }
}
/* get an extra ref on lock. if we are just switching
@@ -229,6 +233,17 @@ leave:
spin_unlock(&res->spinlock);
wake_up(&res->wq);
+ if (recovery_wait) {
+ spin_lock(&res->spinlock);
+ /* Unlock request will directly succeed after owner dies,
+ * and the lock is already removed from grant list. We have to
+ * wait for RECOVERING done or we miss the chance to purge it
+ * since the removement is much faster than RECOVERING proc.
+ */
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING);
+ spin_unlock(&res->spinlock);
+ }
+
/* let the caller's final dlm_lock_put handle the actual kfree */
if (actions & DLM_UNLOCK_FREE_LOCK) {
/* this should always be coupled with list removal */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 14207234fa3d..ad594fef2ab0 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3012,8 +3012,6 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
kref_init(&dlm_debug->d_refcnt);
INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
- dlm_debug->d_locking_state = NULL;
- dlm_debug->d_locking_filter = NULL;
dlm_debug->d_filter_secs = 0;
out:
return dlm_debug;
@@ -3282,27 +3280,19 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb)
{
struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
- dlm_debug->d_locking_state = debugfs_create_file("locking_state",
- S_IFREG|S_IRUSR,
- osb->osb_debug_root,
- osb,
- &ocfs2_dlm_debug_fops);
+ debugfs_create_file("locking_state", S_IFREG|S_IRUSR,
+ osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops);
- dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter",
- 0600,
- osb->osb_debug_root,
- &dlm_debug->d_filter_secs);
+ debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
+ &dlm_debug->d_filter_secs);
}
static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
{
struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
- if (dlm_debug) {
- debugfs_remove(dlm_debug->d_locking_state);
- debugfs_remove(dlm_debug->d_locking_filter);
+ if (dlm_debug)
ocfs2_put_dlm_debug(dlm_debug);
- }
}
int ocfs2_dlm_init(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4435df3e5adb..2e982db3e1ae 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -706,7 +706,9 @@ leave:
* Thus, we need to explicitly order the zeroed pages.
*/
static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
- struct buffer_head *di_bh)
+ struct buffer_head *di_bh,
+ loff_t start_byte,
+ loff_t length)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
handle_t *handle = NULL;
@@ -722,7 +724,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
goto out;
}
- ret = ocfs2_jbd2_file_inode(handle, inode);
+ ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -761,7 +763,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
- handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
+ handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
+ abs_from,
+ abs_to - abs_from);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -2126,7 +2130,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
struct buffer_head *di_bh = NULL;
- loff_t end;
/*
* We start with a read level meta lock and only jump to an ex
@@ -2190,8 +2193,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
}
}
- end = pos + count;
-
ret = ocfs2_check_range_for_refcount(inode, pos, count);
if (ret == 1) {
ocfs2_inode_unlock(inode, meta_level);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index d6f7b299eb23..efeea208fdeb 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -283,7 +283,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
if (inode_alloc)
inode_lock(inode_alloc);
- if (o2info_coherent(&fi->ifi_req)) {
+ if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index c0fe6ed08ab1..3103ba7f97a2 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -144,7 +144,6 @@ static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
-void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
void ocfs2_complete_recovery(struct work_struct *work);
void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
@@ -232,8 +231,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
* ocfs2_journal_access_*() unless you intend to
* manage the checksum by hand.
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
- * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
- * the current handle commits.
+ * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes
+ * out before the current handle commits.
*/
/* You must always start_trans with a number of buffs > 0, but it's
@@ -441,7 +440,7 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
* previous dirblock update in the free list */
static inline int ocfs2_link_credits(struct super_block *sb)
{
- return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
+ return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 +
ocfs2_quota_trans_credits(sb);
}
@@ -575,37 +574,12 @@ static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
return ocfs2_extent_recs_per_gd(sb);
}
-static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
- unsigned int clusters_to_del,
- struct ocfs2_dinode *fe,
- struct ocfs2_extent_list *last_el)
+static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode,
+ loff_t start_byte, loff_t length)
{
- /* for dinode + all headers in this pass + update to next leaf */
- u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
- u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
- int credits = 1 + tree_depth + 1;
- int i;
-
- i = next_free - 1;
- BUG_ON(i < 0);
-
- /* We may be deleting metadata blocks, so metadata alloc dinode +
- one desc. block for each possible delete. */
- if (tree_depth && next_free == 1 &&
- ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
- credits += 1 + tree_depth;
-
- /* update to the truncate log. */
- credits += OCFS2_TRUNCATE_LOG_UPDATE;
-
- credits += ocfs2_quota_trans_credits(sb);
-
- return credits;
-}
-
-static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
-{
- return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
+ return jbd2_journal_inode_ranged_write(handle,
+ &OCFS2_I(inode)->ip_jinode,
+ start_byte, length);
}
static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 6f8e1c4fdb9c..8ea51cf27b97 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2486,7 +2486,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
struct inode *inode = NULL;
struct inode *orphan_dir = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
- struct ocfs2_dinode *di = NULL;
handle_t *handle = NULL;
char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
struct buffer_head *parent_di_bh = NULL;
@@ -2552,7 +2551,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
goto leave;
}
- di = (struct ocfs2_dinode *)new_di_bh->b_data;
status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
&orphan_insert, orphan_dir, false);
if (status < 0) {
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index fddbbd60f434..9150cfa4df7d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -223,8 +223,6 @@ struct ocfs2_orphan_scan {
struct ocfs2_dlm_debug {
struct kref d_refcnt;
- struct dentry *d_locking_state;
- struct dentry *d_locking_filter;
u32 d_filter_secs;
struct list_head d_lockres_tracking;
};
@@ -401,7 +399,6 @@ struct ocfs2_super
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
- struct dentry *osb_ctxt;
wait_queue_head_t recovery_event;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8b2f39506648..c81e86c62380 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1080,10 +1080,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
ocfs2_debugfs_root);
- osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
- osb->osb_debug_root,
- osb,
- &ocfs2_osb_debug_fops);
+ debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root,
+ osb, &ocfs2_osb_debug_fops);
if (ocfs2_meta_ecc(osb))
ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats,
@@ -1861,8 +1859,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
kset_unregister(osb->osb_dev_kset);
- debugfs_remove(osb->osb_ctxt);
-
/* Orphan scan should be stopped as early as possible */
ocfs2_orphan_scan_stop(osb);
@@ -1918,7 +1914,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
ocfs2_dlm_shutdown(osb, hangup_needed);
ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
- debugfs_remove(osb->osb_debug_root);
+ debugfs_remove_recursive(osb->osb_debug_root);
if (hangup_needed)
ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 90c830e3758e..d8507972ee13 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1490,18 +1490,6 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
return loc->xl_ops->xlo_check_space(loc, xi);
}
-static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
-{
- loc->xl_ops->xlo_add_entry(loc, name_hash);
- loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
- /*
- * We can't leave the new entry's xe_name_offset at zero or
- * add_namevalue() will go nuts. We set it to the size of our
- * storage so that it can never be less than any other entry.
- */
- loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
-}
-
static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
struct ocfs2_xattr_info *xi)
{
@@ -2133,29 +2121,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
if (rc)
goto out;
- if (loc->xl_entry) {
- if (ocfs2_xa_can_reuse_entry(loc, xi)) {
- orig_value_size = loc->xl_entry->xe_value_size;
- rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
- if (rc)
- goto out;
- goto alloc_value;
- }
+ if (!loc->xl_entry) {
+ rc = -EINVAL;
+ goto out;
+ }
- if (!ocfs2_xattr_is_local(loc->xl_entry)) {
- orig_clusters = ocfs2_xa_value_clusters(loc);
- rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
- if (rc) {
- mlog_errno(rc);
- ocfs2_xa_cleanup_value_truncate(loc,
- "overwriting",
- orig_clusters);
- goto out;
- }
+ if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+ orig_value_size = loc->xl_entry->xe_value_size;
+ rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+ if (rc)
+ goto out;
+ goto alloc_value;
+ }
+
+ if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc) {
+ mlog_errno(rc);
+ ocfs2_xa_cleanup_value_truncate(loc,
+ "overwriting",
+ orig_clusters);
+ goto out;
}
- ocfs2_xa_wipe_namevalue(loc);
- } else
- ocfs2_xa_add_entry(loc, name_hash);
+ }
+ ocfs2_xa_wipe_namevalue(loc);
/*
* If we get here, we have a blank entry. Fill it. We grow our
diff --git a/fs/open.c b/fs/open.c
index a59abe3c669a..c60cd22cc052 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -818,6 +818,14 @@ static int do_dentry_open(struct file *f,
if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
return -EINVAL;
}
+
+ /*
+ * XXX: Huge page cache doesn't support writing yet. Drop all page
+ * cache for this file before processing writes.
+ */
+ if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
+ truncate_pagecache(inode, 0);
+
return 0;
cleanup_all:
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 465ea0153b2a..ac9247371871 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -8,7 +8,6 @@
#include <linux/mmzone.h>
#include <linux/proc_fs.h>
#include <linux/percpu.h>
-#include <linux/quicklist.h>
#include <linux/seq_file.h>
#include <linux/swap.h>
#include <linux/vmstat.h>
@@ -106,9 +105,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_zone_page_state(NR_KERNEL_STACK_KB));
show_val_kb(m, "PageTables: ",
global_zone_page_state(NR_PAGETABLE));
-#ifdef CONFIG_QUICKLIST
- show_val_kb(m, "Quicklists: ", quicklist_total_size());
-#endif
show_val_kb(m, "NFS_Unstable: ",
global_node_page_state(NR_UNSTABLE_NFS));
@@ -136,6 +132,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
show_val_kb(m, "ShmemPmdMapped: ",
global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+ show_val_kb(m, "FileHugePages: ",
+ global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
+ show_val_kb(m, "FilePmdMapped: ",
+ global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
#endif
#ifdef CONFIG_CMA
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 544d1ee15aee..decd3fe39674 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -95,7 +95,10 @@ u64 stable_page_flags(struct page *page)
* it differentiates a memory hole from a page with no flags
*/
if (!page)
- return 1 << KPF_NOPAGE;
+ return BIT_ULL(KPF_NOPAGE);
+
+ if (pfn_zone_device_reserved(page_to_pfn(page)))
+ return BIT_ULL(KPF_RESERVED);
k = page->flags;
u = 0;
@@ -107,22 +110,22 @@ u64 stable_page_flags(struct page *page)
* simple test in page_mapped() is not enough.
*/
if (!PageSlab(page) && page_mapped(page))
- u |= 1 << KPF_MMAP;
+ u |= BIT_ULL(KPF_MMAP);
if (PageAnon(page))
- u |= 1 << KPF_ANON;
+ u |= BIT_ULL(KPF_ANON);
if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ u |= BIT_ULL(KPF_KSM);
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
+ u |= BIT_ULL(KPF_COMPOUND_HEAD);
if (PageTail(page))
- u |= 1 << KPF_COMPOUND_TAIL;
+ u |= BIT_ULL(KPF_COMPOUND_TAIL);
if (PageHuge(page))
- u |= 1 << KPF_HUGE;
+ u |= BIT_ULL(KPF_HUGE);
/*
* PageTransCompound can be true for non-huge compound pages (slab
* pages or pages allocated by drivers with __GFP_COMP) because it
@@ -133,14 +136,13 @@ u64 stable_page_flags(struct page *page)
struct page *head = compound_head(page);
if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_THP);
else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_ZERO_PAGE);
+ u |= BIT_ULL(KPF_THP);
}
} else if (is_zero_pfn(page_to_pfn(page)))
- u |= 1 << KPF_ZERO_PAGE;
-
+ u |= BIT_ULL(KPF_ZERO_PAGE);
/*
* Caveats on high order pages: page->_refcount will only be set
@@ -148,23 +150,23 @@ u64 stable_page_flags(struct page *page)
* SLOB won't set PG_slab at all on compound pages.
*/
if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
else if (page_count(page) == 0 && is_free_buddy_page(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
if (PageOffline(page))
- u |= 1 << KPF_OFFLINE;
+ u |= BIT_ULL(KPF_OFFLINE);
if (PageTable(page))
- u |= 1 << KPF_PGTABLE;
+ u |= BIT_ULL(KPF_PGTABLE);
if (page_is_idle(page))
- u |= 1 << KPF_IDLE;
+ u |= BIT_ULL(KPF_IDLE);
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
if (PageTail(page) && PageSlab(compound_head(page)))
- u |= 1 << KPF_SLAB;
+ u |= BIT_ULL(KPF_SLAB);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
@@ -177,7 +179,7 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
if (PageSwapCache(page))
- u |= 1 << KPF_SWAPCACHE;
+ u |= BIT_ULL(KPF_SWAPCACHE);
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index bf43d1d60059..9442631fd4af 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -417,6 +417,7 @@ struct mem_size_stats {
unsigned long lazyfree;
unsigned long anonymous_thp;
unsigned long shmem_thp;
+ unsigned long file_thp;
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
@@ -461,7 +462,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
static void smaps_account(struct mem_size_stats *mss, struct page *page,
bool compound, bool young, bool dirty, bool locked)
{
- int i, nr = compound ? 1 << compound_order(page) : 1;
+ int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
/*
@@ -588,7 +589,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
else if (is_zone_device_page(page))
/* pass */;
else
- VM_BUG_ON_PAGE(1, page);
+ mss->file_thp += HPAGE_PMD_SIZE;
smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
}
#else
@@ -809,6 +810,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+ SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
mss->private_hugetlb >> 10, 7);
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d82636e8eb65..35624ca2a2f9 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -147,6 +147,17 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
return error;
}
+static int ramfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
+ if (!inode)
+ return -ENOSPC;
+ d_tmpfile(dentry, inode);
+ return 0;
+}
+
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
@@ -157,6 +168,7 @@ static const struct inode_operations ramfs_dir_inode_operations = {
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
+ .tmpfile = ramfs_tmpfile,
};
/*
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 9c02d96d3a42..4075e41408b4 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -239,10 +239,8 @@ static int balance_leaf_when_delete_left(struct tree_balance *tb)
static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int item_pos = PATH_LAST_POSITION(tb->tb_path);
struct buffer_info bi;
int n;
- struct item_head *ih;
RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
"vs- 12000: level: wrong FR %z", tb->FR[0]);
@@ -251,7 +249,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
"PAP-12010: tree can not be empty");
- ih = item_head(tbS0, item_pos);
buffer_info_init_tbS0(tb, &bi);
/* Delete or truncate the item */
@@ -298,7 +295,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
/* part of new item falls into L[0] */
int new_item_len, shift;
- int version;
ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
@@ -317,8 +313,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
min_t(int, tb->zeroes_num, ih_item_len(ih)));
- version = ih_version(ih);
-
/*
* Calculate key component, item length and body to
* insert into S[0]
@@ -632,7 +626,6 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
int n = B_NR_ITEMS(tbS0);
struct buffer_info bi;
- int ret;
/* new item or part of it doesn't fall into R[0] */
if (n - tb->rnum[0] >= tb->item_pos) {
@@ -646,13 +639,11 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
loff_t old_key_comp, old_len, r_zeroes_number;
const char *r_body;
- int version, shift;
+ int shift;
loff_t offset;
leaf_shift_right(tb, tb->rnum[0] - 1, -1);
- version = ih_version(ih);
-
/* Remember key component and item length */
old_key_comp = le_ih_k_offset(ih);
old_len = ih_item_len(ih);
@@ -698,7 +689,7 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
/* whole new item falls into R[0] */
/* Shift rnum[0]-1 items to R[0] */
- ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
+ leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
/* Insert new item into R[0] */
buffer_info_init_right(tb, &bi);
@@ -950,14 +941,12 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
int old_key_comp, old_len, r_zeroes_number;
const char *r_body;
- int version;
/* Move snum[i]-1 items from S[0] to S_new[i] */
leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
tb->S_new[i]);
/* Remember key component and item length */
- version = ih_version(ih);
old_key_comp = le_ih_k_offset(ih);
old_len = ih_item_len(ih);
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6b0ddb2a9091..117092224111 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -376,7 +376,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
int to, int to_bytes, short *snum012, int flow)
{
int i;
- int cur_free;
int units;
struct virtual_node *vn = tb->tb_vn;
int total_node_size, max_node_size, current_item_size;
@@ -438,7 +437,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
/* leaf level */
needed_nodes = 1;
total_node_size = 0;
- cur_free = max_node_size;
/* start from 'from'-th item */
start_item = from;
@@ -1734,14 +1732,12 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
* and Fh is its father.
*/
struct buffer_head *Sh, *Fh;
- int maxsize, ret;
+ int ret;
int lfree, rfree /* free space in L and R */ ;
Sh = PATH_H_PBUFFER(tb->tb_path, h);
Fh = PATH_H_PPARENT(tb->tb_path, h);
- maxsize = MAX_CHILD_SIZE(Sh);
-
/*
* using tb->insert_size[h], which is negative in this case,
* create_virtual_node calculates:
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 4517a1394c6f..4b3e3e73b512 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -891,7 +891,6 @@ static int flush_older_commits(struct super_block *s,
struct list_head *entry;
unsigned int trans_id = jl->j_trans_id;
unsigned int other_trans_id;
- unsigned int first_trans_id;
find_first:
/*
@@ -914,8 +913,6 @@ find_first:
return 0;
}
- first_trans_id = first_jl->j_trans_id;
-
entry = &first_jl->j_list;
while (1) {
other_jl = JOURNAL_LIST_ENTRY(entry);
@@ -1351,7 +1348,7 @@ static int flush_journal_list(struct super_block *s,
struct reiserfs_journal_list *jl, int flushall)
{
struct reiserfs_journal_list *pjl;
- struct reiserfs_journal_cnode *cn, *last;
+ struct reiserfs_journal_cnode *cn;
int count;
int was_jwait = 0;
int was_dirty = 0;
@@ -1509,7 +1506,6 @@ static int flush_journal_list(struct super_block *s,
b_blocknr, __func__);
}
free_cnode:
- last = cn;
cn = cn->next;
if (saved_bh) {
/*
@@ -1792,7 +1788,6 @@ static int flush_used_journal_lists(struct super_block *s,
{
unsigned long len = 0;
unsigned long cur_len;
- int ret;
int i;
int limit = 256;
struct reiserfs_journal_list *tjl;
@@ -1829,9 +1824,9 @@ static int flush_used_journal_lists(struct super_block *s,
* transactions, but only bother if we've actually spanned
* across multiple lists
*/
- if (flush_jl != jl) {
- ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
- }
+ if (flush_jl != jl)
+ kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+
flush_journal_list(s, flush_jl, 1);
put_journal_list(s, flush_jl);
put_journal_list(s, jl);
@@ -1911,7 +1906,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
struct super_block *sb, int error)
{
struct reiserfs_transaction_handle myth;
- int flushed = 0;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
/*
@@ -1933,7 +1927,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1);
journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
do_journal_end(&myth, FLUSH_ALL);
- flushed = 1;
}
}
@@ -3444,9 +3437,8 @@ static int remove_from_transaction(struct super_block *sb,
if (cn == journal->j_last) {
journal->j_last = cn->prev;
}
- if (bh)
- remove_journal_hash(sb, journal->j_hash_table, NULL,
- bh->b_blocknr, 0);
+ remove_journal_hash(sb, journal->j_hash_table, NULL,
+ bh->b_blocknr, 0);
clear_buffer_journaled(bh); /* don't log this one */
if (!already_cleaned) {
@@ -3988,7 +3980,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
struct buffer_head *c_bh; /* commit bh */
struct buffer_head *d_bh; /* desc bh */
int cur_write_start = 0; /* start index of current log write */
- int old_start;
int i;
int flush;
int wait_on_commit;
@@ -4245,7 +4236,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
journal->j_num_work_lists++;
/* reset journal values for the next transaction */
- old_start = journal->j_start;
journal->j_start =
(journal->j_start + journal->j_len +
2) % SB_ONDISK_JOURNAL_SIZE(sb);
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index f5cebd70d903..7f868569d4d0 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -1322,7 +1322,7 @@ void leaf_paste_entries(struct buffer_info *bi,
char *item;
struct reiserfs_de_head *deh;
char *insert_point;
- int i, old_entry_num;
+ int i;
struct buffer_head *bh = bi->bi_bh;
if (new_entry_count == 0)
@@ -1362,7 +1362,6 @@ void leaf_paste_entries(struct buffer_info *bi,
put_deh_location(&deh[i],
deh_location(&deh[i]) + paste_size);
- old_entry_num = ih_entry_count(ih);
put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
/* prepare space for pasted records */
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 415d66ca87d1..34baf5c0f265 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -183,13 +183,12 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s)
int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
int old_max = sb_oid_maxsize(disk_sb);
struct reiserfs_super_block_v1 *disk_sb_v1;
- __le32 *objectid_map, *new_objectid_map;
+ __le32 *objectid_map;
int i;
disk_sb_v1 =
(struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
objectid_map = (__le32 *) (disk_sb_v1 + 1);
- new_objectid_map = (__le32 *) (disk_sb + 1);
if (cur_size > new_size) {
/*
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 9fed1c05f1f4..500f2000eb41 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -746,9 +746,6 @@ static void check_leaf_block_head(struct buffer_head *bh)
static void check_internal_block_head(struct buffer_head *bh)
{
- struct block_head *blkh;
-
- blkh = B_BLK_HEAD(bh);
if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 0037aea97d39..da9ebe33882b 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -593,7 +593,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,
struct buffer_head *bh;
struct path_element *last_element;
int node_level, retval;
- int right_neighbor_of_leaf_node;
int fs_gen;
struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
@@ -614,8 +613,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,
pathrelse(search_path);
- right_neighbor_of_leaf_node = 0;
-
/*
* With each iteration of this loop we search through the items in the
* current node, and calculate the next current node(next path element)
@@ -701,7 +698,6 @@ io_error:
*/
block_number = SB_ROOT_BLOCK(sb);
expected_level = -1;
- right_neighbor_of_leaf_node = 0;
/* repeat search from the root */
continue;
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index aa6c093d9ce9..a21e83f8a274 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -10,6 +10,7 @@
#define BUGFLAG_WARNING (1 << 0)
#define BUGFLAG_ONCE (1 << 1)
#define BUGFLAG_DONE (1 << 2)
+#define BUGFLAG_NO_CUT_HERE (1 << 3) /* CUT_HERE already sent */
#define BUGFLAG_TAINT(taint) ((taint) << 8)
#define BUG_GET_TAINT(bug) ((bug)->flags >> 8)
#endif
@@ -61,18 +62,6 @@ struct bug_entry {
#define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
#endif
-#ifdef __WARN_FLAGS
-#define __WARN_TAINT(taint) __WARN_FLAGS(BUGFLAG_TAINT(taint))
-#define __WARN_ONCE_TAINT(taint) __WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint))
-
-#define WARN_ON_ONCE(condition) ({ \
- int __ret_warn_on = !!(condition); \
- if (unlikely(__ret_warn_on)) \
- __WARN_ONCE_TAINT(TAINT_WARN); \
- unlikely(__ret_warn_on); \
-})
-#endif
-
/*
* WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report
* significant kernel issues that need prompt attention if they should ever
@@ -89,27 +78,27 @@ struct bug_entry {
*
* Use the versions with printk format strings to provide better diagnostics.
*/
-#ifndef __WARN_TAINT
-extern __printf(3, 4)
-void warn_slowpath_fmt(const char *file, const int line,
- const char *fmt, ...);
+#ifndef __WARN_FLAGS
extern __printf(4, 5)
-void warn_slowpath_fmt_taint(const char *file, const int line, unsigned taint,
- const char *fmt, ...);
-extern void warn_slowpath_null(const char *file, const int line);
-#define WANT_WARN_ON_SLOWPATH
-#define __WARN() warn_slowpath_null(__FILE__, __LINE__)
-#define __WARN_printf(arg...) warn_slowpath_fmt(__FILE__, __LINE__, arg)
-#define __WARN_printf_taint(taint, arg...) \
- warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg)
+void warn_slowpath_fmt(const char *file, const int line, unsigned taint,
+ const char *fmt, ...);
+#define __WARN() __WARN_printf(TAINT_WARN, NULL)
+#define __WARN_printf(taint, arg...) \
+ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg)
#else
extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
-#define __WARN() do { \
- printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \
-} while (0)
-#define __WARN_printf(arg...) __WARN_printf_taint(TAINT_WARN, arg)
-#define __WARN_printf_taint(taint, arg...) \
- do { __warn_printk(arg); __WARN_TAINT(taint); } while (0)
+#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN))
+#define __WARN_printf(taint, arg...) do { \
+ __warn_printk(arg); \
+ __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
+ } while (0)
+#define WARN_ON_ONCE(condition) ({ \
+ int __ret_warn_on = !!(condition); \
+ if (unlikely(__ret_warn_on)) \
+ __WARN_FLAGS(BUGFLAG_ONCE | \
+ BUGFLAG_TAINT(TAINT_WARN)); \
+ unlikely(__ret_warn_on); \
+})
#endif
/* used internally by panic.c */
@@ -132,7 +121,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
#define WARN(condition, format...) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
- __WARN_printf(format); \
+ __WARN_printf(TAINT_WARN, format); \
unlikely(__ret_warn_on); \
})
#endif
@@ -140,7 +129,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
#define WARN_TAINT(condition, taint, format...) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
- __WARN_printf_taint(taint, format); \
+ __WARN_printf(taint, format); \
unlikely(__ret_warn_on); \
})
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 8476175c07e7..6f8cc06ee44e 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -102,11 +102,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
__free_page(pte_page);
}
-#else /* CONFIG_MMU */
-
-/* This is enough for a nommu architecture */
-#define check_pgt_cache() do { } while (0)
-
#endif /* CONFIG_MMU */
#endif /* __ASM_GENERIC_PGALLOC_H */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 75d9d68a6de7..818691846c90 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1002,9 +1002,8 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
* need this). If THP is not enabled, the pmd can't go away under the
* code even if MADV_DONTNEED runs, but if THP is enabled we need to
* run a pmd_trans_unstable before walking the ptes after
- * split_huge_page_pmd returns (because it may have run when the pmd
- * become null, but then a page fault can map in a THP and not a
- * regular page).
+ * split_huge_pmd returns (because it may have run when the pmd become
+ * null, but then a page fault can map in a THP and not a regular page).
*/
static inline int pmd_trans_unstable(pmd_t *pmd)
{
@@ -1126,7 +1125,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
static inline void init_espfix_bsp(void) { }
#endif
-extern void __init pgd_cache_init(void);
+extern void __init pgtable_cache_init(void);
#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 9569e7c786d3..4b898cdbdf05 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -129,11 +129,8 @@ static inline bool compaction_failed(enum compact_result result)
return false;
}
-/*
- * Compaction has backed off for some reason. It might be throttling or
- * lock contention. Retrying is still worthwhile.
- */
-static inline bool compaction_withdrawn(enum compact_result result)
+/* Compaction needs reclaim to be performed first, so it can continue. */
+static inline bool compaction_needs_reclaim(enum compact_result result)
{
/*
* Compaction backed off due to watermark checks for order-0
@@ -142,6 +139,16 @@ static inline bool compaction_withdrawn(enum compact_result result)
if (result == COMPACT_SKIPPED)
return true;
+ return false;
+}
+
+/*
+ * Compaction has backed off for some reason after doing some work or none
+ * at all. It might be throttling or lock contention. Retrying might be still
+ * worthwhile, but with a higher priority if allowed.
+ */
+static inline bool compaction_withdrawn(enum compact_result result)
+{
/*
* If compaction is deferred for high-order allocations, it is
* because sync compaction recently failed. If this is the case
@@ -207,6 +214,11 @@ static inline bool compaction_failed(enum compact_result result)
return false;
}
+static inline bool compaction_needs_reclaim(enum compact_result result)
+{
+ return false;
+}
+
static inline bool compaction_withdrawn(enum compact_result result)
{
return true;
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index b5a5a1ed9efd..78a73eba64dd 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -200,8 +200,8 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
#define for_each_cpu_wrap(cpu, mask, start) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start))
-#define for_each_cpu_and(cpu, mask, and) \
- for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and)
+#define for_each_cpu_and(cpu, mask1, mask2) \
+ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2)
#else
/**
* cpumask_first - get the first cpu in a cpumask
@@ -290,20 +290,20 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool
/**
* for_each_cpu_and - iterate over every cpu in both masks
* @cpu: the (optionally unsigned) integer iterator
- * @mask: the first cpumask pointer
- * @and: the second cpumask pointer
+ * @mask1: the first cpumask pointer
+ * @mask2: the second cpumask pointer
*
* This saves a temporary CPU mask in many places. It is equivalent to:
* struct cpumask tmp;
- * cpumask_and(&tmp, &mask, &and);
+ * cpumask_and(&tmp, &mask1, &mask2);
* for_each_cpu(cpu, &tmp)
* ...
*
* After the loop, cpu is >= nr_cpu_ids.
*/
-#define for_each_cpu_and(cpu, mask, and) \
+#define for_each_cpu_and(cpu, mask1, mask2) \
for ((cpu) = -1; \
- (cpu) = cpumask_next_and((cpu), (mask), (and)), \
+ (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \
(cpu) < nr_cpu_ids;)
#endif /* SMP */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0fc44f9d5cf6..70785aca5b3c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -429,6 +429,7 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
* @i_pages: Cached pages.
* @gfp_mask: Memory allocation flags to use for allocating pages.
* @i_mmap_writable: Number of VM_SHARED mappings.
+ * @nr_thps: Number of THPs in the pagecache (non-shmem only).
* @i_mmap: Tree of private and shared mappings.
* @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
* @nrpages: Number of page entries, protected by the i_pages lock.
@@ -446,6 +447,10 @@ struct address_space {
struct xarray i_pages;
gfp_t gfp_mask;
atomic_t i_mmap_writable;
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ /* number of thp, only for non-shmem files */
+ atomic_t nr_thps;
+#endif
struct rb_root_cached i_mmap;
struct rw_semaphore i_mmap_rwsem;
unsigned long nrpages;
@@ -2795,6 +2800,33 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
return errseq_sample(&mapping->wb_err);
}
+static inline int filemap_nr_thps(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ return atomic_read(&mapping->nr_thps);
+#else
+ return 0;
+#endif
+}
+
+static inline void filemap_nr_thps_inc(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ atomic_inc(&mapping->nr_thps);
+#else
+ WARN_ON_ONCE(1);
+#endif
+}
+
+static inline void filemap_nr_thps_dec(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ atomic_dec(&mapping->nr_thps);
+#else
+ WARN_ON_ONCE(1);
+#endif
+}
+
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
int datasync);
extern int vfs_fsync(struct file *file, int datasync);
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 4bd583bd6934..5b14a0f38124 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -206,7 +206,7 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev,
int min_alloc_order, int nid, const char *name);
extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
-bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
+extern bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start,
size_t size);
#ifdef CONFIG_OF
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 45ede62aa85b..c194630cddb5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -162,7 +162,7 @@ static inline int split_huge_page(struct page *page)
{
return split_huge_page_to_list(page, NULL);
}
-void deferred_split_huge_page(struct page *page);
+void deferred_split_huge_page(struct page *page, unsigned int nr);
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct page *page);
@@ -267,6 +267,15 @@ static inline bool thp_migration_supported(void)
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+ /*
+ * Global or memcg deferred list in the second tail pages is
+ * occupied by compound_head.
+ */
+ return &page[2].deferred_list;
+}
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -315,7 +324,10 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
-static inline void deferred_split_huge_page(struct page *page) {}
+static inline void deferred_split_huge_page(struct page *page, unsigned int nr)
+{
+}
+
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index edfca4278319..53fc34f930d0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -454,7 +454,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
static inline struct hstate *page_hstate(struct page *page)
{
VM_BUG_ON_PAGE(!PageHuge(page), page);
- return size_to_hstate(PAGE_SIZE << compound_order(page));
+ return size_to_hstate(page_size(page));
}
static inline unsigned hstate_index_to_shift(unsigned index)
diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h
index 855476145fe1..aaa8a0767aa3 100644
--- a/include/linux/interval_tree_generic.h
+++ b/include/linux/interval_tree_generic.h
@@ -30,26 +30,8 @@
\
/* Callbacks for augmented rbtree insert and remove */ \
\
-static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node) \
-{ \
- ITTYPE max = ITLAST(node), subtree_last; \
- if (node->ITRB.rb_left) { \
- subtree_last = rb_entry(node->ITRB.rb_left, \
- ITSTRUCT, ITRB)->ITSUBTREE; \
- if (max < subtree_last) \
- max = subtree_last; \
- } \
- if (node->ITRB.rb_right) { \
- subtree_last = rb_entry(node->ITRB.rb_right, \
- ITSTRUCT, ITRB)->ITSUBTREE; \
- if (max < subtree_last) \
- max = subtree_last; \
- } \
- return max; \
-} \
- \
-RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \
- ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last) \
+RB_DECLARE_CALLBACKS_MAX(static, ITPREFIX ## _augment, \
+ ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, ITLAST) \
\
/* Insert / remove interval nodes from the tree */ \
\
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index df03825ad1a1..603fbc4e2f70 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1410,8 +1410,6 @@ extern int jbd2_journal_clear_err (journal_t *);
extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
extern int jbd2_journal_force_commit(journal_t *);
extern int jbd2_journal_force_commit_nested(journal_t *);
-extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
-extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
extern int jbd2_journal_inode_ranged_write(handle_t *handle,
struct jbd2_inode *inode, loff_t start_byte,
loff_t length);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 58b27c7bdc2b..998f77c3a0e1 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -183,6 +183,8 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
bool get_value);
void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name);
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len);
void * __weak arch_kexec_kernel_image_load(struct kimage *image);
int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi,
Elf_Shdr *section,
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index fbf144aaa749..b072aeb1fd78 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -326,8 +326,10 @@ extern atomic_t kgdb_active;
(raw_smp_processor_id() == atomic_read(&kgdb_active))
extern bool dbg_is_early;
extern void __init dbg_late_init(void);
+extern void kgdb_panic(const char *msg);
#else /* ! CONFIG_KGDB */
#define in_dbg_master() (0)
#define dbg_late_init()
+static inline void kgdb_panic(const char *msg) {}
#endif /* ! CONFIG_KGDB */
#endif /* _KGDB_H_ */
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 082d1d2a5216..bc45ea1efbf7 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -15,6 +15,14 @@ extern int __khugepaged_enter(struct mm_struct *mm);
extern void __khugepaged_exit(struct mm_struct *mm);
extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long vm_flags);
+#ifdef CONFIG_SHMEM
+extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
+#else
+static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
+{
+}
+#endif
#define khugepaged_enabled() \
(transparent_hugepage_flags & \
@@ -73,6 +81,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
{
return 0;
}
+static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
+{
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_KHUGEPAGED_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ad8f1a397ae4..0c762e8ca6a6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -128,9 +128,8 @@ struct mem_cgroup_per_node {
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
-#ifdef CONFIG_MEMCG_KMEM
struct memcg_shrinker_map __rcu *shrinker_map;
-#endif
+
struct rb_node tree_node; /* RB tree node */
unsigned long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
@@ -331,6 +330,10 @@ struct mem_cgroup {
struct list_head event_list;
spinlock_t event_list_lock;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ struct deferred_split deferred_split_queue;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
@@ -353,6 +356,19 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+ bool in_low_reclaim)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+
+ if (in_low_reclaim)
+ return READ_ONCE(memcg->memory.emin);
+
+ return max(READ_ONCE(memcg->memory.emin),
+ READ_ONCE(memcg->memory.elow));
+}
+
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
struct mem_cgroup *memcg);
@@ -534,6 +550,8 @@ void mem_cgroup_handle_over_high(void);
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg);
+
void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
struct task_struct *p);
@@ -826,6 +844,12 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
{
}
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+ bool in_low_reclaim)
+{
+ return 0;
+}
+
static inline enum mem_cgroup_protection mem_cgroup_protected(
struct mem_cgroup *root, struct mem_cgroup *memcg)
{
@@ -965,6 +989,11 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return 0;
}
+static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
static inline void
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
@@ -1311,6 +1340,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
} while ((memcg = parent_mem_cgroup(memcg)));
return false;
}
+
+extern int memcg_expand_shrinker_maps(int new_id);
+
+extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
+ int nid, int shrinker_id);
#else
#define mem_cgroup_sockets_enabled 0
static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
@@ -1319,6 +1353,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
return false;
}
+
+static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
+ int nid, int shrinker_id)
+{
+}
#endif
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
@@ -1390,10 +1429,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
-extern int memcg_expand_shrinker_maps(int new_id);
-
-extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
- int nid, int shrinker_id);
#else
static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
@@ -1435,8 +1470,6 @@ static inline void memcg_put_cache_ids(void)
{
}
-static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
- int nid, int shrinker_id) { }
#endif /* CONFIG_MEMCG_KMEM */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 02e633f3ede0..0ebb105eb261 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -25,7 +25,6 @@
struct memory_block {
unsigned long start_section_nr;
- unsigned long end_section_nr;
unsigned long state; /* serialized by the dev->lock */
int section_count; /* serialized by mem_sysfs_mutex */
int online_type; /* for passing data to online routine */
@@ -80,9 +79,9 @@ struct mem_section;
#define IPC_CALLBACK_PRI 10
#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE
-static inline int memory_dev_init(void)
+static inline void memory_dev_init(void)
{
- return 0;
+ return;
}
static inline int register_memory_notifier(struct notifier_block *nb)
{
@@ -113,7 +112,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb);
extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
int create_memory_block_devices(unsigned long start, unsigned long size);
void remove_memory_block_devices(unsigned long start, unsigned long size);
-extern int memory_dev_init(void);
+extern void memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
extern int memory_isolate_notify(unsigned long val, void *v);
extern struct memory_block *find_memory_block(struct mem_section *);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index bef51e35d8d2..cb6ea8582c65 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -122,6 +122,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
}
#ifdef CONFIG_ZONE_DEVICE
+bool pfn_zone_device_reserved(unsigned long pfn);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -132,6 +133,11 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
#else
+static inline bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ return false;
+}
+
static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap)
{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7cf955feb823..0023886789e1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -805,6 +805,24 @@ static inline void set_compound_order(struct page *page, unsigned int order)
page[1].compound_order = order;
}
+/* Returns the number of pages in this potentially compound page. */
+static inline unsigned long compound_nr(struct page *page)
+{
+ return 1UL << compound_order(page);
+}
+
+/* Returns the number of bytes in this potentially compound page. */
+static inline unsigned long page_size(struct page *page)
+{
+ return PAGE_SIZE << compound_order(page);
+}
+
+/* Returns the number of bits needed for the number of bytes in a page */
+static inline unsigned int page_shift(struct page *page)
+{
+ return PAGE_SHIFT + compound_order(page);
+}
+
void free_compound_page(struct page *page);
#ifdef CONFIG_MMU
@@ -1057,8 +1075,9 @@ static inline void put_user_page(struct page *page)
put_page(page);
}
-void put_user_pages_dirty(struct page **pages, unsigned long npages);
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages);
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+ bool make_dirty);
+
void put_user_pages(struct page **pages, unsigned long npages);
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
@@ -2305,6 +2324,8 @@ extern int install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long flags, struct page **pages);
+unsigned long randomize_stack_top(unsigned long stack_top);
+
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
extern unsigned long mmap_region(struct file *file, unsigned long addr,
@@ -2568,6 +2589,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
+#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
/*
* NOTE on FOLL_LONGTERM:
@@ -2845,5 +2867,12 @@ void __init setup_nr_node_ids(void);
static inline void setup_nr_node_ids(void) {}
#endif
+extern int memcmp_pages(struct page *page1, struct page *page2);
+
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+ return !memcmp_pages(page1, page2);
+}
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0b739f360cec..25395481d2ae 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -137,7 +137,9 @@ struct page {
};
struct { /* Second tail page of compound page */
unsigned long _compound_pad_1; /* compound_head */
- unsigned long _compound_pad_2;
+ /* Freeable normal pages for deferred split shrinker */
+ unsigned long nr_freeable;
+ /* For both global and memcg */
struct list_head deferred_list;
};
struct { /* Page table pages */
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index d7016dcb245e..c1bc6731125c 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -36,6 +36,10 @@ struct vmacache {
struct vm_area_struct *vmas[VMACACHE_SIZE];
};
+/*
+ * When updating this, please also update struct resident_page_types[] in
+ * kernel/fork.c
+ */
enum {
MM_FILEPAGES, /* Resident file mapping pages */
MM_ANONPAGES, /* Resident anonymous pages */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d77d717c620c..bda20282746b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -215,8 +215,9 @@ enum node_stat_item {
NR_INACTIVE_FILE, /* " " " " " */
NR_ACTIVE_FILE, /* " " " " " */
NR_UNEVICTABLE, /* " " " " " */
- NR_SLAB_RECLAIMABLE,
- NR_SLAB_UNRECLAIMABLE,
+ NR_SLAB_RECLAIMABLE, /* Please do not reorder this item */
+ NR_SLAB_UNRECLAIMABLE, /* and this one without looking at
+ * memcg_flush_percpu_vmstats() first. */
NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
WORKINGSET_NODES,
@@ -234,6 +235,8 @@ enum node_stat_item {
NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
NR_SHMEM_THPS,
NR_SHMEM_PMDMAPPED,
+ NR_FILE_THPS,
+ NR_FILE_PMDMAPPED,
NR_ANON_THPS,
NR_UNSTABLE_NFS, /* NFS unstable pages */
NR_VMSCAN_WRITE,
@@ -676,6 +679,14 @@ struct zonelist {
extern struct page *mem_map;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct deferred_split {
+ spinlock_t split_queue_lock;
+ struct list_head split_queue;
+ unsigned long split_queue_len;
+};
+#endif
+
/*
* On NUMA machines, each NUMA node would have a pg_data_t to describe
* it's memory layout. On UMA machines there is a single pglist_data which
@@ -755,9 +766,7 @@ typedef struct pglist_data {
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- spinlock_t split_queue_lock;
- struct list_head split_queue;
- unsigned long split_queue_len;
+ struct deferred_split deferred_split_queue;
#endif
/* Fields commonly accessed by the page reclaim scanner */
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 09592951725c..682fd465df06 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -18,6 +18,7 @@ struct page_ext_operations {
enum page_ext_flags {
PAGE_EXT_OWNER,
+ PAGE_EXT_OWNER_ACTIVE,
#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c7552459a15f..37a4d9e32cd3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -333,6 +333,16 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
+static inline struct page *find_subpage(struct page *page, pgoff_t offset)
+{
+ if (PageHuge(page))
+ return page;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ return page + (offset & (compound_nr(page) - 1));
+}
+
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
diff --git a/include/linux/printk.h b/include/linux/printk.h
index cefd374c47b1..c09d67edda3a 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -488,13 +488,6 @@ extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
extern void print_hex_dump(const char *level, const char *prefix_str,
int prefix_type, int rowsize, int groupsize,
const void *buf, size_t len, bool ascii);
-#if defined(CONFIG_DYNAMIC_DEBUG)
-#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \
- dynamic_hex_dump(prefix_str, prefix_type, 16, 1, buf, len, true)
-#else
-extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
- const void *buf, size_t len);
-#endif /* defined(CONFIG_DYNAMIC_DEBUG) */
#else
static inline void print_hex_dump(const char *level, const char *prefix_str,
int prefix_type, int rowsize, int groupsize,
@@ -526,4 +519,19 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
}
#endif
+/**
+ * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params
+ * @prefix_str: string to prefix each line with;
+ * caller supplies trailing spaces for alignment if desired
+ * @prefix_type: controls whether prefix of an offset, address, or none
+ * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
+ * @buf: data blob to dump
+ * @len: number of bytes in the @buf
+ *
+ * Calls print_hex_dump(), with log level of KERN_DEBUG,
+ * rowsize of 16, groupsize of 1, and ASCII output included.
+ */
+#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \
+ print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true)
+
#endif
diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h
deleted file mode 100644
index 034982c98c8b..000000000000
--- a/include/linux/quicklist.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LINUX_QUICKLIST_H
-#define LINUX_QUICKLIST_H
-/*
- * Fast allocations and disposal of pages. Pages must be in the condition
- * as needed after allocation when they are freed. Per cpu lists of pages
- * are kept that only contain node local pages.
- *
- * (C) 2007, SGI. Christoph Lameter <cl@linux.com>
- */
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/percpu.h>
-
-#ifdef CONFIG_QUICKLIST
-
-struct quicklist {
- void *page;
- int nr_pages;
-};
-
-DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
-
-/*
- * The two key functions quicklist_alloc and quicklist_free are inline so
- * that they may be custom compiled for the platform.
- * Specifying a NULL ctor can remove constructor support. Specifying
- * a constant quicklist allows the determination of the exact address
- * in the per cpu area.
- *
- * The fast patch in quicklist_alloc touched only a per cpu cacheline and
- * the first cacheline of the page itself. There is minmal overhead involved.
- */
-static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *))
-{
- struct quicklist *q;
- void **p = NULL;
-
- q =&get_cpu_var(quicklist)[nr];
- p = q->page;
- if (likely(p)) {
- q->page = p[0];
- p[0] = NULL;
- q->nr_pages--;
- }
- put_cpu_var(quicklist);
- if (likely(p))
- return p;
-
- p = (void *)__get_free_page(flags | __GFP_ZERO);
- if (ctor && p)
- ctor(p);
- return p;
-}
-
-static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p,
- struct page *page)
-{
- struct quicklist *q;
-
- q = &get_cpu_var(quicklist)[nr];
- *(void **)p = q->page;
- q->page = p;
- q->nr_pages++;
- put_cpu_var(quicklist);
-}
-
-static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp)
-{
- __quicklist_free(nr, dtor, pp, virt_to_page(pp));
-}
-
-static inline void quicklist_free_page(int nr, void (*dtor)(void *),
- struct page *page)
-{
- __quicklist_free(nr, dtor, page_address(page), page);
-}
-
-void quicklist_trim(int nr, void (*dtor)(void *),
- unsigned long min_pages, unsigned long max_free);
-
-unsigned long quicklist_total_size(void);
-
-#else
-
-static inline unsigned long quicklist_total_size(void)
-{
- return 0;
-}
-
-#endif
-
-#endif /* LINUX_QUICKLIST_H */
-
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 179faab29f52..fdd421b8d9ae 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -60,41 +60,87 @@ rb_insert_augmented_cached(struct rb_node *node,
rb_insert_augmented(node, &root->rb_root, augment);
}
-#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \
- rbtype, rbaugmented, rbcompute) \
+/*
+ * Template for declaring augmented rbtree callbacks (generic case)
+ *
+ * RBSTATIC: 'static' or empty
+ * RBNAME: name of the rb_augment_callbacks structure
+ * RBSTRUCT: struct type of the tree nodes
+ * RBFIELD: name of struct rb_node field within RBSTRUCT
+ * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
+ * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data
+ */
+
+#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
+ RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \
static inline void \
-rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \
+RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \
{ \
while (rb != stop) { \
- rbstruct *node = rb_entry(rb, rbstruct, rbfield); \
- rbtype augmented = rbcompute(node); \
- if (node->rbaugmented == augmented) \
+ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \
+ if (RBCOMPUTE(node, true)) \
break; \
- node->rbaugmented = augmented; \
- rb = rb_parent(&node->rbfield); \
+ rb = rb_parent(&node->RBFIELD); \
} \
} \
static inline void \
-rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
+RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
- rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
- rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
- new->rbaugmented = old->rbaugmented; \
+ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
+ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
+ new->RBAUGMENTED = old->RBAUGMENTED; \
} \
static void \
-rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
+RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
- rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
- rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
- new->rbaugmented = old->rbaugmented; \
- old->rbaugmented = rbcompute(old); \
+ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
+ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
+ new->RBAUGMENTED = old->RBAUGMENTED; \
+ RBCOMPUTE(old, false); \
} \
-rbstatic const struct rb_augment_callbacks rbname = { \
- .propagate = rbname ## _propagate, \
- .copy = rbname ## _copy, \
- .rotate = rbname ## _rotate \
+RBSTATIC const struct rb_augment_callbacks RBNAME = { \
+ .propagate = RBNAME ## _propagate, \
+ .copy = RBNAME ## _copy, \
+ .rotate = RBNAME ## _rotate \
};
+/*
+ * Template for declaring augmented rbtree callbacks,
+ * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
+ *
+ * RBSTATIC: 'static' or empty
+ * RBNAME: name of the rb_augment_callbacks structure
+ * RBSTRUCT: struct type of the tree nodes
+ * RBFIELD: name of struct rb_node field within RBSTRUCT
+ * RBTYPE: type of the RBAUGMENTED field
+ * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
+ * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar
+ */
+
+#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \
+ RBTYPE, RBAUGMENTED, RBCOMPUTE) \
+static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \
+{ \
+ RBSTRUCT *child; \
+ RBTYPE max = RBCOMPUTE(node); \
+ if (node->RBFIELD.rb_left) { \
+ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \
+ if (child->RBAUGMENTED > max) \
+ max = child->RBAUGMENTED; \
+ } \
+ if (node->RBFIELD.rb_right) { \
+ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \
+ if (child->RBAUGMENTED > max) \
+ max = child->RBAUGMENTED; \
+ } \
+ if (exit && node->RBAUGMENTED == max) \
+ return true; \
+ node->RBAUGMENTED = max; \
+ return false; \
+} \
+RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
+ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
+
#define RB_RED 0
#define RB_BLACK 1
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b75b28287005..50a365f9251c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -899,6 +899,7 @@ struct task_struct {
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
unsigned long last_switch_time;
+ unsigned long killed_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
@@ -1238,6 +1239,7 @@ struct task_struct {
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
#endif
+ struct list_head oom_victim_list;
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
#endif
@@ -1258,6 +1260,13 @@ struct task_struct {
unsigned long prev_lowest_stack;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d4f6215ee03f..89f55e914673 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -12,6 +12,7 @@ extern unsigned int sysctl_hung_task_panic;
extern unsigned long sysctl_hung_task_timeout_secs;
extern unsigned long sysctl_hung_task_check_interval_secs;
extern int sysctl_hung_task_warnings;
+extern int sysctl_hung_task_interval_warnings;
extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos);
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 9443cafd1969..0f80123650e2 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -69,7 +69,7 @@ struct shrinker {
/* These are for internal use */
struct list_head list;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
/* ID in shrinker_idr */
int id;
#endif
@@ -81,6 +81,11 @@ struct shrinker {
/* Flags */
#define SHRINKER_NUMA_AWARE (1 << 0)
#define SHRINKER_MEMCG_AWARE (1 << 1)
+/*
+ * It just makes sense when the shrinker is also MEMCG_AWARE for now,
+ * non-MEMCG_AWARE shrinker should not have this flag set.
+ */
+#define SHRINKER_NONSLAB (1 << 2)
extern int prealloc_shrinker(struct shrinker *shrinker);
extern void register_shrinker_prepared(struct shrinker *shrinker);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 56c9c7eed34e..4d2a2fa55ed5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -493,6 +493,10 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
* kmalloc is the normal method of allocating memory
* for objects smaller than page size in the kernel.
*
+ * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
+ * bytes. For @size of power of two bytes, the alignment is also guaranteed
+ * to be at least to the size.
+ *
* The @flags argument may be one of the GFP flags defined at
* include/linux/gfp.h and described at
* :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
@@ -595,68 +599,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
return __kmalloc_node(size, flags, node);
}
-struct memcg_cache_array {
- struct rcu_head rcu;
- struct kmem_cache *entries[0];
-};
-
-/*
- * This is the main placeholder for memcg-related information in kmem caches.
- * Both the root cache and the child caches will have it. For the root cache,
- * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system. To allow the
- * array to be accessed without taking any locks, on relocation we free the old
- * version only after a grace period.
- *
- * Root and child caches hold different metadata.
- *
- * @root_cache: Common to root and child caches. NULL for root, pointer to
- * the root cache for children.
- *
- * The following fields are specific to root caches.
- *
- * @memcg_caches: kmemcg ID indexed table of child caches. This table is
- * used to index child cachces during allocation and cleared
- * early during shutdown.
- *
- * @root_caches_node: List node for slab_root_caches list.
- *
- * @children: List of all child caches. While the child caches are also
- * reachable through @memcg_caches, a child cache remains on
- * this list until it is actually destroyed.
- *
- * The following fields are specific to child caches.
- *
- * @memcg: Pointer to the memcg this cache belongs to.
- *
- * @children_node: List node for @root_cache->children list.
- *
- * @kmem_caches_node: List node for @memcg->kmem_caches list.
- */
-struct memcg_cache_params {
- struct kmem_cache *root_cache;
- union {
- struct {
- struct memcg_cache_array __rcu *memcg_caches;
- struct list_head __root_caches_node;
- struct list_head children;
- bool dying;
- };
- struct {
- struct mem_cgroup *memcg;
- struct list_head children_node;
- struct list_head kmem_caches_node;
- struct percpu_ref refcnt;
-
- void (*work_fn)(struct kmem_cache *);
- union {
- struct rcu_head rcu_head;
- struct work_struct work;
- };
- };
- };
-};
-
int memcg_update_all_caches(int num_memcgs);
/**
diff --git a/include/linux/string.h b/include/linux/string.h
index 4deb11f7976b..3cf684db4bc6 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -35,6 +35,51 @@ ssize_t strscpy(char *, const char *, size_t);
/* Wraps calls to strscpy()/memset(), no arch specific code required */
ssize_t strscpy_pad(char *dest, const char *src, size_t count);
+/**
+ * stracpy - Copy a C-string into an array of char/u8/s8 or equivalent
+ * @dest: Where to copy the string, must be an array of char and not a pointer
+ * @src: String to copy, may be a pointer or const char array
+ *
+ * Helper for strscpy().
+ * Copies a maximum of sizeof(@dest) bytes of @src with %NUL termination.
+ *
+ * Returns:
+ * * The number of characters copied (not including the trailing %NUL)
+ * * -E2BIG if @dest is a zero size array or @src was truncated.
+ */
+#define stracpy(dest, src) \
+({ \
+ size_t count = ARRAY_SIZE(dest); \
+ BUILD_BUG_ON(!(__same_type(dest, char[]) || \
+ __same_type(dest, unsigned char[]) || \
+ __same_type(dest, signed char[]))); \
+ \
+ strscpy(dest, src, count); \
+})
+
+/**
+ * stracpy_pad - Copy a C-string into an array of char/u8/s8 with %NUL padding
+ * @dest: Where to copy the string, must be an array of char and not a pointer
+ * @src: String to copy, may be a pointer or const char array
+ *
+ * Helper for strscpy_pad().
+ * Copies a maximum of sizeof(@dest) bytes of @src with %NUL termination
+ * and zero-pads the remaining size of @dest
+ *
+ * Returns:
+ * * The number of characters copied (not including the trailing %NUL)
+ * * -E2BIG if @dest is a zero size array or @src was truncated.
+ */
+#define stracpy_pad(dest, src) \
+({ \
+ size_t count = ARRAY_SIZE(dest); \
+ BUILD_BUG_ON(!(__same_type(dest, char[]) || \
+ __same_type(dest, unsigned char[]) || \
+ __same_type(dest, signed char[]))); \
+ \
+ strscpy_pad(dest, src, count); \
+})
+
#ifndef __HAVE_ARCH_STRCAT
extern char * strcat(char *, const char *);
#endif
@@ -474,8 +519,9 @@ static inline void memcpy_and_pad(void *dest, size_t dest_len,
* But this can lead to bugs due to typos, or if prefix is a pointer
* and not a constant. Instead use str_has_prefix().
*
- * Returns: 0 if @str does not start with @prefix
- strlen(@prefix) if @str does start with @prefix
+ * Returns:
+ * * strlen(@prefix) if @str starts with @prefix
+ * * 0 if @str does not start with @prefix
*/
static __always_inline size_t str_has_prefix(const char *str, const char *prefix)
{
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2c67a33b7e..063c0c1e112b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -340,6 +340,7 @@ extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
extern void mark_page_lazyfree(struct page *page);
extern void swap_setup(void);
@@ -364,6 +365,7 @@ extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
extern unsigned long vm_total_pages;
+extern unsigned long reclaim_pages(struct list_head *page_list);
#ifdef CONFIG_NUMA
extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 9b21d0047710..a1334bd18ef1 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -51,15 +51,21 @@ struct vmap_area {
unsigned long va_start;
unsigned long va_end;
- /*
- * Largest available free size in subtree.
- */
- unsigned long subtree_max_size;
- unsigned long flags;
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
- struct llist_node purge_list; /* "lazy purge" list */
- struct vm_struct *vm;
+
+ /*
+ * The following three variables can be packed, because
+ * a vmap_area object is always one of the three states:
+ * 1) in "free" tree (root is vmap_area_root)
+ * 2) in "busy" tree (root is free_vmap_area_root)
+ * 3) in purge list (head is vmap_purge_list)
+ */
+ union {
+ unsigned long subtree_max_size; /* in "free" tree */
+ struct vm_struct *vm; /* in "busy" tree */
+ struct llist_node purge_list; /* in purge list */
+ };
};
/*
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3eb7cae8206c..3647d1008d88 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -540,7 +540,7 @@ do { \
({ \
int __ret = 0; \
might_sleep(); \
- if (!(condition)) \
+ if (!(condition) && (timeout)) \
__ret = __wait_event_hrtimeout(wq_head, condition, timeout, \
TASK_UNINTERRUPTIBLE); \
__ret; \
@@ -566,7 +566,7 @@ do { \
({ \
long __ret = 0; \
might_sleep(); \
- if (!(condition)) \
+ if (!(condition) && (timeout)) \
__ret = __wait_event_hrtimeout(wq, condition, timeout, \
TASK_INTERRUPTIBLE); \
__ret; \
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 7238865e75b0..51bf43076165 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -46,6 +46,8 @@ const char *zpool_get_type(struct zpool *pool);
void zpool_destroy_pool(struct zpool *pool);
+bool zpool_malloc_support_movable(struct zpool *pool);
+
int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
unsigned long *handle);
@@ -90,6 +92,7 @@ struct zpool_driver {
struct zpool *zpool);
void (*destroy)(void *pool);
+ bool malloc_support_movable;
int (*malloc)(void *pool, size_t size, gfp_t gfp,
unsigned long *handle);
void (*free)(void *pool, unsigned long handle);
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index a5ab2973e8dc..c37e2280e6dd 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -127,18 +127,43 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_b
);
#ifdef CONFIG_MEMCG
-DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
+DECLARE_EVENT_CLASS(mm_vmscan_memcg_reclaim_begin_template,
- TP_PROTO(int order, gfp_t gfp_flags),
+ TP_PROTO(unsigned int cgroup_ino, int order, gfp_t gfp_flags),
- TP_ARGS(order, gfp_flags)
+ TP_ARGS(cgroup_ino, order, gfp_flags),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, cgroup_ino)
+ __field(int, order)
+ __field(gfp_t, gfp_flags)
+ ),
+
+ TP_fast_assign(
+ __entry->cgroup_ino = cgroup_ino;
+ __entry->order = order;
+ __entry->gfp_flags = gfp_flags;
+ ),
+
+ TP_printk("cgroup_ino=%u order=%d gfp_flags=%s",
+ __entry->cgroup_ino, __entry->order,
+ show_gfp_flags(__entry->gfp_flags))
);
-DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template,
+ mm_vmscan_memcg_reclaim_begin,
- TP_PROTO(int order, gfp_t gfp_flags),
+ TP_PROTO(unsigned int cgroup_ino, int order, gfp_t gfp_flags),
- TP_ARGS(order, gfp_flags)
+ TP_ARGS(cgroup_ino, order, gfp_flags)
+);
+
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template,
+ mm_vmscan_memcg_softlimit_reclaim_begin,
+
+ TP_PROTO(unsigned int cgroup_ino, int order, gfp_t gfp_flags),
+
+ TP_ARGS(cgroup_ino, order, gfp_flags)
);
#endif /* CONFIG_MEMCG */
@@ -167,18 +192,40 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end
);
#ifdef CONFIG_MEMCG
-DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end,
+DECLARE_EVENT_CLASS(mm_vmscan_memcg_reclaim_end_template,
- TP_PROTO(unsigned long nr_reclaimed),
+ TP_PROTO(unsigned int cgroup_ino, unsigned long nr_reclaimed),
- TP_ARGS(nr_reclaimed)
+ TP_ARGS(cgroup_ino, nr_reclaimed),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, cgroup_ino)
+ __field(unsigned long, nr_reclaimed)
+ ),
+
+ TP_fast_assign(
+ __entry->cgroup_ino = cgroup_ino;
+ __entry->nr_reclaimed = nr_reclaimed;
+ ),
+
+ TP_printk("cgroup_ino=%u nr_reclaimed=%lu",
+ __entry->cgroup_ino, __entry->nr_reclaimed)
);
-DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end,
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template,
+ mm_vmscan_memcg_reclaim_end,
- TP_PROTO(unsigned long nr_reclaimed),
+ TP_PROTO(unsigned int cgroup_ino, unsigned long nr_reclaimed),
- TP_ARGS(nr_reclaimed)
+ TP_ARGS(cgroup_ino, nr_reclaimed)
+);
+
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template,
+ mm_vmscan_memcg_softlimit_reclaim_end,
+
+ TP_PROTO(unsigned int cgroup_ino, unsigned long nr_reclaimed),
+
+ TP_ARGS(cgroup_ino, nr_reclaimed)
);
#endif /* CONFIG_MEMCG */
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index aa7f3aeac740..79095434c1be 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -66,8 +66,9 @@ DECLARE_EVENT_CLASS(writeback_page_template,
),
TP_fast_assign(
- strncpy(__entry->name,
- mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32);
+ strscpy_pad(__entry->name,
+ mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)",
+ 32);
__entry->ino = mapping ? mapping->host->i_ino : 0;
__entry->index = page->index;
),
@@ -110,8 +111,8 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
struct backing_dev_info *bdi = inode_to_bdi(inode);
/* may be called for files on pseudo FSes w/ unregistered bdi */
- strncpy(__entry->name,
- bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+ strscpy_pad(__entry->name,
+ bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
__entry->ino = inode->i_ino;
__entry->state = inode->i_state;
__entry->flags = flags;
@@ -190,8 +191,8 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
),
TP_fast_assign(
- strncpy(__entry->name,
- dev_name(inode_to_bdi(inode)->dev), 32);
+ strscpy_pad(__entry->name,
+ dev_name(inode_to_bdi(inode)->dev), 32);
__entry->ino = inode->i_ino;
__entry->sync_mode = wbc->sync_mode;
__entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc);
@@ -234,8 +235,9 @@ DECLARE_EVENT_CLASS(writeback_work_class,
__field(unsigned int, cgroup_ino)
),
TP_fast_assign(
- strncpy(__entry->name,
- wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
+ strscpy_pad(__entry->name,
+ wb->bdi->dev ? dev_name(wb->bdi->dev) :
+ "(unknown)", 32);
__entry->nr_pages = work->nr_pages;
__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
__entry->sync_mode = work->sync_mode;
@@ -288,7 +290,7 @@ DECLARE_EVENT_CLASS(writeback_class,
__field(unsigned int, cgroup_ino)
),
TP_fast_assign(
- strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+ strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32);
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
),
TP_printk("bdi %s: cgroup_ino=%u",
@@ -310,7 +312,7 @@ TRACE_EVENT(writeback_bdi_register,
__array(char, name, 32)
),
TP_fast_assign(
- strncpy(__entry->name, dev_name(bdi->dev), 32);
+ strscpy_pad(__entry->name, dev_name(bdi->dev), 32);
),
TP_printk("bdi %s",
__entry->name
@@ -335,7 +337,7 @@ DECLARE_EVENT_CLASS(wbc_class,
),
TP_fast_assign(
- strncpy(__entry->name, dev_name(bdi->dev), 32);
+ strscpy_pad(__entry->name, dev_name(bdi->dev), 32);
__entry->nr_to_write = wbc->nr_to_write;
__entry->pages_skipped = wbc->pages_skipped;
__entry->sync_mode = wbc->sync_mode;
@@ -386,7 +388,7 @@ TRACE_EVENT(writeback_queue_io,
),
TP_fast_assign(
unsigned long *older_than_this = work->older_than_this;
- strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+ strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32);
__entry->older = older_than_this ? *older_than_this : 0;
__entry->age = older_than_this ?
(jiffies - *older_than_this) * 1000 / HZ : -1;
@@ -472,7 +474,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
),
TP_fast_assign(
- strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+ strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32);
__entry->write_bw = KBps(wb->write_bandwidth);
__entry->avg_write_bw = KBps(wb->avg_write_bandwidth);
__entry->dirty_rate = KBps(dirty_rate);
@@ -537,7 +539,7 @@ TRACE_EVENT(balance_dirty_pages,
TP_fast_assign(
unsigned long freerun = (thresh + bg_thresh) / 2;
- strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+ strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32);
__entry->limit = global_wb_domain.dirty_limit;
__entry->setpoint = (global_wb_domain.dirty_limit +
@@ -597,8 +599,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
),
TP_fast_assign(
- strncpy(__entry->name,
- dev_name(inode_to_bdi(inode)->dev), 32);
+ strscpy_pad(__entry->name,
+ dev_name(inode_to_bdi(inode)->dev), 32);
__entry->ino = inode->i_ino;
__entry->state = inode->i_state;
__entry->dirtied_when = inode->dirtied_when;
@@ -671,8 +673,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
),
TP_fast_assign(
- strncpy(__entry->name,
- dev_name(inode_to_bdi(inode)->dev), 32);
+ strscpy_pad(__entry->name,
+ dev_name(inode_to_bdi(inode)->dev), 32);
__entry->ino = inode->i_ino;
__entry->state = inode->i_state;
__entry->dirtied_when = inode->dirtied_when;
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 63b1f506ea67..c160a5354eb6 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -67,6 +67,9 @@
#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */
+#define MADV_COLD 20 /* deactivate these pages */
+#define MADV_PAGEOUT 21 /* reclaim these pages */
+
/* compatibility flags */
#define MAP_FILE 0
diff --git a/include/uapi/linux/coff.h b/include/uapi/linux/coff.h
index e4a79f80b9a0..ab5c7e847eed 100644
--- a/include/uapi/linux/coff.h
+++ b/include/uapi/linux/coff.h
@@ -11,6 +11,9 @@
more information about COFF, then O'Reilly has a very excellent book.
*/
+#ifndef _UAPI_LINUX_COFF_H
+#define _UAPI_LINUX_COFF_H
+
#define E_SYMNMLEN 8 /* Number of characters in a symbol name */
#define E_FILNMLEN 14 /* Number of characters in a file name */
#define E_DIMNUM 4 /* Number of array dimensions in auxiliary entry */
@@ -350,3 +353,5 @@ struct COFF_reloc {
/* For new sections we haven't heard of before */
#define COFF_DEF_SECTION_ALIGNMENT 4
+
+#endif /* _UAPI_LINUX_COFF_H */
diff --git a/init/main.c b/init/main.c
index 166768420dbd..91f6ebb30ef0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -507,7 +507,7 @@ void __init __weak mem_encrypt_init(void) { }
void __init __weak poking_init(void) { }
-void __init __weak pgd_cache_init(void) { }
+void __init __weak pgtable_cache_init(void) { }
bool initcall_debug;
core_param(initcall_debug, initcall_debug, bool, 0644);
@@ -556,6 +556,7 @@ static void __init mm_init(void)
report_meminit();
mem_init();
kmem_cache_init();
+ kmemleak_init();
pgtable_init();
debug_objects_mem_init();
vmalloc_init();
@@ -564,7 +565,6 @@ static void __init mm_init(void)
init_espfix_bsp();
/* Should be run after espfix64 is set up. */
pti_init();
- pgd_cache_init();
}
void __init __weak arch_call_rest_init(void)
@@ -595,7 +595,6 @@ asmlinkage __visible void __init start_kernel(void)
pr_notice("%s", linux_banner);
early_security_init();
setup_arch(&command_line);
- mm_init_cpumask(&init_mm);
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
@@ -741,7 +740,6 @@ asmlinkage __visible void __init start_kernel(void)
initrd_start = 0;
}
#endif
- kmemleak_init();
setup_per_cpu_pageset();
numa_policy_init();
acpi_early_init();
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 7a5a8edc3de3..ad6c475eb370 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1241,15 +1241,14 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
/* create the notify skb */
nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
- if (!nc) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!nc)
+ return -ENOMEM;
+
if (copy_from_user(nc->data,
notification->sigev_value.sival_ptr,
NOTIFY_COOKIE_LEN)) {
ret = -EFAULT;
- goto out;
+ goto free_skb;
}
/* TODO: add a header? */
@@ -1265,8 +1264,7 @@ retry:
fdput(f);
if (IS_ERR(sock)) {
ret = PTR_ERR(sock);
- sock = NULL;
- goto out;
+ goto free_skb;
}
timeo = MAX_SCHEDULE_TIMEOUT;
@@ -1275,11 +1273,8 @@ retry:
sock = NULL;
goto retry;
}
- if (ret) {
- sock = NULL;
- nc = NULL;
- goto out;
- }
+ if (ret)
+ return ret;
}
}
@@ -1334,7 +1329,8 @@ out_fput:
out:
if (sock)
netlink_detachskb(sock, nc);
- else if (nc)
+ else
+free_skb:
dev_kfree_skb(nc);
return ret;
diff --git a/ipc/msg.c b/ipc/msg.c
index 8dec945fa030..3fe7400b9c43 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -377,7 +377,7 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
* NOTE: no locks must be held, the rwsem is taken inside this function.
*/
static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
- struct msqid64_ds *msqid64)
+ struct ipc64_perm *perm, int msg_qbytes)
{
struct kern_ipc_perm *ipcp;
struct msg_queue *msq;
@@ -387,7 +387,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
rcu_read_lock();
ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd,
- &msqid64->msg_perm, msqid64->msg_qbytes);
+ perm, msg_qbytes);
if (IS_ERR(ipcp)) {
err = PTR_ERR(ipcp);
goto out_unlock1;
@@ -409,18 +409,18 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
{
DEFINE_WAKE_Q(wake_q);
- if (msqid64->msg_qbytes > ns->msg_ctlmnb &&
+ if (msg_qbytes > ns->msg_ctlmnb &&
!capable(CAP_SYS_RESOURCE)) {
err = -EPERM;
goto out_unlock1;
}
ipc_lock_object(&msq->q_perm);
- err = ipc_update_perm(&msqid64->msg_perm, ipcp);
+ err = ipc_update_perm(perm, ipcp);
if (err)
goto out_unlock0;
- msq->q_qbytes = msqid64->msg_qbytes;
+ msq->q_qbytes = msg_qbytes;
msq->q_ctime = ktime_get_real_seconds();
/*
@@ -601,9 +601,9 @@ static long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf, int ver
case IPC_SET:
if (copy_msqid_from_user(&msqid64, buf, version))
return -EFAULT;
- /* fallthru */
+ return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes);
case IPC_RMID:
- return msgctl_down(ns, msqid, cmd, &msqid64);
+ return msgctl_down(ns, msqid, cmd, NULL, 0);
default:
return -EINVAL;
}
@@ -735,9 +735,9 @@ static long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr, int versio
case IPC_SET:
if (copy_compat_msqid_from_user(&msqid64, uptr, version))
return -EFAULT;
- /* fallthru */
+ return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes);
case IPC_RMID:
- return msgctl_down(ns, msqid, cmd, &msqid64);
+ return msgctl_down(ns, msqid, cmd, NULL, 0);
default:
return -EINVAL;
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 5cc608de6883..b26bf06cff9e 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -896,30 +896,25 @@ static struct sysrq_key_op sysrq_dbg_op = {
};
#endif
-static int kgdb_panic_event(struct notifier_block *self,
- unsigned long val,
- void *data)
+void kgdb_panic(const char *msg)
{
+ if (!kgdb_io_module_registered)
+ return;
+
/*
- * Avoid entering the debugger if we were triggered due to a panic
- * We don't want to get stuck waiting for input from user in such case.
- * panic_timeout indicates the system should automatically
+ * We don't want to get stuck waiting for input from user if
+ * "panic_timeout" indicates the system should automatically
* reboot on panic.
*/
if (panic_timeout)
- return NOTIFY_DONE;
+ return;
if (dbg_kdb_mode)
- kdb_printf("PANIC: %s\n", (char *)data);
+ kdb_printf("PANIC: %s\n", msg);
+
kgdb_breakpoint();
- return NOTIFY_DONE;
}
-static struct notifier_block kgdb_panic_event_nb = {
- .notifier_call = kgdb_panic_event,
- .priority = INT_MAX,
-};
-
void __weak kgdb_arch_late(void)
{
}
@@ -968,8 +963,6 @@ static void kgdb_register_callbacks(void)
kgdb_arch_late();
register_module_notifier(&dbg_module_load_nb);
register_reboot_notifier(&dbg_reboot_notifier);
- atomic_notifier_chain_register(&panic_notifier_list,
- &kgdb_panic_event_nb);
#ifdef CONFIG_MAGIC_SYSRQ
register_sysrq_key('g', &sysrq_dbg_op);
#endif
@@ -983,16 +976,14 @@ static void kgdb_register_callbacks(void)
static void kgdb_unregister_callbacks(void)
{
/*
- * When this routine is called KGDB should unregister from the
- * panic handler and clean up, making sure it is not handling any
+ * When this routine is called KGDB should unregister from
+ * handlers and clean up, making sure it is not handling any
* break exceptions at the time.
*/
if (kgdb_io_module_registered) {
kgdb_io_module_registered = 0;
unregister_reboot_notifier(&dbg_reboot_notifier);
unregister_module_notifier(&dbg_module_load_nb);
- atomic_notifier_chain_unregister(&panic_notifier_list,
- &kgdb_panic_event_nb);
kgdb_arch_exit();
#ifdef CONFIG_MAGIC_SYSRQ
unregister_sysrq_key('g', &sysrq_dbg_op);
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 838123f79639..9d11820c9ca7 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -172,7 +172,7 @@ bool dma_in_atomic_pool(void *start, size_t size)
if (unlikely(!atomic_pool))
return false;
- return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+ return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
}
void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index fc482c8e0bd8..57fb4dcff434 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -3,6 +3,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/binfmts.h>
+#include <linux/elfcore.h>
Elf_Half __weak elf_core_extra_phdrs(void)
{
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 84fa00497c49..94d38a39d72e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
+#include <linux/khugepaged.h>
#include <linux/uprobes.h>
@@ -143,17 +144,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
*
* @vma: vma that holds the pte pointing to page
* @addr: address the old @page is mapped at
- * @page: the cowed page we are replacing by kpage
- * @kpage: the modified page we replace page by
+ * @old_page: the page we are replacing by new_page
+ * @new_page: the modified page we replace page by
*
- * Returns 0 on success, -EFAULT on failure.
+ * If @new_page is NULL, only unmap @old_page.
+ *
+ * Returns 0 on success, negative error code otherwise.
*/
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
- .page = old_page,
+ .page = compound_head(old_page),
.vma = vma,
.address = addr,
};
@@ -164,12 +167,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
- VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
-
- err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
- false);
- if (err)
- return err;
+ if (new_page) {
+ err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false);
+ if (err)
+ return err;
+ }
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(old_page);
@@ -177,15 +180,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
if (!page_vma_mapped_walk(&pvmw)) {
- mem_cgroup_cancel_charge(new_page, memcg, false);
+ if (new_page)
+ mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
}
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
- get_page(new_page);
- page_add_new_anon_rmap(new_page, vma, addr, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ if (new_page) {
+ get_page(new_page);
+ page_add_new_anon_rmap(new_page, vma, addr, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
+ } else
+ /* no new page, just dec_mm_counter for old_page */
+ dec_mm_counter(mm, MM_ANONPAGES);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, mm_counter_file(old_page));
@@ -194,8 +202,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
ptep_clear_flush_notify(vma, addr, pvmw.pte);
- set_pte_at_notify(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
+ if (new_page)
+ set_pte_at_notify(mm, addr, pvmw.pte,
+ mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, false);
if (!page_mapped(old_page))
@@ -464,6 +473,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
struct page *old_page, *new_page;
struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
+ bool orig_page_huge = false;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
@@ -471,7 +481,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
retry:
/* Read the page with vaddr into memory */
ret = get_user_pages_remote(NULL, mm, vaddr, 1,
- FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
+ FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL);
if (ret <= 0)
return ret;
@@ -488,6 +498,10 @@ retry:
ref_ctr_updated = 1;
}
+ ret = 0;
+ if (!is_register && !PageAnon(old_page))
+ goto put_old;
+
ret = anon_vma_prepare(vma);
if (ret)
goto put_old;
@@ -501,8 +515,33 @@ retry:
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (!is_register) {
+ struct page *orig_page;
+ pgoff_t index;
+
+ VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
+
+ index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
+ orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
+ index);
+
+ if (orig_page) {
+ if (PageUptodate(orig_page) &&
+ pages_identical(new_page, orig_page)) {
+ /* let go new_page */
+ put_page(new_page);
+ new_page = NULL;
+
+ if (PageCompound(orig_page))
+ orig_page_huge = true;
+ }
+ put_page(orig_page);
+ }
+ }
+
ret = __replace_page(vma, vaddr, old_page, new_page);
- put_page(new_page);
+ if (new_page)
+ put_page(new_page);
put_old:
put_page(old_page);
@@ -513,6 +552,10 @@ put_old:
if (ret && is_register && ref_ctr_updated)
update_ref_ctr(uprobe, mm, -1);
+ /* try collapse pmd for compound page */
+ if (!ret && orig_page_huge)
+ collapse_pte_mapped_thp(mm, vaddr);
+
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index b0f733470bed..f601168f6b21 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -130,6 +130,15 @@ int nr_threads; /* The idle threads do not count.. */
static int max_threads; /* tunable limit on nr_threads */
+#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
+
+static const char * const resident_page_types[] = {
+ NAMED_ARRAY_INDEX(MM_FILEPAGES),
+ NAMED_ARRAY_INDEX(MM_ANONPAGES),
+ NAMED_ARRAY_INDEX(MM_SWAPENTS),
+ NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
+};
+
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
@@ -650,12 +659,15 @@ static void check_mm(struct mm_struct *mm)
{
int i;
+ BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
+ "Please make sure 'struct resident_page_types[]' is updated as well");
+
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = atomic_long_read(&mm->rss_stat.count[i]);
if (unlikely(x))
- printk(KERN_ALERT "BUG: Bad rss-counter state "
- "mm:%p idx:%d val:%ld\n", mm, i, x);
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
+ mm, resident_page_types[i], x);
}
if (mm_pgtables_bytes(mm))
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 14a625c16cb3..f1443abb31f1 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -49,6 +49,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
int __read_mostly sysctl_hung_task_warnings = 10;
+int __read_mostly sysctl_hung_task_interval_warnings;
static int __read_mostly did_panic;
static bool hung_task_show_lock;
@@ -85,6 +86,34 @@ static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
+static void hung_task_warning(struct task_struct *t, bool timeout)
+{
+ const char *loglevel = timeout ? KERN_ERR : KERN_INFO;
+ const char *path;
+ int *warnings;
+
+ if (timeout) {
+ warnings = &sysctl_hung_task_warnings;
+ path = "hung_task_timeout_secs";
+ } else {
+ warnings = &sysctl_hung_task_interval_warnings;
+ path = "hung_task_interval_secs";
+ }
+
+ if (*warnings > 0)
+ --*warnings;
+
+ printk("%sINFO: task %s:%d blocked for more than %ld seconds.\n",
+ loglevel, t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
+ printk("%s %s %s %.*s\n",
+ loglevel, print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ printk("%s\"echo 0 > /proc/sys/kernel/%s\" disables this message.\n",
+ loglevel, path);
+ sched_show_task(t);
+}
+
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
@@ -109,6 +138,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
t->last_switch_time = jiffies;
return;
}
+ if (sysctl_hung_task_interval_warnings)
+ hung_task_warning(t, false);
+
if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
return;
@@ -120,28 +152,57 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
hung_task_call_panic = true;
}
- /*
- * Ok, the task did not get scheduled for more than 2 minutes,
- * complain:
- */
if (sysctl_hung_task_warnings) {
- if (sysctl_hung_task_warnings > 0)
- sysctl_hung_task_warnings--;
- pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
- t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
- pr_err(" %s %s %.*s\n",
- print_tainted(), init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
- sched_show_task(t);
+ /* Don't print warings twice */
+ if (!sysctl_hung_task_interval_warnings)
+ hung_task_warning(t, true);
hung_task_show_lock = true;
}
touch_nmi_watchdog();
}
+static void check_killed_task(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long stamp = t->killed_time;
+
+ /*
+ * Ensure the task is not frozen.
+ * Also, skip vfork and any other user process that freezer should skip.
+ */
+ if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+ return;
+ /*
+ * Skip threads which are already inside do_exit(), for exit_mm() etc.
+ * might take many seconds.
+ */
+ if (t->flags & PF_EXITING)
+ return;
+ if (!stamp) {
+ stamp = jiffies;
+ if (!stamp)
+ stamp++;
+ t->killed_time = stamp;
+ return;
+ }
+ if (time_is_after_jiffies(stamp + timeout * HZ))
+ return;
+ trace_sched_process_hang(t);
+ if (sysctl_hung_task_panic) {
+ console_verbose();
+ hung_task_call_panic = true;
+ }
+ /*
+ * This thread failed to terminate for more than
+ * sysctl_hung_task_timeout_secs seconds, complain:
+ */
+ pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
+ t->comm, t->pid, (jiffies - stamp) / HZ);
+ sched_show_task(t);
+ hung_task_show_lock = true;
+ touch_nmi_watchdog();
+}
+
/*
* To avoid extending the RCU grace period for an unbounded amount of time,
* periodically exit the critical section and enter a new one.
@@ -193,6 +254,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
+ /* Check threads which are about to terminate. */
+ if (unlikely(fatal_signal_pending(t)))
+ check_killed_task(t, timeout);
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d5870723b8ad..15d70a90b50d 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -300,6 +300,8 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *pages;
+ if (fatal_signal_pending(current))
+ return NULL;
pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
if (pages) {
unsigned int count, i;
diff --git a/kernel/panic.c b/kernel/panic.c
index 057540b6eee9..47e8ebccc22b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -12,6 +12,7 @@
#include <linux/debug_locks.h>
#include <linux/sched/debug.h>
#include <linux/interrupt.h>
+#include <linux/kgdb.h>
#include <linux/kmsg_dump.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
@@ -220,6 +221,13 @@ void panic(const char *fmt, ...)
#endif
/*
+ * If kgdb is enabled, give it a chance to run before we stop all
+ * the other CPUs or else we won't be able to debug processes left
+ * running on them.
+ */
+ kgdb_panic(buf);
+
+ /*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
* If we want to run this after calling panic_notifiers, pass
@@ -551,9 +559,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
{
disable_trace_on_warning();
- if (args)
- pr_warn(CUT_HERE);
-
if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
raw_smp_processor_id(), current->pid, file, line,
@@ -591,37 +596,26 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
add_taint(taint, LOCKDEP_STILL_OK);
}
-#ifdef WANT_WARN_ON_SLOWPATH
-void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
+#ifndef __WARN_FLAGS
+void warn_slowpath_fmt(const char *file, int line, unsigned taint,
+ const char *fmt, ...)
{
struct warn_args args;
- args.fmt = fmt;
- va_start(args.args, fmt);
- __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL,
- &args);
- va_end(args.args);
-}
-EXPORT_SYMBOL(warn_slowpath_fmt);
+ pr_warn(CUT_HERE);
-void warn_slowpath_fmt_taint(const char *file, int line,
- unsigned taint, const char *fmt, ...)
-{
- struct warn_args args;
+ if (!fmt) {
+ __warn(file, line, __builtin_return_address(0), taint,
+ NULL, NULL);
+ return;
+ }
args.fmt = fmt;
va_start(args.args, fmt);
__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
}
-EXPORT_SYMBOL(warn_slowpath_fmt_taint);
-
-void warn_slowpath_null(const char *file, int line)
-{
- pr_warn(CUT_HERE);
- __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
-}
-EXPORT_SYMBOL(warn_slowpath_null);
+EXPORT_SYMBOL(warn_slowpath_fmt);
#else
void __warn_printk(const char *fmt, ...)
{
diff --git a/kernel/resource.c b/kernel/resource.c
index 74877e9d90ca..76036a41143b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
while (start < end &&
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
false, &res)) {
- pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
- end_pfn = (res.end + 1) >> PAGE_SHIFT;
+ pfn = PFN_UP(res.start);
+ end_pfn = PFN_DOWN(res.end + 1);
if (end_pfn > pfn)
ret = (*func)(pfn, end_pfn - pfn, arg);
if (ret)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8bfeb6395bdd..e2a8308f9442 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -238,7 +238,6 @@ static void do_idle(void)
tick_nohz_idle_enter();
while (!need_resched()) {
- check_pgt_cache();
rmb();
local_irq_disable();
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 6e52b67b420e..fb4b7da56205 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -832,6 +832,7 @@ void psi_memstall_enter(unsigned long *flags)
rq_unlock_irq(rq, &rf);
}
+EXPORT_SYMBOL_GPL(psi_memstall_enter);
/**
* psi_memstall_leave - mark the end of an memory stall section
@@ -861,6 +862,7 @@ void psi_memstall_leave(unsigned long *flags)
rq_unlock_irq(rq, &rf);
}
+EXPORT_SYMBOL_GPL(psi_memstall_leave);
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 078950d9605b..d19941a21d0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -264,7 +264,8 @@ extern struct ctl_table epoll_table[];
extern struct ctl_table firmware_config_table[];
#endif
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
int sysctl_legacy_va_layout;
#endif
@@ -1147,6 +1148,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &neg_one,
},
+ {
+ .procname = "hung_task_interval_warnings",
+ .data = &sysctl_hung_task_interval_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &neg_one,
+ },
#endif
#ifdef CONFIG_RT_MUTEXES
{
@@ -1573,7 +1582,8 @@ static struct ctl_table vm_table[] = {
.proc_handler = proc_dointvec,
.extra1 = SYSCTL_ZERO,
},
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
{
.procname = "legacy_va_layout",
.data = &sysctl_legacy_va_layout,
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a425741907b0..10023dbac8e4 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -592,17 +592,18 @@ config DEBUG_KMEMLEAK
In order to access the kmemleak file, debugfs needs to be
mounted (usually at /sys/kernel/debug).
-config DEBUG_KMEMLEAK_EARLY_LOG_SIZE
- int "Maximum kmemleak early log entries"
+config DEBUG_KMEMLEAK_MEM_POOL_SIZE
+ int "Kmemleak memory pool size"
depends on DEBUG_KMEMLEAK
- range 200 40000
- default 400
+ range 200 1000000
+ default 16000
help
Kmemleak must track all the memory allocations to avoid
reporting false positives. Since memory may be allocated or
- freed before kmemleak is initialised, an early log buffer is
- used to store these actions. If kmemleak reports "early log
- buffer exceeded", please increase this value.
+ freed before kmemleak is fully initialised, use a static pool
+ of metadata objects to track such callbacks. After kmemleak is
+ fully initialised, this memory pool acts as an emergency one
+ if slab allocations fail.
config DEBUG_KMEMLEAK_TEST
tristate "Simple test for the kernel memory leak detector"
@@ -2153,6 +2154,12 @@ config IO_STRICT_DEVMEM
If in doubt, say Y.
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu # Kernel hacking
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 7fa97a8b5717..6c9682ce0254 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -134,6 +134,14 @@ config KASAN_S390_4_LEVEL_PAGING
to 3TB of RAM with KASan enabled). This options allows to force
4-level paging instead.
+config KASAN_SW_TAGS_IDENTIFY
+ bool "Enable memory corruption identification"
+ depends on KASAN_SW_TAGS
+ help
+ This option enables best-effort identification of bug type
+ (use-after-free or out-of-bounds) at the cost of increased
+ memory consumption.
+
config TEST_KASAN
tristate "Module for testing KASAN for bug detection"
depends on m && KASAN
diff --git a/lib/bug.c b/lib/bug.c
index 1077366f496b..8c98af0bf585 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -181,6 +181,15 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
}
}
+ /*
+ * BUG() and WARN_ON() families don't print a custom debug message
+ * before triggering the exception handler, so we must add the
+ * "cut here" line now. WARN() issues its own "cut here" before the
+ * extra debugging message it writes before triggering the handler.
+ */
+ if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0)
+ printk(KERN_DEFAULT CUT_HERE);
+
if (warning) {
/* this is a WARN_ON rather than BUG/BUG_ON */
__warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs,
@@ -188,8 +197,6 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
return BUG_TRAP_TYPE_WARN;
}
- printk(KERN_DEFAULT CUT_HERE);
-
if (file)
pr_crit("kernel BUG at %s:%u!\n", file, line);
else
diff --git a/lib/extable.c b/lib/extable.c
index 25da4071122a..c3e59caf7ffa 100644
--- a/lib/extable.c
+++ b/lib/extable.c
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <linux/sort.h>
#include <linux/uaccess.h>
+#include <linux/extable.h>
#ifndef ARCH_HAS_RELATIVE_EXTABLE
#define ex_to_insn(x) ((x)->insn)
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 9fc31292cfa1..e43d6107fd62 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -540,7 +540,7 @@ void gen_pool_for_each_chunk(struct gen_pool *pool,
EXPORT_SYMBOL(gen_pool_for_each_chunk);
/**
- * addr_in_gen_pool - checks if an address falls within the range of a pool
+ * gen_pool_has_addr - checks if an address falls within the range of a pool
* @pool: the generic memory pool
* @start: start address
* @size: size of the region
@@ -548,7 +548,7 @@ EXPORT_SYMBOL(gen_pool_for_each_chunk);
* Check if the range of addresses falls within the specified pool. Returns
* true if the entire range is contained in the pool and false otherwise.
*/
-bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
+bool gen_pool_has_addr(struct gen_pool *pool, unsigned long start,
size_t size)
{
bool found = false;
@@ -567,6 +567,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
rcu_read_unlock();
return found;
}
+EXPORT_SYMBOL(gen_pool_has_addr);
/**
* gen_pool_avail - get available free space of the pool
diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
index a7bafc413730..ae25e2fa2187 100644
--- a/lib/generic-radix-tree.c
+++ b/lib/generic-radix-tree.c
@@ -36,12 +36,12 @@ static inline size_t genradix_depth_size(unsigned depth)
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
-unsigned genradix_root_to_depth(struct genradix_root *r)
+static inline unsigned genradix_root_to_depth(struct genradix_root *r)
{
return (unsigned long) r & GENRADIX_DEPTH_MASK;
}
-struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}
diff --git a/lib/hexdump.c b/lib/hexdump.c
index b1d55b669ae2..147133f8eb2f 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -270,25 +270,4 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type,
}
EXPORT_SYMBOL(print_hex_dump);
-#if !defined(CONFIG_DYNAMIC_DEBUG)
-/**
- * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params
- * @prefix_str: string to prefix each line with;
- * caller supplies trailing spaces for alignment if desired
- * @prefix_type: controls whether prefix of an offset, address, or none
- * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
- * @buf: data blob to dump
- * @len: number of bytes in the @buf
- *
- * Calls print_hex_dump(), with log level of KERN_DEBUG,
- * rowsize of 16, groupsize of 1, and ASCII output included.
- */
-void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
- const void *buf, size_t len)
-{
- print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, 16, 1,
- buf, len, true);
-}
-EXPORT_SYMBOL(print_hex_dump_bytes);
-#endif /* !defined(CONFIG_DYNAMIC_DEBUG) */
#endif /* defined(CONFIG_PRINTK) */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f1e0569b4539..639d5e7014c1 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -878,7 +878,7 @@ static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
head = compound_head(page);
v += (page - head) << PAGE_SHIFT;
- if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head))))
+ if (likely(n <= v && v <= (page_size(head))))
return true;
WARN_ON(1);
return false;
diff --git a/lib/math/rational.c b/lib/math/rational.c
index ba7443677c90..31fb27db2deb 100644
--- a/lib/math/rational.c
+++ b/lib/math/rational.c
@@ -3,6 +3,7 @@
* rational fractions
*
* Copyright (C) 2009 emlix GmbH, Oskar Schirmer <oskar@scara.com>
+ * Copyright (C) 2019 Trent Piepho <tpiepho@gmail.com>
*
* helper functions when coping with rational numbers
*/
@@ -10,6 +11,7 @@
#include <linux/rational.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/kernel.h>
/*
* calculate best rational approximation for a given fraction
@@ -33,30 +35,65 @@ void rational_best_approximation(
unsigned long max_numerator, unsigned long max_denominator,
unsigned long *best_numerator, unsigned long *best_denominator)
{
- unsigned long n, d, n0, d0, n1, d1;
+ /* n/d is the starting rational, which is continually
+ * decreased each iteration using the Euclidean algorithm.
+ *
+ * dp is the value of d from the prior iteration.
+ *
+ * n2/d2, n1/d1, and n0/d0 are our successively more accurate
+ * approximations of the rational. They are, respectively,
+ * the current, previous, and two prior iterations of it.
+ *
+ * a is current term of the continued fraction.
+ */
+ unsigned long n, d, n0, d0, n1, d1, n2, d2;
n = given_numerator;
d = given_denominator;
n0 = d1 = 0;
n1 = d0 = 1;
+
for (;;) {
- unsigned long t, a;
- if ((n1 > max_numerator) || (d1 > max_denominator)) {
- n1 = n0;
- d1 = d0;
- break;
- }
+ unsigned long dp, a;
+
if (d == 0)
break;
- t = d;
+ /* Find next term in continued fraction, 'a', via
+ * Euclidean algorithm.
+ */
+ dp = d;
a = n / d;
d = n % d;
- n = t;
- t = n0 + a * n1;
+ n = dp;
+
+ /* Calculate the current rational approximation (aka
+ * convergent), n2/d2, using the term just found and
+ * the two prior approximations.
+ */
+ n2 = n0 + a * n1;
+ d2 = d0 + a * d1;
+
+ /* If the current convergent exceeds the maxes, then
+ * return either the previous convergent or the
+ * largest semi-convergent, the final term of which is
+ * found below as 't'.
+ */
+ if ((n2 > max_numerator) || (d2 > max_denominator)) {
+ unsigned long t = min((max_numerator - n0) / n1,
+ (max_denominator - d0) / d1);
+
+ /* This tests if the semi-convergent is closer
+ * than the previous convergent.
+ */
+ if (2u * t > a || (2u * t == a && d0 * dp > d1 * d)) {
+ n1 = n0 + t * n1;
+ d1 = d0 + t * d1;
+ }
+ break;
+ }
n0 = n1;
- n1 = t;
- t = d0 + a * d1;
+ n1 = n2;
d0 = d1;
- d1 = t;
+ d1 = d2;
}
*best_numerator = n1;
*best_denominator = d1;
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
index 62b8ee92643d..41ae3c7570d3 100644
--- a/lib/rbtree_test.c
+++ b/lib/rbtree_test.c
@@ -77,26 +77,10 @@ static inline void erase_cached(struct test_node *node, struct rb_root_cached *r
}
-static inline u32 augment_recompute(struct test_node *node)
-{
- u32 max = node->val, child_augmented;
- if (node->rb.rb_left) {
- child_augmented = rb_entry(node->rb.rb_left, struct test_node,
- rb)->augmented;
- if (max < child_augmented)
- max = child_augmented;
- }
- if (node->rb.rb_right) {
- child_augmented = rb_entry(node->rb.rb_right, struct test_node,
- rb)->augmented;
- if (max < child_augmented)
- max = child_augmented;
- }
- return max;
-}
+#define NODE_VAL(node) ((node)->val)
-RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb,
- u32, augmented, augment_recompute)
+RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks,
+ struct test_node, rb, u32, augmented, NODE_VAL)
static void insert_augmented(struct test_node *node,
struct rb_root_cached *root)
@@ -238,7 +222,20 @@ static void check_augmented(int nr_nodes)
check(nr_nodes);
for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) {
struct test_node *node = rb_entry(rb, struct test_node, rb);
- WARN_ON_ONCE(node->augmented != augment_recompute(node));
+ u32 subtree, max = node->val;
+ if (node->rb.rb_left) {
+ subtree = rb_entry(node->rb.rb_left, struct test_node,
+ rb)->augmented;
+ if (max < subtree)
+ max = subtree;
+ }
+ if (node->rb.rb_right) {
+ subtree = rb_entry(node->rb.rb_right, struct test_node,
+ rb)->augmented;
+ if (max < subtree)
+ max = subtree;
+ }
+ WARN_ON_ONCE(node->augmented != max);
}
}
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 5c86ef4c899f..1c26c14ffbb9 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -6,7 +6,6 @@
*/
#include <linux/mm.h>
-#include <linux/quicklist.h>
#include <linux/cma.h>
void show_mem(unsigned int filter, nodemask_t *nodemask)
@@ -39,10 +38,6 @@ void show_mem(unsigned int filter, nodemask_t *nodemask)
#ifdef CONFIG_CMA
printk("%lu pages cma reserved\n", totalcma_pages);
#endif
-#ifdef CONFIG_QUICKLIST
- printk("%lu pages in pagetable cache\n",
- quicklist_total_size());
-#endif
#ifdef CONFIG_MEMORY_FAILURE
printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
diff --git a/lib/string.c b/lib/string.c
index 461fb620f85f..cd7a10c19210 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -173,8 +173,9 @@ EXPORT_SYMBOL(strlcpy);
* doesn't unnecessarily force the tail of the destination buffer to be
* zeroed. If zeroing is desired please use strscpy_pad().
*
- * Return: The number of characters copied (not including the trailing
- * %NUL) or -E2BIG if the destination buffer wasn't big enough.
+ * Returns:
+ * * The number of characters copied (not including the trailing %NUL)
+ * * -E2BIG if count is 0 or @src was truncated.
*/
ssize_t strscpy(char *dest, const char *src, size_t count)
{
@@ -182,7 +183,7 @@ ssize_t strscpy(char *dest, const char *src, size_t count)
size_t max = count;
long res = 0;
- if (count == 0)
+ if (count == 0 || WARN_ON_ONCE(count > INT_MAX))
return -E2BIG;
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -253,8 +254,9 @@ EXPORT_SYMBOL(strscpy);
* For full explanation of why you may want to consider using the
* 'strscpy' functions please see the function docstring for strscpy().
*
- * Return: The number of characters copied (not including the trailing
- * %NUL) or -E2BIG if the destination buffer wasn't big enough.
+ * Returns:
+ * * The number of characters copied (not including the trailing %NUL)
+ * * -E2BIG if count is 0 or @src was truncated.
*/
ssize_t strscpy_pad(char *dest, const char *src, size_t count)
{
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index b63b367a94e8..49cc4d570a40 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -18,6 +18,9 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
+#include <linux/io.h>
+
+#include <asm/page.h>
/*
* Note: test functions are marked noinline so that their names appear in
@@ -337,6 +340,42 @@ static noinline void __init kmalloc_uaf2(void)
kfree(ptr2);
}
+static noinline void __init kfree_via_page(void)
+{
+ char *ptr;
+ size_t size = 8;
+ struct page *page;
+ unsigned long offset;
+
+ pr_info("invalid-free false positive (via page)\n");
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ page = virt_to_page(ptr);
+ offset = offset_in_page(ptr);
+ kfree(page_address(page) + offset);
+}
+
+static noinline void __init kfree_via_phys(void)
+{
+ char *ptr;
+ size_t size = 8;
+ phys_addr_t phys;
+
+ pr_info("invalid-free false positive (via phys)\n");
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ phys = virt_to_phys(ptr);
+ kfree(phys_to_virt(phys));
+}
+
static noinline void __init kmem_cache_oob(void)
{
char *p;
@@ -737,6 +776,8 @@ static int __init kmalloc_tests_init(void)
kmalloc_uaf();
kmalloc_uaf_memset();
kmalloc_uaf2();
+ kfree_via_page();
+ kfree_via_phys();
kmem_cache_oob();
memcg_accounted_kmem_cache();
kasan_stack_oob();
diff --git a/mm/Kconfig b/mm/Kconfig
index 2fe4902ad755..a5dae9a7eb51 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -273,11 +273,6 @@ config BOUNCE
by default when ZONE_DMA or HIGHMEM is selected, but you
may say n to override this.
-config NR_QUICK
- int
- depends on QUICKLIST
- default "1"
-
config VIRT_TO_BUS
bool
help
@@ -717,6 +712,17 @@ config GUP_BENCHMARK
config GUP_GET_PTE_LOW_HIGH
bool
+config READ_ONLY_THP_FOR_FS
+ bool "Read-only THP for filesystems (EXPERIMENTAL)"
+ depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
+
+ help
+ Allow khugepaged to put read-only file-backed pages in THP.
+
+ This is marked experimental because it is a new feature. Write
+ support of file THPs will be developed in the next few release
+ cycles.
+
config ARCH_HAS_PTE_SPECIAL
bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 82b6a20898bd..327b3ebf23bf 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -21,7 +21,9 @@ config DEBUG_PAGEALLOC
Also, the state of page tracking structures is checked more often as
pages are being allocated and freed, as unexpected state changes
often happen for same reasons as memory corruption (e.g. double free,
- use-after-free).
+ use-after-free). The error reports for these checks can be augmented
+ with stack traces of last allocation and freeing of the page, when
+ PAGE_OWNER is also selected and enabled on boot.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
diff --git a/mm/Makefile b/mm/Makefile
index d0b295c3b764..d996846697ef 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
+CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
+CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
@@ -72,7 +75,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
-obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 952dc2fb24e5..ce08b39d85d4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -969,7 +969,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* is safe to read and it's 0 for tail pages.
*/
if (unlikely(PageCompound(page))) {
- low_pfn += (1UL << compound_order(page)) - 1;
+ low_pfn += compound_nr(page) - 1;
goto isolate_fail;
}
}
@@ -1737,8 +1737,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
* starting at the block pointed to by the migrate scanner pfn within
* compact_control.
*/
-static isolate_migrate_t isolate_migratepages(struct zone *zone,
- struct compact_control *cc)
+static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
{
unsigned long block_start_pfn;
unsigned long block_end_pfn;
@@ -1756,8 +1755,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
*/
low_pfn = fast_find_migrateblock(cc);
block_start_pfn = pageblock_start_pfn(low_pfn);
- if (block_start_pfn < zone->zone_start_pfn)
- block_start_pfn = zone->zone_start_pfn;
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
/*
* fast_find_migrateblock marks a pageblock skipped so to avoid
@@ -1787,8 +1786,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
cond_resched();
- page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
- zone);
+ page = pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone);
if (!page)
continue;
@@ -2078,6 +2077,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
+ /*
+ * These counters track activities during zone compaction. Initialize
+ * them before compacting a new zone.
+ */
+ cc->total_migrate_scanned = 0;
+ cc->total_free_scanned = 0;
+ cc->nr_migratepages = 0;
+ cc->nr_freepages = 0;
+ INIT_LIST_HEAD(&cc->freepages);
+ INIT_LIST_HEAD(&cc->migratepages);
+
cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
@@ -2158,7 +2168,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
cc->rescan = true;
}
- switch (isolate_migratepages(cc->zone, cc)) {
+ switch (isolate_migratepages(cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
@@ -2281,10 +2291,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
{
enum compact_result ret;
struct compact_control cc = {
- .nr_freepages = 0,
- .nr_migratepages = 0,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.order = order,
.search_order = order,
.gfp_mask = gfp_mask,
@@ -2305,8 +2311,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
if (capture)
current->capture_control = &capc;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
ret = compact_zone(&cc, &capc);
@@ -2408,8 +2412,6 @@ static void compact_node(int nid)
struct zone *zone;
struct compact_control cc = {
.order = -1,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
@@ -2423,11 +2425,7 @@ static void compact_node(int nid)
if (!populated_zone(zone))
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
compact_zone(&cc, NULL);
@@ -2529,8 +2527,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
.search_order = pgdat->kcompactd_max_order,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
@@ -2554,16 +2550,10 @@ static void kcompactd_do_work(pg_data_t *pgdat)
COMPACT_CONTINUE)
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
- cc.total_migrate_scanned = 0;
- cc.total_free_scanned = 0;
- cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
-
if (kthread_should_stop())
return;
+
+ cc.zone = zone;
status = compact_zone(&cc, NULL);
if (status == COMPACT_SUCCESS) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 40667c2f3383..1146fcfa3215 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -126,7 +126,7 @@ static void page_cache_delete(struct address_space *mapping,
/* hugetlb pages are represented by a single entry in the xarray */
if (!PageHuge(page)) {
xas_set_order(&xas, page->index, compound_order(page));
- nr = 1U << compound_order(page);
+ nr = compound_nr(page);
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -203,8 +203,9 @@ static void unaccount_page_cache_page(struct address_space *mapping,
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
- } else {
- VM_BUG_ON_PAGE(PageTransHuge(page), page);
+ } else if (PageTransHuge(page)) {
+ __dec_node_page_state(page, NR_FILE_THPS);
+ filemap_nr_thps_dec(mapping);
}
/*
@@ -281,11 +282,11 @@ EXPORT_SYMBOL(delete_from_page_cache);
* @pvec: pagevec with pages to delete
*
* The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index.
+ * from the mapping. The function expects @pvec to be sorted by page index
+ * and is optimised for it to be dense.
* It tolerates holes in @pvec (mapping entries at those indices are not
* modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the
- * mapping as well.
+ * @pvec.
*
* The function expects the i_pages lock to be held.
*/
@@ -294,40 +295,43 @@ static void page_cache_delete_batch(struct address_space *mapping,
{
XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
- int i = 0, tail_pages = 0;
+ int i = 0;
struct page *page;
mapping_set_update(&xas, mapping);
xas_for_each(&xas, page, ULONG_MAX) {
- if (i >= pagevec_count(pvec) && !tail_pages)
+ if (i >= pagevec_count(pvec))
break;
+
+ /* A swap/dax/shadow entry got inserted? Skip it. */
if (xa_is_value(page))
continue;
- if (!tail_pages) {
- /*
- * Some page got inserted in our range? Skip it. We
- * have our pages locked so they are protected from
- * being removed.
- */
- if (page != pvec->pages[i]) {
- VM_BUG_ON_PAGE(page->index >
- pvec->pages[i]->index, page);
- continue;
- }
- WARN_ON_ONCE(!PageLocked(page));
- if (PageTransHuge(page) && !PageHuge(page))
- tail_pages = HPAGE_PMD_NR - 1;
+ /*
+ * A page got inserted in our range? Skip it. We have our
+ * pages locked so they are protected from being removed.
+ * If we see a page whose index is higher than ours, it
+ * means our page has been removed, which shouldn't be
+ * possible because we're holding the PageLock.
+ */
+ if (page != pvec->pages[i]) {
+ VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
+ page);
+ continue;
+ }
+
+ WARN_ON_ONCE(!PageLocked(page));
+
+ if (page->index == xas.xa_index)
page->mapping = NULL;
- /*
- * Leave page->index set: truncation lookup relies
- * upon it
- */
+ /* Leave page->index set: truncation lookup relies on it */
+
+ /*
+ * Move to the next page in the vector if this is a regular
+ * page or the index is of the last sub-page of this compound
+ * page.
+ */
+ if (page->index + compound_nr(page) - 1 == xas.xa_index)
i++;
- } else {
- VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
- != pvec->pages[i]->index, page);
- tail_pages--;
- }
xas_store(&xas, NULL);
total_pages++;
}
@@ -408,7 +412,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping))
+ if (!mapping_cap_writeback_dirty(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -617,10 +622,13 @@ int filemap_fdatawait_keep_errors(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
+/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
- return (!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional);
+ if (dax_mapping(mapping))
+ return mapping->nrexceptional;
+
+ return mapping->nrpages;
}
int filemap_write_and_wait(struct address_space *mapping)
@@ -1516,7 +1524,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
XA_STATE(xas, &mapping->i_pages, offset);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
repeat:
@@ -1531,25 +1539,19 @@ repeat:
if (!page || xa_is_value(page))
goto out;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto repeat;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
/*
- * Has the page moved?
+ * Has the page moved or been split?
* This is part of the lockless pagecache protocol. See
* include/linux/pagemap.h for details.
*/
if (unlikely(page != xas_reload(&xas))) {
- put_page(head);
+ put_page(page);
goto repeat;
}
+ page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1646,7 +1648,7 @@ repeat:
}
/* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(compound_head(page)->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
@@ -1731,7 +1733,6 @@ unsigned find_get_entries(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, page, ULONG_MAX) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1742,17 +1743,13 @@ unsigned find_get_entries(struct address_space *mapping,
if (xa_is_value(page))
goto export;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
+ page = find_subpage(page, xas.xa_index);
export:
indices[ret] = xas.xa_index;
@@ -1761,7 +1758,7 @@ export:
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1803,33 +1800,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
rcu_read_lock();
xas_for_each(&xas, page, end) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/* Skip over shadow, swap and DAX entries */
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*start = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1874,7 +1865,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
rcu_read_lock();
for (page = xas_load(&xas); page; page = xas_next(&xas)) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1884,24 +1874,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
if (xa_is_value(page))
break;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages)
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1937,7 +1922,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
rcu_read_lock();
xas_for_each_marked(&xas, page, end, tag) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1948,26 +1932,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*index = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -2562,12 +2541,12 @@ retry_find:
goto out_retry;
/* Did it get truncated? */
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(compound_head(page)->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto retry_find;
}
- VM_BUG_ON_PAGE(page->index != offset, page);
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
/*
* We have a locked page in the page cache, now we need to check
@@ -2648,7 +2627,7 @@ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
xas_for_each(&xas, page, end_pgoff) {
@@ -2657,24 +2636,19 @@ void filemap_map_pages(struct vm_fault *vmf,
if (xa_is_value(page))
goto next;
- head = compound_head(page);
-
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(head))
+ if (PageLocked(page))
goto next;
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto next;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto skip;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto skip;
+ page = find_subpage(page, xas.xa_index);
if (!PageUptodate(page) ||
PageReadahead(page) ||
diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab37bac..60c3915c8ee6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,85 +29,70 @@ struct follow_page_context {
unsigned int page_mask;
};
-typedef int (*set_dirty_func_t)(struct page *page);
-
-static void __put_user_pages_dirty(struct page **pages,
- unsigned long npages,
- set_dirty_func_t sdf)
-{
- unsigned long index;
-
- for (index = 0; index < npages; index++) {
- struct page *page = compound_head(pages[index]);
-
- /*
- * Checking PageDirty at this point may race with
- * clear_page_dirty_for_io(), but that's OK. Two key cases:
- *
- * 1) This code sees the page as already dirty, so it skips
- * the call to sdf(). That could happen because
- * clear_page_dirty_for_io() called page_mkclean(),
- * followed by set_page_dirty(). However, now the page is
- * going to get written back, which meets the original
- * intention of setting it dirty, so all is well:
- * clear_page_dirty_for_io() goes on to call
- * TestClearPageDirty(), and write the page back.
- *
- * 2) This code sees the page as clean, so it calls sdf().
- * The page stays dirty, despite being written back, so it
- * gets written back again in the next writeback cycle.
- * This is harmless.
- */
- if (!PageDirty(page))
- sdf(page);
-
- put_user_page(page);
- }
-}
-
/**
- * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
- * @pages: array of pages to be marked dirty and released.
+ * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
+ * @pages: array of pages to be maybe marked dirty, and definitely released.
* @npages: number of pages in the @pages array.
+ * @make_dirty: whether to mark the pages dirty
*
* "gup-pinned page" refers to a page that has had one of the get_user_pages()
* variants called on that page.
*
* For each page in the @pages array, make that page (or its head page, if a
- * compound page) dirty, if it was previously listed as clean. Then, release
- * the page using put_user_page().
+ * compound page) dirty, if @make_dirty is true, and if the page was previously
+ * listed as clean. In any case, releases all pages using put_user_page(),
+ * possibly via put_user_pages(), for the non-dirty case.
*
* Please see the put_user_page() documentation for details.
*
- * set_page_dirty(), which does not lock the page, is used here.
- * Therefore, it is the caller's responsibility to ensure that this is
- * safe. If not, then put_user_pages_dirty_lock() should be called instead.
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty_lock(), put_user_page().
*
*/
-void put_user_pages_dirty(struct page **pages, unsigned long npages)
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+ bool make_dirty)
{
- __put_user_pages_dirty(pages, npages, set_page_dirty);
-}
-EXPORT_SYMBOL(put_user_pages_dirty);
+ unsigned long index;
-/**
- * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
- * @pages: array of pages to be marked dirty and released.
- * @npages: number of pages in the @pages array.
- *
- * For each page in the @pages array, make that page (or its head page, if a
- * compound page) dirty, if it was previously listed as clean. Then, release
- * the page using put_user_page().
- *
- * Please see the put_user_page() documentation for details.
- *
- * This is just like put_user_pages_dirty(), except that it invokes
- * set_page_dirty_lock(), instead of set_page_dirty().
- *
- */
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
-{
- __put_user_pages_dirty(pages, npages, set_page_dirty_lock);
+ /*
+ * TODO: this can be optimized for huge pages: if a series of pages is
+ * physically contiguous and part of the same compound page, then a
+ * single operation to the head page should suffice.
+ */
+
+ if (!make_dirty) {
+ put_user_pages(pages, npages);
+ return;
+ }
+
+ for (index = 0; index < npages; index++) {
+ struct page *page = compound_head(pages[index]);
+ /*
+ * Checking PageDirty at this point may race with
+ * clear_page_dirty_for_io(), but that's OK. Two key
+ * cases:
+ *
+ * 1) This code sees the page as already dirty, so it
+ * skips the call to set_page_dirty(). That could happen
+ * because clear_page_dirty_for_io() called
+ * page_mkclean(), followed by set_page_dirty().
+ * However, now the page is going to get written back,
+ * which meets the original intention of setting it
+ * dirty, so all is well: clear_page_dirty_for_io() goes
+ * on to call TestClearPageDirty(), and write the page
+ * back.
+ *
+ * 2) This code sees the page as clean, so it calls
+ * set_page_dirty(). The page stays dirty, despite being
+ * written back, so it gets written back again in the
+ * next writeback cycle. This is harmless.
+ */
+ if (!PageDirty(page))
+ set_page_dirty_lock(page);
+ put_user_page(page);
+ }
}
EXPORT_SYMBOL(put_user_pages_dirty_lock);
@@ -399,7 +384,7 @@ retry_locked:
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & FOLL_SPLIT) {
+ if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
int ret;
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
@@ -408,7 +393,7 @@ retry_locked:
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
- } else {
+ } else if (flags & FOLL_SPLIT) {
if (unlikely(!try_get_page(page))) {
spin_unlock(ptl);
return ERR_PTR(-ENOMEM);
@@ -420,6 +405,10 @@ retry_locked:
put_page(page);
if (pmd_none(*pmd))
return no_page_table(vma, flags);
+ } else { /* flags & FOLL_SPLIT_PMD */
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, address);
+ ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
}
return ret ? ERR_PTR(ret) :
@@ -1460,7 +1449,7 @@ check_again:
* gup may start from a tail page. Advance step by the left
* part.
*/
- step = (1 << compound_order(head)) - (pages[i] - head);
+ step = compound_nr(head) - (pages[i] - head);
/*
* If we get a page from the CMA zone, since we are going to
* be pinning these entries, we might as well move them out
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de1f15969e27..a7a9f90e9951 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -496,11 +496,25 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
return pmd;
}
-static inline struct list_head *page_deferred_list(struct page *page)
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
- /* ->lru in the tail pages is occupied by compound_head. */
- return &page[2].deferred_list;
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ if (memcg)
+ return &memcg->deferred_split_queue;
+ else
+ return &pgdat->deferred_split_queue;
+}
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ return &pgdat->deferred_split_queue;
}
+#endif
void prep_transhuge_page(struct page *page)
{
@@ -511,6 +525,7 @@ void prep_transhuge_page(struct page *page)
INIT_LIST_HEAD(page_deferred_list(page));
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+ page[2].nr_freeable = 0;
}
static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
@@ -2497,6 +2512,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct page *head = compound_head(page);
pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
+ struct address_space *swap_cache = NULL;
+ unsigned long offset = 0;
int i;
lruvec = mem_cgroup_page_lruvec(head, pgdat);
@@ -2504,6 +2521,14 @@ static void __split_huge_page(struct page *page, struct list_head *list,
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
+ if (PageAnon(head) && PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ offset = swp_offset(entry);
+ swap_cache = swap_address_space(entry);
+ xa_lock(&swap_cache->i_pages);
+ }
+
for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
@@ -2513,6 +2538,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
put_page(head + i);
+ } else if (!PageAnon(page)) {
+ __xa_store(&head->mapping->i_pages, head[i].index,
+ head + i, 0);
+ } else if (swap_cache) {
+ __xa_store(&swap_cache->i_pages, offset + i,
+ head + i, 0);
}
}
@@ -2523,10 +2554,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
/* Additional pin to swap cache */
- if (PageSwapCache(head))
+ if (PageSwapCache(head)) {
page_ref_add(head, 2);
- else
+ xa_unlock(&swap_cache->i_pages);
+ } else {
page_ref_inc(head);
+ }
} else {
/* Additional pin to page cache */
page_ref_add(head, 2);
@@ -2673,6 +2706,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
@@ -2759,17 +2793,21 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
/* Prevent deferred_split_scan() touching ->_refcount */
- spin_lock(&pgdata->split_queue_lock);
+ spin_lock(&ds_queue->split_queue_lock);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
+ __mod_node_page_state(page_pgdat(page),
+ NR_KERNEL_MISC_RECLAIMABLE,
+ -head[2].nr_freeable);
+ head[2].nr_freeable = 0;
}
if (mapping)
__dec_node_page_state(page, NR_SHMEM_THPS);
- spin_unlock(&pgdata->split_queue_lock);
+ spin_unlock(&ds_queue->split_queue_lock);
__split_huge_page(page, list, end, flags);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2786,7 +2824,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
dump_page(page, "total_mapcount(head) > 0");
BUG();
}
- spin_unlock(&pgdata->split_queue_lock);
+ spin_unlock(&ds_queue->split_queue_lock);
fail: if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
@@ -2808,53 +2846,92 @@ out:
void free_transhuge_page(struct page *page)
{
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
unsigned long flags;
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(page_deferred_list(page))) {
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
list_del(page_deferred_list(page));
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ __mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
+ -page[2].nr_freeable);
+ page[2].nr_freeable = 0;
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
free_compound_page(page);
}
-void deferred_split_huge_page(struct page *page)
+void deferred_split_huge_page(struct page *page, unsigned int nr)
{
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
unsigned long flags;
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ /*
+ * The try_to_unmap() in page reclaim path might reach here too,
+ * this may cause a race condition to corrupt deferred split queue.
+ * And, if page reclaim is already handling the same page, it is
+ * unnecessary to handle it again in shrinker.
+ *
+ * Check PageSwapCache to determine if the page is being
+ * handled by page reclaim since THP swap would add the page into
+ * swap cache before calling try_to_unmap().
+ */
+ if (PageSwapCache(page))
+ return;
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ page[2].nr_freeable += nr;
+ __mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
+ nr);
if (list_empty(page_deferred_list(page))) {
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
- list_add_tail(page_deferred_list(page), &pgdata->split_queue);
- pgdata->split_queue_len++;
+ list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+ ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ memcg_set_shrinker_bit(memcg, page_to_nid(page),
+ deferred_split_shrinker.id);
+#endif
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
- return READ_ONCE(pgdata->split_queue_len);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+ return READ_ONCE(ds_queue->split_queue_len);
}
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags;
LIST_HEAD(list), *pos, *next;
struct page *page;
int split = 0;
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &pgdata->split_queue) {
+ list_for_each_safe(pos, next, &ds_queue->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
if (get_page_unless_zero(page)) {
@@ -2862,12 +2939,12 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
} else {
/* We lost race with put_compound_page() */
list_del_init(page_deferred_list(page));
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
}
if (!--sc->nr_to_scan)
break;
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
@@ -2881,15 +2958,15 @@ next:
put_page(page);
}
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
- list_splice_tail(&list, &pgdata->split_queue);
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ list_splice_tail(&list, &ds_queue->split_queue);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
* This can happen if pages were freed under us.
*/
- if (!split && list_empty(&pgdata->split_queue))
+ if (!split && list_empty(&ds_queue->split_queue))
return SHRINK_STOP;
return split;
}
@@ -2898,7 +2975,8 @@ static struct shrinker deferred_split_shrinker = {
.count_objects = deferred_split_count,
.scan_objects = deferred_split_scan,
.seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+ SHRINKER_NONSLAB,
};
#ifdef CONFIG_DEBUG_FS
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6d7296dd11b8..ef37c85423a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1405,12 +1405,25 @@ pgoff_t __basepage_index(struct page *page)
}
static struct page *alloc_buddy_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
int order = huge_page_order(h);
struct page *page;
+ bool alloc_try_hard = true;
- gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ /*
+ * By default we always try hard to allocate the page with
+ * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
+ * a loop (to adjust global huge page counts) and previous allocation
+ * failed, do not continue to try hard on the same node. Use the
+ * node_alloc_noretry bitmap to manage this state information.
+ */
+ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
+ alloc_try_hard = false;
+ gfp_mask |= __GFP_COMP|__GFP_NOWARN;
+ if (alloc_try_hard)
+ gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
@@ -1419,6 +1432,22 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
else
__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ /*
+ * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
+ * indicates an overall state change. Clear bit so that we resume
+ * normal 'try hard' allocations.
+ */
+ if (node_alloc_noretry && page && !alloc_try_hard)
+ node_clear(nid, *node_alloc_noretry);
+
+ /*
+ * If we tried hard to get a page but failed, set bit so that
+ * subsequent attempts will not try as hard until there is an
+ * overall state change.
+ */
+ if (node_alloc_noretry && !page && alloc_try_hard)
+ node_set(nid, *node_alloc_noretry);
+
return page;
}
@@ -1427,7 +1456,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
* should use this function to get new hugetlb pages
*/
static struct page *alloc_fresh_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
struct page *page;
@@ -1435,7 +1465,7 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
page = alloc_buddy_huge_page(h, gfp_mask,
- nid, nmask);
+ nid, nmask, node_alloc_noretry);
if (!page)
return NULL;
@@ -1450,14 +1480,16 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
* Allocates a fresh page to the hugetlb allocator pool in the node interleaved
* manner.
*/
-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+ nodemask_t *node_alloc_noretry)
{
struct page *page;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+ page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
+ node_alloc_noretry);
if (page)
break;
}
@@ -1601,7 +1633,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
goto out_unlock;
spin_unlock(&hugetlb_lock);
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -1637,7 +1669,7 @@ struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
if (hstate_is_gigantic(h))
return NULL;
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -2207,13 +2239,33 @@ static void __init gather_bootmem_prealloc(void)
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
+ nodemask_t *node_alloc_noretry;
+
+ if (!hstate_is_gigantic(h)) {
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * Ignore errors as lower level routines can deal with
+ * node_alloc_noretry == NULL. If this kmalloc fails at boot
+ * time, we are likely in bigger trouble.
+ */
+ node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
+ GFP_KERNEL);
+ } else {
+ /* allocations done at boot time */
+ node_alloc_noretry = NULL;
+ }
+
+ /* bit mask controlling how hard we retry per-node allocations */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_pool_huge_page(h,
- &node_states[N_MEMORY]))
+ &node_states[N_MEMORY],
+ node_alloc_noretry))
break;
cond_resched();
}
@@ -2225,6 +2277,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
+
+ kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
@@ -2323,6 +2377,17 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
+ NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
+
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * If we can not allocate the bit mask, do not attempt to allocate
+ * the requested huge pages.
+ */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
+ else
+ return -ENOMEM;
spin_lock(&hugetlb_lock);
@@ -2356,6 +2421,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
if (count > persistent_huge_pages(h)) {
spin_unlock(&hugetlb_lock);
+ NODEMASK_FREE(node_alloc_noretry);
return -EINVAL;
}
/* Fall through to decrease pool */
@@ -2388,7 +2454,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
/* yield cpu to avoid soft lockup */
cond_resched();
- ret = alloc_pool_huge_page(h, nodes_allowed);
+ ret = alloc_pool_huge_page(h, nodes_allowed,
+ node_alloc_noretry);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -2429,6 +2496,8 @@ out:
h->max_huge_pages = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
+ NODEMASK_FREE(node_alloc_noretry);
+
return 0;
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 68c2f2f3c05b..f1930fa0b445 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -139,7 +139,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
if (!page_hcg || page_hcg != h_cg)
goto out;
- nr_pages = 1 << compound_order(page);
+ nr_pages = compound_nr(page);
if (!parent) {
parent = root_h_cgroup;
/* root has no limit */
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a787a319211e..fb1e15028ef0 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -35,6 +35,6 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
- .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
+ .cpu_bitmap = CPU_BITS_NONE,
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index e32390802fd3..0d5f720c75ab 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
-static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 95d16a42db6b..6814d6d6a023 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -304,7 +304,6 @@ size_t kasan_metadata_size(struct kmem_cache *cache)
struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
const void *object)
{
- BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
return (void *)object + cache->kasan_info.alloc_meta_offset;
}
@@ -315,14 +314,31 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
return (void *)object + cache->kasan_info.free_meta_offset;
}
+
+static void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ u8 idx = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ idx = alloc_meta->free_track_idx;
+ alloc_meta->free_pointer_tag[idx] = tag;
+ alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+ set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
void kasan_poison_slab(struct page *page)
{
unsigned long i;
- for (i = 0; i < (1 << compound_order(page)); i++)
+ for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
- kasan_poison_shadow(page_address(page),
- PAGE_SIZE << compound_order(page),
+ kasan_poison_shadow(page_address(page), page_size(page),
KASAN_KMALLOC_REDZONE);
}
@@ -452,7 +468,8 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
unlikely(!(cache->flags & SLAB_KASAN)))
return false;
- set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT);
+ kasan_set_free_info(cache, object, tag);
+
quarantine_put(get_free_info(cache, object), cache);
return IS_ENABLED(CONFIG_KASAN_GENERIC);
@@ -524,7 +541,7 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
page = virt_to_page(ptr);
redzone_start = round_up((unsigned long)(ptr + size),
KASAN_SHADOW_SCALE_SIZE);
- redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
+ redzone_end = (unsigned long)ptr + page_size(page);
kasan_unpoison_shadow(ptr, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
@@ -560,8 +577,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip)
kasan_report_invalid_free(ptr, ip);
return;
}
- kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
- KASAN_FREE_PAGE);
+ kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE);
} else {
__kasan_slab_free(page->slab_cache, ptr, ip, false);
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 014f19e76247..35cff6bbb716 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -95,9 +95,19 @@ struct kasan_track {
depot_stack_handle_t stack;
};
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+#define KASAN_NR_FREE_STACKS 5
+#else
+#define KASAN_NR_FREE_STACKS 1
+#endif
+
struct kasan_alloc_meta {
struct kasan_track alloc_track;
- struct kasan_track free_track;
+ struct kasan_track free_track[KASAN_NR_FREE_STACKS];
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
+ u8 free_track_idx;
+#endif
};
struct qlist_node {
@@ -146,6 +156,8 @@ void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip);
+struct page *kasan_addr_to_page(const void *addr);
+
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 0e5f965f1882..621782100eaa 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -111,7 +111,7 @@ static void print_track(struct kasan_track *track, const char *prefix)
}
}
-static struct page *addr_to_page(const void *addr)
+struct page *kasan_addr_to_page(const void *addr)
{
if ((addr >= (void *)PAGE_OFFSET) &&
(addr < high_memory))
@@ -151,15 +151,38 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
(void *)(object_addr + cache->object_size));
}
+static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ int i = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ break;
+ }
+ if (i == KASAN_NR_FREE_STACKS)
+ i = alloc_meta->free_track_idx;
+#endif
+
+ return &alloc_meta->free_track[i];
+}
+
static void describe_object(struct kmem_cache *cache, void *object,
- const void *addr)
+ const void *addr, u8 tag)
{
struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
if (cache->flags & SLAB_KASAN) {
+ struct kasan_track *free_track;
+
print_track(&alloc_info->alloc_track, "Allocated");
pr_err("\n");
- print_track(&alloc_info->free_track, "Freed");
+ free_track = kasan_get_free_track(cache, object, tag);
+ print_track(free_track, "Freed");
pr_err("\n");
}
@@ -344,9 +367,9 @@ static void print_address_stack_frame(const void *addr)
print_decoded_frame_descr(frame_descr);
}
-static void print_address_description(void *addr)
+static void print_address_description(void *addr, u8 tag)
{
- struct page *page = addr_to_page(addr);
+ struct page *page = kasan_addr_to_page(addr);
dump_stack();
pr_err("\n");
@@ -355,7 +378,7 @@ static void print_address_description(void *addr)
struct kmem_cache *cache = page->slab_cache;
void *object = nearest_obj(cache, page, addr);
- describe_object(cache, object, addr);
+ describe_object(cache, object, addr, tag);
}
if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
@@ -435,13 +458,14 @@ static bool report_enabled(void)
void kasan_report_invalid_free(void *object, unsigned long ip)
{
unsigned long flags;
+ u8 tag = get_tag(object);
+ object = reset_tag(object);
start_report(&flags);
pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
- print_tags(get_tag(object), reset_tag(object));
- object = reset_tag(object);
+ print_tags(tag, object);
pr_err("\n");
- print_address_description(object);
+ print_address_description(object, tag);
pr_err("\n");
print_shadow_for_address(object);
end_report(&flags);
@@ -479,7 +503,7 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon
pr_err("\n");
if (addr_has_shadow(untagged_addr)) {
- print_address_description(untagged_addr);
+ print_address_description(untagged_addr, get_tag(tagged_addr));
pr_err("\n");
print_shadow_for_address(info.first_bad_addr);
} else {
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
index 8eaf5f722271..969ae08f59d7 100644
--- a/mm/kasan/tags_report.c
+++ b/mm/kasan/tags_report.c
@@ -36,6 +36,30 @@
const char *get_bug_type(struct kasan_access_info *info)
{
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ struct kasan_alloc_meta *alloc_meta;
+ struct kmem_cache *cache;
+ struct page *page;
+ const void *addr;
+ void *object;
+ u8 tag;
+ int i;
+
+ tag = get_tag(info->access_addr);
+ addr = reset_tag(info->access_addr);
+ page = kasan_addr_to_page(addr);
+ if (page && PageSlab(page)) {
+ cache = page->slab_cache;
+ object = nearest_obj(cache, page, (void *)addr);
+ alloc_meta = get_alloc_info(cache, object);
+
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++)
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ return "use-after-free";
+ return "out-of-bounds";
+ }
+
+#endif
return "invalid-access";
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eaaa21b23215..00cec6a127aa 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -48,6 +48,7 @@ enum scan_result {
SCAN_CGROUP_CHARGE_FAIL,
SCAN_EXCEED_SWAP_PTE,
SCAN_TRUNCATED,
+ SCAN_PAGE_HAS_PRIVATE,
};
#define CREATE_TRACE_POINTS
@@ -76,6 +77,8 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __read_mostly;
+#define MAX_PTE_MAPPED_THP 8
+
/**
* struct mm_slot - hash lookup from mm to mm_slot
* @hash: hash collision list
@@ -86,6 +89,10 @@ struct mm_slot {
struct hlist_node hash;
struct list_head mm_node;
struct mm_struct *mm;
+
+ /* pte-mapped THP in this mm */
+ int nr_pte_mapped_thp;
+ unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};
/**
@@ -404,7 +411,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
(vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
- if (shmem_file(vma->vm_file)) {
+
+ if (shmem_file(vma->vm_file) ||
+ (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ vma->vm_file &&
+ (vm_flags & VM_DENYWRITE))) {
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
return false;
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
@@ -456,8 +467,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long hstart, hend;
/*
- * khugepaged does not yet work on non-shmem files or special
- * mappings. And file-private shmem THP is not supported.
+ * khugepaged only supports read-only files for non-shmem files.
+ * khugepaged does not yet work on special mappings. And
+ * file-private shmem THP is not supported.
*/
if (!hugepage_vma_check(vma, vm_flags))
return 0;
@@ -1248,6 +1260,159 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
}
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+/*
+ * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
+ * khugepaged should try to collapse the page table.
+ */
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct mm_slot *mm_slot;
+
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+ mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+ spin_unlock(&khugepaged_mm_lock);
+ return 0;
+}
+
+/**
+ * Try to collapse a pte-mapped THP for mm at address haddr.
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so the THP can refault in with
+ * as pmd-mapped.
+ */
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = find_vma(mm, haddr);
+ struct page *hpage = NULL;
+ pte_t *start_pte, *pte;
+ pmd_t *pmd, _pmd;
+ spinlock_t *ptl;
+ int count = 0;
+ int i;
+
+ if (!vma || !vma->vm_file ||
+ vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+ return;
+
+ /*
+ * This vm_flags may not have VM_HUGEPAGE if the page was not
+ * collapsed by this mm. But we can still collapse if the page is
+ * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
+ * will not fail the vma for missing VM_HUGEPAGE
+ */
+ if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+ return;
+
+ pmd = mm_find_pmd(mm, haddr);
+ if (!pmd)
+ return;
+
+ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+
+ /* step 1: check all mapped PTEs are to the right huge page */
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ struct page *page;
+
+ /* empty pte, skip */
+ if (pte_none(*pte))
+ continue;
+
+ /* page swapped out, abort */
+ if (!pte_present(*pte))
+ goto abort;
+
+ page = vm_normal_page(vma, addr, *pte);
+
+ if (!page || !PageCompound(page))
+ goto abort;
+
+ if (!hpage) {
+ hpage = compound_head(page);
+ /*
+ * The mapping of the THP should not change.
+ *
+ * Note that uprobe, debugger, or MAP_PRIVATE may
+ * change the page table, but the new page will
+ * not pass PageCompound() check.
+ */
+ if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
+ goto abort;
+ }
+
+ /*
+ * Confirm the page maps to the correct subpage.
+ *
+ * Note that uprobe, debugger, or MAP_PRIVATE may change
+ * the page table, but the new page will not pass
+ * PageCompound() check.
+ */
+ if (WARN_ON(hpage + i != page))
+ goto abort;
+ count++;
+ }
+
+ /* step 2: adjust rmap */
+ for (i = 0, addr = haddr, pte = start_pte;
+ i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ struct page *page;
+
+ if (pte_none(*pte))
+ continue;
+ page = vm_normal_page(vma, addr, *pte);
+ page_remove_rmap(page, false);
+ }
+
+ pte_unmap_unlock(start_pte, ptl);
+
+ /* step 3: set proper refcount and mm_counters. */
+ if (hpage) {
+ page_ref_sub(hpage, count);
+ add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+ }
+
+ /* step 4: collapse pmd */
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ return;
+
+abort:
+ pte_unmap_unlock(start_pte, ptl);
+}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+ struct mm_struct *mm = mm_slot->mm;
+ int i;
+
+ if (likely(mm_slot->nr_pte_mapped_thp == 0))
+ return 0;
+
+ if (!down_write_trylock(&mm->mmap_sem))
+ return -EBUSY;
+
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto out;
+
+ for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
+ collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+
+out:
+ mm_slot->nr_pte_mapped_thp = 0;
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
@@ -1256,7 +1421,22 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- /* probably overkill */
+ /*
+ * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+ * got written to. These VMAs are likely not worth investing
+ * down_write(mmap_sem) as PMD-mapping is likely to be split
+ * later.
+ *
+ * Not that vma->anon_vma check is racy: it can be set up after
+ * the check but before we took mmap_sem by the fault path.
+ * But page lock would prevent establishing any new ptes of the
+ * page, so we are safe.
+ *
+ * An alternative would be drop the check, but check that page
+ * table is clear before calling pmdp_collapse_flush() under
+ * ptl. It has higher chance to recover THP for the VMA, but
+ * has higher cost too.
+ */
if (vma->anon_vma)
continue;
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -1269,9 +1449,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
continue;
/*
* We need exclusive mmap_sem to retract page table.
- * If trylock fails we would end up with pte-mapped THP after
- * re-fault. Not ideal, but it's more important to not disturb
- * the system too much.
+ *
+ * We use trylock due to lock inversion: we need to acquire
+ * mmap_sem while holding page lock. Fault path does it in
+ * reverse order. Trylock is a way to avoid deadlock.
*/
if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
@@ -1281,18 +1462,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
up_write(&vma->vm_mm->mmap_sem);
mm_dec_nr_ptes(vma->vm_mm);
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ } else {
+ /* Try again later */
+ khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
}
}
i_mmap_unlock_write(mapping);
}
/**
- * collapse_shmem - collapse small tmpfs/shmem pages into huge one.
+ * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
* - scan page cache replacing old pages with the new one
- * + swap in pages if necessary;
+ * + swap/gup in pages if necessary;
* + fill in gaps;
* + keep old pages around in case rollback is required;
* - if replacing succeeds:
@@ -1304,10 +1488,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* + restore gaps in the page cache;
* + unlock and free huge page;
*/
-static void collapse_shmem(struct mm_struct *mm,
- struct address_space *mapping, pgoff_t start,
+static void collapse_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start,
struct page **hpage, int node)
{
+ struct address_space *mapping = file->f_mapping;
gfp_t gfp;
struct page *new_page;
struct mem_cgroup *memcg;
@@ -1315,7 +1500,9 @@ static void collapse_shmem(struct mm_struct *mm,
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
+ bool is_shmem = shmem_file(file);
+ VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
/* Only allocate from the target node */
@@ -1347,7 +1534,8 @@ static void collapse_shmem(struct mm_struct *mm,
} while (1);
__SetPageLocked(new_page);
- __SetPageSwapBacked(new_page);
+ if (is_shmem)
+ __SetPageSwapBacked(new_page);
new_page->index = start;
new_page->mapping = mapping;
@@ -1362,41 +1550,75 @@ static void collapse_shmem(struct mm_struct *mm,
struct page *page = xas_next(&xas);
VM_BUG_ON(index != xas.xa_index);
- if (!page) {
- /*
- * Stop if extent has been truncated or hole-punched,
- * and is now completely empty.
- */
- if (index == start) {
- if (!xas_next_entry(&xas, end - 1)) {
- result = SCAN_TRUNCATED;
+ if (is_shmem) {
+ if (!page) {
+ /*
+ * Stop if extent has been truncated or
+ * hole-punched, and is now completely
+ * empty.
+ */
+ if (index == start) {
+ if (!xas_next_entry(&xas, end - 1)) {
+ result = SCAN_TRUNCATED;
+ goto xa_locked;
+ }
+ xas_set(&xas, index);
+ }
+ if (!shmem_charge(mapping->host, 1)) {
+ result = SCAN_FAIL;
goto xa_locked;
}
- xas_set(&xas, index);
+ xas_store(&xas, new_page);
+ nr_none++;
+ continue;
}
- if (!shmem_charge(mapping->host, 1)) {
- result = SCAN_FAIL;
+
+ if (xa_is_value(page) || !PageUptodate(page)) {
+ xas_unlock_irq(&xas);
+ /* swap in or instantiate fallocated page */
+ if (shmem_getpage(mapping->host, index, &page,
+ SGP_NOHUGE)) {
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ }
+ } else if (trylock_page(page)) {
+ get_page(page);
+ xas_unlock_irq(&xas);
+ } else {
+ result = SCAN_PAGE_LOCK;
goto xa_locked;
}
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
- nr_none++;
- continue;
- }
-
- if (xa_is_value(page) || !PageUptodate(page)) {
- xas_unlock_irq(&xas);
- /* swap in or instantiate fallocated page */
- if (shmem_getpage(mapping->host, index, &page,
- SGP_NOHUGE)) {
+ } else { /* !is_shmem */
+ if (!page || xa_is_value(page)) {
+ xas_unlock_irq(&xas);
+ page_cache_sync_readahead(mapping, &file->f_ra,
+ file, index,
+ PAGE_SIZE);
+ /* drain pagevecs to help isolate_lru_page() */
+ lru_add_drain();
+ page = find_lock_page(mapping, index);
+ if (unlikely(page == NULL)) {
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ }
+ } else if (!PageUptodate(page)) {
+ xas_unlock_irq(&xas);
+ wait_on_page_locked(page);
+ if (!trylock_page(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto xa_unlocked;
+ }
+ get_page(page);
+ } else if (PageDirty(page)) {
result = SCAN_FAIL;
- goto xa_unlocked;
+ goto xa_locked;
+ } else if (trylock_page(page)) {
+ get_page(page);
+ xas_unlock_irq(&xas);
+ } else {
+ result = SCAN_PAGE_LOCK;
+ goto xa_locked;
}
- } else if (trylock_page(page)) {
- get_page(page);
- xas_unlock_irq(&xas);
- } else {
- result = SCAN_PAGE_LOCK;
- goto xa_locked;
}
/*
@@ -1425,6 +1647,12 @@ static void collapse_shmem(struct mm_struct *mm,
goto out_unlock;
}
+ if (page_has_private(page) &&
+ !try_to_release_page(page, GFP_KERNEL)) {
+ result = SCAN_PAGE_HAS_PRIVATE;
+ goto out_unlock;
+ }
+
if (page_mapped(page))
unmap_mapping_pages(mapping, index, 1, false);
@@ -1454,7 +1682,7 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+ xas_store(&xas, new_page);
continue;
out_unlock:
unlock_page(page);
@@ -1462,12 +1690,20 @@ out_unlock:
goto xa_unlocked;
}
- __inc_node_page_state(new_page, NR_SHMEM_THPS);
+ if (is_shmem)
+ __inc_node_page_state(new_page, NR_SHMEM_THPS);
+ else {
+ __inc_node_page_state(new_page, NR_FILE_THPS);
+ filemap_nr_thps_inc(mapping);
+ }
+
if (nr_none) {
struct zone *zone = page_zone(new_page);
__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
- __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
+ if (is_shmem)
+ __mod_node_page_state(zone->zone_pgdat,
+ NR_SHMEM, nr_none);
}
xa_locked:
@@ -1505,10 +1741,15 @@ xa_unlocked:
SetPageUptodate(new_page);
page_ref_add(new_page, HPAGE_PMD_NR - 1);
- set_page_dirty(new_page);
mem_cgroup_commit_charge(new_page, memcg, false, true);
+
+ if (is_shmem) {
+ set_page_dirty(new_page);
+ lru_cache_add_anon(new_page);
+ } else {
+ lru_cache_add_file(new_page);
+ }
count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
- lru_cache_add_anon(new_page);
/*
* Remove pte page tables, so we can re-fault the page as huge.
@@ -1523,7 +1764,9 @@ xa_unlocked:
/* Something went wrong: roll back page cache changes */
xas_lock_irq(&xas);
mapping->nrpages -= nr_none;
- shmem_uncharge(mapping->host, nr_none);
+
+ if (is_shmem)
+ shmem_uncharge(mapping->host, nr_none);
xas_set(&xas, start);
xas_for_each(&xas, page, end - 1) {
@@ -1563,11 +1806,11 @@ out:
/* TODO: tracepoints */
}
-static void khugepaged_scan_shmem(struct mm_struct *mm,
- struct address_space *mapping,
- pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage)
{
struct page *page = NULL;
+ struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
int present, swap;
int node = NUMA_NO_NODE;
@@ -1606,7 +1849,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
break;
}
- if (page_count(page) != 1 + page_mapcount(page)) {
+ if (page_count(page) !=
+ 1 + page_mapcount(page) + page_has_private(page)) {
result = SCAN_PAGE_COUNT;
break;
}
@@ -1631,19 +1875,23 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
result = SCAN_EXCEED_NONE_PTE;
} else {
node = khugepaged_find_target_node();
- collapse_shmem(mm, mapping, start, hpage, node);
+ collapse_file(mm, file, start, hpage, node);
}
}
/* TODO: tracepoints */
}
#else
-static void khugepaged_scan_shmem(struct mm_struct *mm,
- struct address_space *mapping,
- pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage)
{
BUILD_BUG();
}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+ return 0;
+}
#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
@@ -1668,6 +1916,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
+ khugepaged_collapse_pte_mapped_thps(mm_slot);
mm = mm_slot->mm;
/*
@@ -1713,17 +1962,18 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (shmem_file(vma->vm_file)) {
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
struct file *file;
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
- if (!shmem_huge_enabled(vma))
+
+ if (shmem_file(vma->vm_file)
+ && !shmem_huge_enabled(vma))
goto skip;
file = get_file(vma->vm_file);
up_read(&mm->mmap_sem);
ret = 1;
- khugepaged_scan_shmem(mm, file->f_mapping,
- pgoff, hpage);
+ khugepaged_scan_file(mm, file, pgoff, hpage);
fput(file);
} else {
ret = khugepaged_scan_pmd(mm, vma,
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f6e602918dac..1f74f8bcb4eb 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -168,6 +168,8 @@ struct kmemleak_object {
#define OBJECT_REPORTED (1 << 1)
/* flag set to not scan the object */
#define OBJECT_NO_SCAN (1 << 2)
+/* flag set to fully scan the object when scan_area allocation failed */
+#define OBJECT_FULL_SCAN (1 << 3)
#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
@@ -183,6 +185,10 @@ struct kmemleak_object {
static LIST_HEAD(object_list);
/* the list of gray-colored objects (see color_gray comment below) */
static LIST_HEAD(gray_list);
+/* memory pool allocation */
+static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE];
+static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
+static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
/* rw_lock protecting the access to object_list and object_tree_root */
@@ -193,13 +199,11 @@ static struct kmem_cache *object_cache;
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static int kmemleak_enabled;
+static int kmemleak_enabled = 1;
/* same as above but only for the kmemleak_free() callback */
-static int kmemleak_free_enabled;
+static int kmemleak_free_enabled = 1;
/* set in the late_initcall if there were no errors */
static int kmemleak_initialized;
-/* enables or disables early logging of the memory operations */
-static int kmemleak_early_log = 1;
/* set if a kmemleak warning was issued */
static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
@@ -227,49 +231,6 @@ static bool kmemleak_found_leaks;
static bool kmemleak_verbose;
module_param_named(verbose, kmemleak_verbose, bool, 0600);
-/*
- * Early object allocation/freeing logging. Kmemleak is initialized after the
- * kernel allocator. However, both the kernel allocator and kmemleak may
- * allocate memory blocks which need to be tracked. Kmemleak defines an
- * arbitrary buffer to hold the allocation/freeing information before it is
- * fully initialized.
- */
-
-/* kmemleak operation type for early logging */
-enum {
- KMEMLEAK_ALLOC,
- KMEMLEAK_ALLOC_PERCPU,
- KMEMLEAK_FREE,
- KMEMLEAK_FREE_PART,
- KMEMLEAK_FREE_PERCPU,
- KMEMLEAK_NOT_LEAK,
- KMEMLEAK_IGNORE,
- KMEMLEAK_SCAN_AREA,
- KMEMLEAK_NO_SCAN,
- KMEMLEAK_SET_EXCESS_REF
-};
-
-/*
- * Structure holding the information passed to kmemleak callbacks during the
- * early logging.
- */
-struct early_log {
- int op_type; /* kmemleak operation type */
- int min_count; /* minimum reference count */
- const void *ptr; /* allocated/freed memory block */
- union {
- size_t size; /* memory block size */
- unsigned long excess_ref; /* surplus reference passing */
- };
- unsigned long trace[MAX_TRACE]; /* stack trace */
- unsigned int trace_len; /* stack trace length */
-};
-
-/* early logging buffer and current position */
-static struct early_log
- early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
-static int crt_early_log __initdata;
-
static void kmemleak_disable(void);
/*
@@ -450,6 +411,54 @@ static int get_object(struct kmemleak_object *object)
}
/*
+ * Memory pool allocation and freeing. kmemleak_lock must not be held.
+ */
+static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ /* try the slab allocator first */
+ if (object_cache) {
+ object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+ if (object)
+ return object;
+ }
+
+ /* slab allocation failed, try the memory pool */
+ write_lock_irqsave(&kmemleak_lock, flags);
+ object = list_first_entry_or_null(&mem_pool_free_list,
+ typeof(*object), object_list);
+ if (object)
+ list_del(&object->object_list);
+ else if (mem_pool_free_count)
+ object = &mem_pool[--mem_pool_free_count];
+ else
+ pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+ write_unlock_irqrestore(&kmemleak_lock, flags);
+
+ return object;
+}
+
+/*
+ * Return the object to either the slab allocator or the memory pool.
+ */
+static void mem_pool_free(struct kmemleak_object *object)
+{
+ unsigned long flags;
+
+ if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) {
+ kmem_cache_free(object_cache, object);
+ return;
+ }
+
+ /* add the object to the memory pool free list */
+ write_lock_irqsave(&kmemleak_lock, flags);
+ list_add(&object->object_list, &mem_pool_free_list);
+ write_unlock_irqrestore(&kmemleak_lock, flags);
+}
+
+/*
* RCU callback to free a kmemleak_object.
*/
static void free_object_rcu(struct rcu_head *rcu)
@@ -467,7 +476,7 @@ static void free_object_rcu(struct rcu_head *rcu)
hlist_del(&area->node);
kmem_cache_free(scan_area_cache, area);
}
- kmem_cache_free(object_cache, object);
+ mem_pool_free(object);
}
/*
@@ -485,7 +494,15 @@ static void put_object(struct kmemleak_object *object)
/* should only get here after delete_object was called */
WARN_ON(object->flags & OBJECT_ALLOCATED);
- call_rcu(&object->rcu, free_object_rcu);
+ /*
+ * It may be too early for the RCU callbacks, however, there is no
+ * concurrent object_list traversal when !object_cache and all objects
+ * came from the memory pool. Free the object directly.
+ */
+ if (object_cache)
+ call_rcu(&object->rcu, free_object_rcu);
+ else
+ free_object_rcu(&object->rcu);
}
/*
@@ -550,7 +567,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
struct rb_node **link, *rb_parent;
unsigned long untagged_ptr;
- object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+ object = mem_pool_alloc(gfp);
if (!object) {
pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
@@ -689,9 +706,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
/*
* Create one or two objects that may result from the memory block
* split. Note that partial freeing is only done by free_bootmem() and
- * this happens before kmemleak_init() is called. The path below is
- * only executed during early log recording in kmemleak_init(), so
- * GFP_KERNEL is enough.
+ * this happens before kmemleak_init() is called.
*/
start = object->pointer;
end = object->pointer + object->size;
@@ -763,7 +778,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
{
unsigned long flags;
struct kmemleak_object *object;
- struct kmemleak_scan_area *area;
+ struct kmemleak_scan_area *area = NULL;
object = find_and_get_object(ptr, 1);
if (!object) {
@@ -772,13 +787,16 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
return;
}
- area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
- if (!area) {
- pr_warn("Cannot allocate a scan area\n");
- goto out;
- }
+ if (scan_area_cache)
+ area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
spin_lock_irqsave(&object->lock, flags);
+ if (!area) {
+ pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
+ /* mark the object for full scan to avoid false positives */
+ object->flags |= OBJECT_FULL_SCAN;
+ goto out_unlock;
+ }
if (size == SIZE_MAX) {
size = object->pointer + object->size - ptr;
} else if (ptr + size > object->pointer + object->size) {
@@ -795,7 +813,6 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
hlist_add_head(&area->node, &object->area_list);
out_unlock:
spin_unlock_irqrestore(&object->lock, flags);
-out:
put_object(object);
}
@@ -845,86 +862,6 @@ static void object_no_scan(unsigned long ptr)
put_object(object);
}
-/*
- * Log an early kmemleak_* call to the early_log buffer. These calls will be
- * processed later once kmemleak is fully initialized.
- */
-static void __init log_early(int op_type, const void *ptr, size_t size,
- int min_count)
-{
- unsigned long flags;
- struct early_log *log;
-
- if (kmemleak_error) {
- /* kmemleak stopped recording, just count the requests */
- crt_early_log++;
- return;
- }
-
- if (crt_early_log >= ARRAY_SIZE(early_log)) {
- crt_early_log++;
- kmemleak_disable();
- return;
- }
-
- /*
- * There is no need for locking since the kernel is still in UP mode
- * at this stage. Disabling the IRQs is enough.
- */
- local_irq_save(flags);
- log = &early_log[crt_early_log];
- log->op_type = op_type;
- log->ptr = ptr;
- log->size = size;
- log->min_count = min_count;
- log->trace_len = __save_stack_trace(log->trace);
- crt_early_log++;
- local_irq_restore(flags);
-}
-
-/*
- * Log an early allocated block and populate the stack trace.
- */
-static void early_alloc(struct early_log *log)
-{
- struct kmemleak_object *object;
- unsigned long flags;
- int i;
-
- if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr))
- return;
-
- /*
- * RCU locking needed to ensure object is not freed via put_object().
- */
- rcu_read_lock();
- object = create_object((unsigned long)log->ptr, log->size,
- log->min_count, GFP_ATOMIC);
- if (!object)
- goto out;
- spin_lock_irqsave(&object->lock, flags);
- for (i = 0; i < log->trace_len; i++)
- object->trace[i] = log->trace[i];
- object->trace_len = log->trace_len;
- spin_unlock_irqrestore(&object->lock, flags);
-out:
- rcu_read_unlock();
-}
-
-/*
- * Log an early allocated block and populate the stack trace.
- */
-static void early_alloc_percpu(struct early_log *log)
-{
- unsigned int cpu;
- const void __percpu *ptr = log->ptr;
-
- for_each_possible_cpu(cpu) {
- log->ptr = per_cpu_ptr(ptr, cpu);
- early_alloc(log);
- }
-}
-
/**
* kmemleak_alloc - register a newly allocated object
* @ptr: pointer to beginning of the object
@@ -946,8 +883,6 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
create_object((unsigned long)ptr, size, min_count, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
@@ -975,8 +910,6 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
for_each_possible_cpu(cpu)
create_object((unsigned long)per_cpu_ptr(ptr, cpu),
size, 0, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
@@ -1001,11 +934,6 @@ void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp
create_object((unsigned long)area->addr, size, 2, gfp);
object_set_excess_ref((unsigned long)area,
(unsigned long)area->addr);
- } else if (kmemleak_early_log) {
- log_early(KMEMLEAK_ALLOC, area->addr, size, 2);
- /* reusing early_log.size for storing area->addr */
- log_early(KMEMLEAK_SET_EXCESS_REF,
- area, (unsigned long)area->addr, 0);
}
}
EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
@@ -1023,8 +951,6 @@ void __ref kmemleak_free(const void *ptr)
if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free);
@@ -1043,8 +969,6 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
delete_object_part((unsigned long)ptr, size);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
@@ -1065,8 +989,6 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
for_each_possible_cpu(cpu)
delete_object_full((unsigned long)per_cpu_ptr(ptr,
cpu));
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
@@ -1117,8 +1039,6 @@ void __ref kmemleak_not_leak(const void *ptr)
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_gray_object((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_not_leak);
@@ -1137,8 +1057,6 @@ void __ref kmemleak_ignore(const void *ptr)
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_black_object((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_ignore);
@@ -1159,8 +1077,6 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
add_scan_area((unsigned long)ptr, size, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
}
EXPORT_SYMBOL(kmemleak_scan_area);
@@ -1179,8 +1095,6 @@ void __ref kmemleak_no_scan(const void *ptr)
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
object_no_scan((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_no_scan);
@@ -1408,7 +1322,8 @@ static void scan_object(struct kmemleak_object *object)
if (!(object->flags & OBJECT_ALLOCATED))
/* already freed object */
goto out;
- if (hlist_empty(&object->area_list)) {
+ if (hlist_empty(&object->area_list) ||
+ object->flags & OBJECT_FULL_SCAN) {
void *start = (void *)object->pointer;
void *end = (void *)(object->pointer + object->size);
void *next;
@@ -1966,7 +1881,6 @@ static void kmemleak_disable(void)
/* stop any memory operation tracing */
kmemleak_enabled = 0;
- kmemleak_early_log = 0;
/* check whether it is too early for a kernel thread */
if (kmemleak_initialized)
@@ -1994,20 +1908,11 @@ static int __init kmemleak_boot_config(char *str)
}
early_param("kmemleak", kmemleak_boot_config);
-static void __init print_log_trace(struct early_log *log)
-{
- pr_notice("Early log backtrace:\n");
- stack_trace_print(log->trace, log->trace_len, 2);
-}
-
/*
* Kmemleak initialization.
*/
void __init kmemleak_init(void)
{
- int i;
- unsigned long flags;
-
#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
if (!kmemleak_skip_disable) {
kmemleak_disable();
@@ -2015,28 +1920,15 @@ void __init kmemleak_init(void)
}
#endif
+ if (kmemleak_error)
+ return;
+
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
- if (crt_early_log > ARRAY_SIZE(early_log))
- pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n",
- crt_early_log);
-
- /* the kernel is still in UP mode, so disabling the IRQs is enough */
- local_irq_save(flags);
- kmemleak_early_log = 0;
- if (kmemleak_error) {
- local_irq_restore(flags);
- return;
- } else {
- kmemleak_enabled = 1;
- kmemleak_free_enabled = 1;
- }
- local_irq_restore(flags);
-
/* register the data/bss sections */
create_object((unsigned long)_sdata, _edata - _sdata,
KMEMLEAK_GREY, GFP_ATOMIC);
@@ -2047,57 +1939,6 @@ void __init kmemleak_init(void)
create_object((unsigned long)__start_ro_after_init,
__end_ro_after_init - __start_ro_after_init,
KMEMLEAK_GREY, GFP_ATOMIC);
-
- /*
- * This is the point where tracking allocations is safe. Automatic
- * scanning is started during the late initcall. Add the early logged
- * callbacks to the kmemleak infrastructure.
- */
- for (i = 0; i < crt_early_log; i++) {
- struct early_log *log = &early_log[i];
-
- switch (log->op_type) {
- case KMEMLEAK_ALLOC:
- early_alloc(log);
- break;
- case KMEMLEAK_ALLOC_PERCPU:
- early_alloc_percpu(log);
- break;
- case KMEMLEAK_FREE:
- kmemleak_free(log->ptr);
- break;
- case KMEMLEAK_FREE_PART:
- kmemleak_free_part(log->ptr, log->size);
- break;
- case KMEMLEAK_FREE_PERCPU:
- kmemleak_free_percpu(log->ptr);
- break;
- case KMEMLEAK_NOT_LEAK:
- kmemleak_not_leak(log->ptr);
- break;
- case KMEMLEAK_IGNORE:
- kmemleak_ignore(log->ptr);
- break;
- case KMEMLEAK_SCAN_AREA:
- kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
- break;
- case KMEMLEAK_NO_SCAN:
- kmemleak_no_scan(log->ptr);
- break;
- case KMEMLEAK_SET_EXCESS_REF:
- object_set_excess_ref((unsigned long)log->ptr,
- log->excess_ref);
- break;
- default:
- kmemleak_warn("Unknown early log operation: %d\n",
- log->op_type);
- }
-
- if (kmemleak_warning) {
- print_log_trace(log);
- kmemleak_warning = 0;
- }
- }
}
/*
@@ -2126,7 +1967,8 @@ static int __init kmemleak_late_init(void)
mutex_unlock(&scan_mutex);
}
- pr_info("Kernel memory leak detector initialized\n");
+ pr_info("Kernel memory leak detector initialized (mem pool size: %d)\n",
+ mem_pool_free_count);
return 0;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 3dc4346411e4..dbee2eb4dd05 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1029,24 +1029,6 @@ static u32 calc_checksum(struct page *page)
return checksum;
}
-static int memcmp_pages(struct page *page1, struct page *page2)
-{
- char *addr1, *addr2;
- int ret;
-
- addr1 = kmap_atomic(page1);
- addr2 = kmap_atomic(page2);
- ret = memcmp(addr1, addr2, PAGE_SIZE);
- kunmap_atomic(addr2);
- kunmap_atomic(addr1);
- return ret;
-}
-
-static inline int pages_identical(struct page *page1, struct page *page2)
-{
- return !memcmp_pages(page1, page2);
-}
-
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
diff --git a/mm/madvise.c b/mm/madvise.c
index afe2b015ea58..7ec7c8f6d5ab 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
+#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
@@ -30,6 +31,11 @@
#include "internal.h"
+struct madvise_walk_private {
+ struct mmu_gather *tlb;
+ bool pageout;
+};
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_sem for writing. Others, which simply traverse vmas, need
@@ -41,6 +47,8 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
case MADV_FREE:
return 0;
default:
@@ -106,28 +114,14 @@ static long madvise_behavior(struct vm_area_struct *vma,
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
break;
}
@@ -153,15 +147,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
goto out;
}
error = __split_vma(mm, vma, start, 1);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
}
if (end != vma->vm_end) {
@@ -170,15 +157,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
goto out;
}
error = __split_vma(mm, vma, end, 0);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
}
success:
@@ -186,6 +166,14 @@ success:
* vm_flags is protected by the mmap_sem held in write mode.
*/
vma->vm_flags = new_flags;
+
+out_convert_errno:
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
out:
return error;
}
@@ -299,6 +287,262 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct madvise_walk_private *private = walk->private;
+ struct mmu_gather *tlb = private->tlb;
+ bool pageout = private->pageout;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page = NULL;
+ LIST_HEAD(page_list);
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t orig_pmd;
+ unsigned long next = pmd_addr_end(addr, end);
+
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto huge_unlock;
+
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ goto huge_unlock;
+ }
+
+ page = pmd_page(orig_pmd);
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ if (page_mapcount(page) != 1)
+ goto huge_unlock;
+
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ if (pmd_young(orig_pmd)) {
+ pmdp_invalidate(vma, addr, pmd);
+ orig_pmd = pmd_mkold(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (pageout) {
+ if (!isolate_lru_page(page))
+ list_add(&page->lru, &page_list);
+ } else
+ deactivate_page(page);
+huge_unlock:
+ spin_unlock(ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+regular_page:
+#endif
+ tlb_change_page_size(tlb, PAGE_SIZE);
+ orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /*
+ * Creating a THP page is expensive so split it only if we
+ * are sure it's worth. Split it if we are only owner.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (pte_young(ptent)) {
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ /*
+ * We are deactivating a page for accelerating reclaiming.
+ * VM couldn't reclaim the page unless we clear PG_young.
+ * As a side effect, it makes confuse idle-page tracking
+ * because they will miss recent referenced history.
+ */
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (pageout) {
+ if (!isolate_lru_page(page))
+ list_add(&page->lru, &page_list);
+ } else
+ deactivate_page(page);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
+ cond_resched();
+
+ return 0;
+}
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_walk_private walk_private = {
+ .tlb = tlb,
+ .pageout = false,
+ };
+
+ struct mm_walk cold_walk = {
+ .pmd_entry = madvise_cold_or_pageout_pte_range,
+ .mm = vma->vm_mm,
+ .private = &walk_private,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &cold_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_walk_private walk_private = {
+ .pageout = true,
+ .tlb = tlb,
+ };
+
+ struct mm_walk pageout_walk = {
+ .pmd_entry = madvise_cold_or_pageout_pte_range,
+ .mm = vma->vm_mm,
+ .private = &walk_private,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &pageout_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * paging out pagecache only for non-anonymous mappings that correspond
+ * to the files the calling process could (if tried) open for writing;
+ * otherwise we'd be including shared non-exclusive mappings, which
+ * opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ if (!can_do_pageout(vma))
+ return 0;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
+
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -503,7 +747,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
int behavior)
{
*prev = vma;
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
return -EINVAL;
if (!userfaultfd_remove(vma, start, end)) {
@@ -525,7 +769,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
return -ENOMEM;
}
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
return -EINVAL;
if (end > vma->vm_end) {
/*
@@ -679,6 +923,10 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_COLD:
+ return madvise_cold(vma, prev, start, end);
+ case MADV_PAGEOUT:
+ return madvise_pageout(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -700,6 +948,8 @@ madvise_behavior_valid(int behavior)
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_FREE:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6545aa4bbc0..d40ce6ee7263 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
@@ -317,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -440,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
}
}
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
- return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -756,15 +750,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update memcg */
__mod_memcg_state(memcg, idx, val);
+ /* Update lruvec */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup_per_node *pi;
- /*
- * Batch local counters to keep them in sync with
- * the hierarchical ones.
- */
- __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
atomic_long_add(x, &pi->lruvec_stat[idx]);
x = 0;
@@ -1575,6 +1567,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return max;
}
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return page_counter_read(&memcg->memory);
+}
+
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
@@ -2272,21 +2269,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
memcg = stock->cached;
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
- continue;
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
- css_put(&memcg->css);
- continue;
- }
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (memcg && stock->nr_pages &&
+ mem_cgroup_is_descendant(memcg, root_memcg))
+ flush = true;
+ rcu_read_unlock();
+
+ if (flush &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
else
schedule_work_on(cpu, &stock->work);
}
- css_put(&memcg->css);
}
put_cpu();
mutex_unlock(&percpu_charge_mutex);
@@ -2361,11 +2359,67 @@ static void high_work_func(struct work_struct *work)
}
/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to still cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these either side of the exponentiation to
+ * maintain precision and scale to a reasonable number of jiffies (see the table
+ * below.
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ * overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
+ * to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaviour cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ * +-------+------------------------+
+ * | usage | time to allocate in ms |
+ * +-------+------------------------+
+ * | 100M | 0 |
+ * | 101M | 6 |
+ * | 102M | 25 |
+ * | 103M | 57 |
+ * | 104M | 102 |
+ * | 105M | 159 |
+ * | 106M | 230 |
+ * | 107M | 313 |
+ * | 108M | 409 |
+ * | 109M | 518 |
+ * | 110M | 639 |
+ * | 111M | 774 |
+ * | 112M | 921 |
+ * | 113M | 1081 |
+ * | 114M | 1254 |
+ * | 115M | 1439 |
+ * | 116M | 1638 |
+ * | 117M | 1849 |
+ * | 118M | 2000 |
+ * | 119M | 2000 |
+ * | 120M | 2000 |
+ * +-------+------------------------+
+ */
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
+ #define MEMCG_DELAY_SCALING_SHIFT 14
+
+/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
*/
void mem_cgroup_handle_over_high(void)
{
+ unsigned long usage, high, clamped_high;
+ unsigned long pflags;
+ unsigned long penalty_jiffies, overage;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
struct mem_cgroup *memcg;
@@ -2374,8 +2428,75 @@ void mem_cgroup_handle_over_high(void)
memcg = get_mem_cgroup_from_mm(current->mm);
reclaim_high(memcg, nr_pages, GFP_KERNEL);
- css_put(&memcg->css);
current->memcg_nr_pages_over_high = 0;
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ *
+ * We use overage compared to memory.high to calculate the number of
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
+ * fairly lenient on small overages, and increasingly harsh when the
+ * memcg in question makes it clear that it has no intention of stopping
+ * its crazy behaviour, so we exponentially increase the delay based on
+ * overage amount.
+ */
+
+ usage = page_counter_read(&memcg->memory);
+ high = READ_ONCE(memcg->high);
+
+ if (usage <= high)
+ goto out;
+
+ /*
+ * Prevent division by 0 in overage calculation by acting as if it was a
+ * threshold of 1 page
+ */
+ clamped_high = max(high, 1UL);
+
+ overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+ clamped_high);
+
+ penalty_jiffies = ((u64)overage * overage * HZ)
+ >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+ /*
+ * Factor in the task's own contribution to the overage, such that four
+ * N-sized allocations are throttled approximately the same as one
+ * 4N-sized allocation.
+ *
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+ * larger the current charge patch is than that.
+ */
+ penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+ /*
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
+ * that it's not even worth doing, in an attempt to be nice to those who
+ * go only a small amount over their memory.high value and maybe haven't
+ * been aggressively reclaimed enough yet.
+ */
+ if (penalty_jiffies <= HZ / 100)
+ goto out;
+
+ /*
+ * If we exit early, we're guaranteed to die (since
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+ * need to account for any ill-begotten jiffies to pay them off later.
+ */
+ psi_memstall_enter(&pflags);
+ schedule_timeout_killable(penalty_jiffies);
+ psi_memstall_leave(&pflags);
+
+out:
+ css_put(&memcg->css);
}
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -3264,37 +3385,49 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
{
unsigned long stat[MEMCG_NR_STAT];
struct mem_cgroup *mi;
int node, cpu, i;
+ int min_idx, max_idx;
+
+ if (slab_only) {
+ min_idx = NR_SLAB_RECLAIMABLE;
+ max_idx = NR_SLAB_UNRECLAIMABLE;
+ } else {
+ min_idx = 0;
+ max_idx = MEMCG_NR_STAT;
+ }
- for (i = 0; i < MEMCG_NR_STAT; i++)
+ for (i = min_idx; i < max_idx; i++)
stat[i] = 0;
for_each_online_cpu(cpu)
- for (i = 0; i < MEMCG_NR_STAT; i++)
+ for (i = min_idx; i < max_idx; i++)
stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = 0; i < MEMCG_NR_STAT; i++)
+ for (i = min_idx; i < max_idx; i++)
atomic_long_add(stat[i], &mi->vmstats[i]);
+ if (!slab_only)
+ max_idx = NR_VM_NODE_STAT_ITEMS;
+
for_each_node(node) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
struct mem_cgroup_per_node *pi;
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ for (i = min_idx; i < max_idx; i++)
stat[i] = 0;
for_each_online_cpu(cpu)
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ for (i = min_idx; i < max_idx; i++)
stat[i] += raw_cpu_read(
pn->lruvec_stat_cpu->count[i]);
for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ for (i = min_idx; i < max_idx; i++)
atomic_long_add(stat[i], &pi->lruvec_stat[i]);
}
}
@@ -3367,7 +3500,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (!parent)
parent = root_mem_cgroup;
+ /*
+ * Deactivate and reparent kmem_caches. Then flush percpu
+ * slab statistics to have precise values at the parent and
+ * all ancestor levels. It's required to keep slab stats
+ * accurate after the reparenting of kmem_caches.
+ */
memcg_deactivate_kmem_caches(memcg, parent);
+ memcg_flush_percpu_vmstats(memcg, true);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
@@ -4865,7 +5005,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
* Flush percpu vmstats and vmevents to guarantee the value correctness
* on parent's and all ancestor levels.
*/
- memcg_flush_percpu_vmstats(memcg);
+ memcg_flush_percpu_vmstats(memcg, false);
memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
@@ -4933,6 +5073,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->cgwb_frn[i].done =
__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+ memcg->deferred_split_queue.split_queue_len = 0;
+#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
fail:
@@ -5311,6 +5456,14 @@ static int mem_cgroup_move_account(struct page *page,
__mod_memcg_state(to, NR_WRITEBACK, nr_pages);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (compound && !list_empty(page_deferred_list(page))) {
+ spin_lock(&from->deferred_split_queue.split_queue_lock);
+ list_del_init(page_deferred_list(page));
+ from->deferred_split_queue.split_queue_len--;
+ spin_unlock(&from->deferred_split_queue.split_queue_lock);
+ }
+#endif
/*
* It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
@@ -5319,6 +5472,17 @@ static int mem_cgroup_move_account(struct page *page,
/* caller should have done css_get */
page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (compound && list_empty(page_deferred_list(page))) {
+ spin_lock(&to->deferred_split_queue.split_queue_lock);
+ list_add_tail(page_deferred_list(page),
+ &to->deferred_split_queue.split_queue);
+ to->deferred_split_queue.split_queue_len++;
+ spin_unlock(&to->deferred_split_queue.split_queue_lock);
+ }
+#endif
+
spin_unlock_irqrestore(&from->move_lock, flags);
ret = 0;
@@ -6489,7 +6653,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
unsigned int nr_pages = 1;
if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
+ nr_pages = compound_nr(page);
ug->nr_huge += nr_pages;
}
if (PageAnon(page))
@@ -6501,7 +6665,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
}
ug->pgpgout++;
} else {
- ug->nr_kmem += 1 << compound_order(page);
+ ug->nr_kmem += compound_nr(page);
__ClearPageKmemcg(page);
}
diff --git a/mm/memfd.c b/mm/memfd.c
index 650e65a46b9c..2647c898990c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas)
xas_for_each(xas, page, ULONG_MAX) {
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas->xa_index);
if (page_count(page) - page_mapcount(page) > 1)
xas_set_mark(xas, MEMFD_TAG_PINNED);
@@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
bool clear = true;
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas.xa_index);
if (page_count(page) - page_mapcount(page) != 1) {
/*
* On the last scan, we clean up all those tags
diff --git a/mm/memory.c b/mm/memory.c
index b1dff75640b7..e0c232fe81d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1007,6 +1007,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
+ int progress = 0;
int force_flush = 0;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
@@ -1022,7 +1023,15 @@ again:
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
- pte_t ptent = *pte;
+ pte_t ptent;
+
+ if (progress++ >= 32) {
+ progress = 0;
+ if (need_resched())
+ break;
+ }
+
+ ptent = *pte;
if (pte_none(ptent))
continue;
@@ -1093,7 +1102,6 @@ again:
if (unlikely(details))
continue;
- entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry))
rss[MM_SWAPENTS]--;
else if (is_migration_entry(entry)) {
@@ -1124,8 +1132,11 @@ again:
if (force_flush) {
force_flush = 0;
tlb_flush_mmu(tlb);
- if (addr != end)
- goto again;
+ }
+
+ if (addr != end) {
+ progress = 0;
+ goto again;
}
return addr;
@@ -2877,7 +2888,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
@@ -2891,6 +2901,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
}
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c73f09913165..49f7bf91c25a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -632,33 +632,30 @@ static void generic_online_page(struct page *page, unsigned int order)
#endif
}
-static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
-{
- unsigned long end = start + nr_pages;
- int order, onlined_pages = 0;
-
- while (start < end) {
- order = min(MAX_ORDER - 1,
- get_order(PFN_PHYS(end) - PFN_PHYS(start)));
- (*online_page_callback)(pfn_to_page(start), order);
-
- onlined_pages += (1UL << order);
- start += (1UL << order);
- }
- return onlined_pages;
-}
-
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
- unsigned long onlined_pages = *(unsigned long *)arg;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn;
+ int order;
- if (PageReserved(pfn_to_page(start_pfn)))
- onlined_pages += online_pages_blocks(start_pfn, nr_pages);
+ /*
+ * Online the pages. The callback might decide to keep some pages
+ * PG_reserved (to add them to the buddy later), but we still account
+ * them as being online/belonging to this zone ("present").
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
+ order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
+ /* __free_pages_core() wants pfns to be aligned to the order */
+ if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
+ order = 0;
+ (*online_page_callback)(pfn_to_page(pfn), order);
+ }
- online_mem_sections(start_pfn, start_pfn + nr_pages);
+ /* mark all involved sections as online */
+ online_mem_sections(start_pfn, end_pfn);
- *(unsigned long *)arg = onlined_pages;
+ *(unsigned long *)arg += nr_pages;
return 0;
}
@@ -714,8 +711,13 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
pgdat->node_start_pfn = start_pfn;
pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
-}
+}
+/*
+ * Associate the pfn range with the given zone, initializing the memmaps
+ * and resizing the pgdat/zone data to span the added pages. After this
+ * call, all affected pages are PG_reserved.
+ */
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap)
{
@@ -804,20 +806,6 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-/*
- * Associates the given pfn range with the given node and the zone appropriate
- * for the given online type.
- */
-static struct zone * __meminit move_pfn_range(int online_type, int nid,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- struct zone *zone;
-
- zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
- move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
- return zone;
-}
-
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long flags;
@@ -840,7 +828,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
put_device(&mem->dev);
/* associate pfn range with the zone */
- zone = move_pfn_range(online_type, nid, pfn, nr_pages);
+ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -864,6 +853,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+ /* not a single memory resource was applicable */
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
goto failed_addition;
@@ -877,27 +867,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
shuffle_zone(zone);
- if (onlined_pages) {
- node_states_set_node(nid, &arg);
- if (need_zonelists_rebuild)
- build_all_zonelists(NULL);
- else
- zone_pcp_update(zone);
- }
+ node_states_set_node(nid, &arg);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(NULL);
+ else
+ zone_pcp_update(zone);
init_per_zone_wmark_min();
- if (onlined_pages) {
- kswapd_run(nid);
- kcompactd_run(nid);
- }
+ kswapd_run(nid);
+ kcompactd_run(nid);
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
- if (onlined_pages)
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &arg);
mem_hotplug_done();
return 0;
@@ -933,8 +918,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
if (!pgdat)
return NULL;
+ pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
arch_refresh_nodedata(nid, pgdat);
} else {
+ int cpu;
/*
* Reset the nr_zones, order and classzone_idx before reuse.
* Note that kswapd will init kswapd_classzone_idx properly
@@ -943,6 +931,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = 0;
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
}
/* we can use NODE_DATA(nid) from here */
@@ -952,7 +946,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_core_hotplug(nid);
- pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
/*
* The node we allocated has no zone fallback lists. For avoiding
@@ -1309,7 +1302,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
head = compound_head(page);
if (page_huge_active(head))
return pfn;
- skip = (1 << compound_order(head)) - (page - head);
+ skip = compound_nr(head) - (page - head);
pfn += skip - 1;
}
return 0;
@@ -1347,7 +1340,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
struct page *head = compound_head(page);
- pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
+ pfn = page_to_pfn(head) + compound_nr(head) - 1;
isolate_huge_page(head, &source);
continue;
} else if (PageTransHuge(page))
@@ -1662,7 +1655,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
phys_addr_t beginpa, endpa;
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
- endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
+ endpa = beginpa + memory_block_size_bytes() - 1;
pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f000771558d8..464406e8da91 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1512,10 +1512,6 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
if (nodes_empty(*new))
goto out_put;
- nodes_and(*new, *new, node_states[N_MEMORY]);
- if (nodes_empty(*new))
- goto out_put;
-
err = security_task_movememory(task);
if (err)
goto out_put;
diff --git a/mm/memremap.c b/mm/memremap.c
index 32c79b51af86..f6c17339cd0d 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -75,6 +75,26 @@ static unsigned long pfn_next(unsigned long pfn)
return pfn + 1;
}
+/*
+ * This returns true if the page is reserved by ZONE_DEVICE driver.
+ */
+bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ struct dev_pagemap *pgmap;
+ struct vmem_altmap *altmap;
+ bool ret = false;
+
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ret;
+ altmap = pgmap_altmap(pgmap);
+ if (altmap && pfn < (altmap->base_pfn + altmap->reserve))
+ ret = true;
+ put_dev_pagemap(pgmap);
+
+ return ret;
+}
+
#define for_each_device_pfn(pfn, map) \
for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
diff --git a/mm/migrate.c b/mm/migrate.c
index 9f4ed4e985c1..73d476d690b1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -460,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
for (i = 1; i < HPAGE_PMD_NR; i++) {
xas_next(&xas);
- xas_store(&xas, newpage + i);
+ xas_store(&xas, newpage);
}
}
@@ -1892,7 +1892,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
return 0;
if (isolate_lru_page(page))
@@ -2218,17 +2218,15 @@ again:
pte_t pte;
pte = *ptep;
- pfn = pte_pfn(pte);
if (pte_none(pte)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
- pfn = 0;
goto next;
}
if (!pte_present(pte)) {
- mpfn = pfn = 0;
+ mpfn = 0;
/*
* Only care about unaddressable device page special
@@ -2245,10 +2243,10 @@ again:
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
+ pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
- pfn = 0;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
@@ -2258,10 +2256,9 @@ again:
/* FIXME support THP */
if (!page || !page->mapping || PageTransCompound(page)) {
- mpfn = pfn = 0;
+ mpfn = 0;
goto next;
}
- pfn = page_to_pfn(page);
/*
* By getting a reference on the page we pin it and that blocks
diff --git a/mm/mmap.c b/mm/mmap.c
index 6bc21fca20bc..1bd66e38485f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -289,9 +289,9 @@ out:
return retval;
}
-static long vma_compute_subtree_gap(struct vm_area_struct *vma)
+static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
{
- unsigned long max, prev_end, subtree_gap;
+ unsigned long gap, prev_end;
/*
* Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
@@ -299,14 +299,21 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
* an unmapped area; whereas when expanding we only require one.
* That's a little inconsistent, but keeps the code here simpler.
*/
- max = vm_start_gap(vma);
+ gap = vm_start_gap(vma);
if (vma->vm_prev) {
prev_end = vm_end_gap(vma->vm_prev);
- if (max > prev_end)
- max -= prev_end;
+ if (gap > prev_end)
+ gap -= prev_end;
else
- max = 0;
+ gap = 0;
}
+ return gap;
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
+{
+ unsigned long max = vma_compute_gap(vma), subtree_gap;
if (vma->vm_rb.rb_left) {
subtree_gap = rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -322,7 +329,6 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
return max;
}
-#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct mm_struct *mm)
{
struct rb_root *root = &mm->mm_rb;
@@ -428,8 +434,9 @@ static void validate_mm(struct mm_struct *mm)
#define validate_mm(mm) do { } while (0)
#endif
-RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
+RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
+ struct vm_area_struct, vm_rb,
+ unsigned long, rb_subtree_gap, vma_compute_gap)
/*
* Update augmented rbtree rb_subtree_gap values after vma->vm_start or
@@ -439,8 +446,8 @@ RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
static void vma_gap_update(struct vm_area_struct *vma)
{
/*
- * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
- * function that does exactly what we want.
+ * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
+ * a callback function that does exactly what we want.
*/
vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}
@@ -1358,6 +1365,9 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
if (S_ISBLK(inode->i_mode))
return MAX_LFS_FILESIZE;
+ if (S_ISSOCK(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
/* Special "we do even unsigned file positions" case */
if (file->f_mode & FMODE_UNSIGNED_OFFSET)
return 0;
@@ -1870,6 +1880,22 @@ unacct_error:
return error;
}
+static inline unsigned long gap_start_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_start offset to adjust gap address to the
+ * desired alignment
+ */
+ return (info->align_offset - addr) & info->align_mask;
+}
+
+static inline unsigned long gap_end_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_end offset to adjust gap address to the desired alignment */
+ return (addr - info->align_offset) & info->align_mask;
+}
+
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
@@ -1884,10 +1910,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/* Adjust search limits by the desired length */
if (info->high_limit < length)
@@ -1919,6 +1942,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
}
gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
+ gap_start += gap_start_offset(info, gap_start);
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
@@ -1947,6 +1971,7 @@ check_current:
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vm_end_gap(vma->vm_prev);
+ gap_start += gap_start_offset(info, gap_start);
gap_end = vm_start_gap(vma);
goto check_current;
}
@@ -1956,17 +1981,17 @@ check_current:
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
+ gap_start += gap_start_offset(info, gap_start);
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;
found:
/* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
+ if (gap_start < info->low_limit) {
gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
+ gap_start += gap_start_offset(info, gap_start);
+ }
VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
@@ -1979,16 +2004,14 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/*
* Adjust search limits by the desired length.
* See implementation comment at top of unmapped_area().
*/
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < length)
return -ENOMEM;
high_limit = gap_end - length;
@@ -2025,6 +2048,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
check_current:
/* Check if current node has a suitable gap */
gap_end = vm_start_gap(vma);
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < low_limit)
return -ENOMEM;
if (gap_start <= high_limit &&
@@ -2059,13 +2083,14 @@ check_current:
found:
/* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
+ if (gap_end > info->high_limit) {
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
+ }
found_highest:
/* Compute highest gap address at the desired alignment */
gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
VM_BUG_ON(gap_end < info->low_limit);
VM_BUG_ON(gap_end < gap_start);
@@ -2274,12 +2299,9 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
if (vma) {
*pprev = vma->vm_prev;
} else {
- struct rb_node *rb_node = mm->mm_rb.rb_node;
- *pprev = NULL;
- while (rb_node) {
- *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
- rb_node = rb_node->rb_right;
- }
+ struct rb_node *rb_node = rb_last(&mm->mm_rb);
+
+ *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
}
return vma;
}
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 8c943a6e1696..7d70e5c78f97 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -271,8 +271,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
tlb_flush_mmu(tlb);
- /* keep the page table cache within bounds */
- check_pgt_cache();
#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
tlb_batch_list_free(tlb);
#endif
diff --git a/mm/nommu.c b/mm/nommu.c
index fed1b6e9c89b..99b7ec318824 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -108,7 +108,7 @@ unsigned int kobjsize(const void *objp)
* The ksize() function is only guaranteed to work for pointers
* returned by kmalloc(). So handle arbitrary pointers here.
*/
- return PAGE_SIZE << compound_order(page);
+ return page_size(page);
}
/**
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eda2e2a0bdc6..93eae768a475 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -377,36 +377,13 @@ static void select_bad_process(struct oom_control *oc)
}
}
-static int dump_task(struct task_struct *p, void *arg)
-{
- struct oom_control *oc = arg;
- struct task_struct *task;
-
- if (oom_unkillable_task(p))
- return 0;
-
- /* p may not have freeable memory in nodemask */
- if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
- return 0;
- task = find_lock_task_mm(p);
- if (!task) {
- /*
- * This is a kthread or all of p's threads have already
- * detached their mm's. There's no need to report
- * them; they can't be oom killed anyway.
- */
- return 0;
+static int add_candidate_task(struct task_struct *p, void *arg)
+{
+ if (!oom_unkillable_task(p)) {
+ get_task_struct(p);
+ list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
}
-
- pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
- task->pid, from_kuid(&init_user_ns, task_uid(task)),
- task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
- mm_pgtables_bytes(task->mm),
- get_mm_counter(task->mm, MM_SWAPENTS),
- task->signal->oom_score_adj, task->comm);
- task_unlock(task);
-
return 0;
}
@@ -422,19 +399,41 @@ static int dump_task(struct task_struct *p, void *arg)
*/
static void dump_tasks(struct oom_control *oc)
{
- pr_info("Tasks state (memory values in pages):\n");
- pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
+ LIST_HEAD(list);
+ struct task_struct *p;
+ struct task_struct *t;
if (is_memcg_oom(oc))
- mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
+ mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
else {
- struct task_struct *p;
-
rcu_read_lock();
for_each_process(p)
- dump_task(p, oc);
+ add_candidate_task(p, &list);
rcu_read_unlock();
}
+ pr_info("Tasks state (memory values in pages):\n");
+ pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
+ list_for_each_entry(p, &list, oom_victim_list) {
+ cond_resched();
+ /* p may not have freeable memory in nodemask */
+ if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+ continue;
+ /* All of p's threads might have already detached their mm's. */
+ t = find_lock_task_mm(p);
+ if (!t)
+ continue;
+ pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ t->pid, from_kuid(&init_user_ns, task_uid(t)),
+ t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
+ mm_pgtables_bytes(t->mm),
+ get_mm_counter(t->mm, MM_SWAPENTS),
+ t->signal->oom_score_adj, t->comm);
+ task_unlock(t);
+ }
+ list_for_each_entry_safe(p, t, &list, oom_victim_list) {
+ list_del(&p->oom_victim_list);
+ put_task_struct(p);
+ }
}
static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
@@ -523,7 +522,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
set_bit(MMF_UNSTABLE, &mm->flags);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
continue;
/*
@@ -884,12 +883,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
*/
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
- pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
- message, task_pid_nr(victim), victim->comm,
- K(victim->mm->total_vm),
- K(get_mm_counter(victim->mm, MM_ANONPAGES)),
- K(get_mm_counter(victim->mm, MM_FILEPAGES)),
- K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
+ pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
+ message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)),
+ from_kuid(&init_user_ns, task_uid(victim)),
+ mm_pgtables_bytes(mm), victim->signal->oom_score_adj);
task_unlock(victim);
/*
@@ -1068,9 +1068,10 @@ bool out_of_memory(struct oom_control *oc)
* The OOM killer does not compensate for IO-less reclaim.
* pagefault_out_of_memory lost its gfp context so we have to
* make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here.
+ * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
+ * invoke the OOM killer even if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
+ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4371f71ef3f2..c5d62f1c2851 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -670,6 +670,7 @@ out:
void free_compound_page(struct page *page)
{
+ mem_cgroup_uncharge(page);
__free_pages_ok(page, compound_order(page));
}
@@ -3955,14 +3956,22 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
goto check_priority;
/*
+ * compaction was skipped because there are not enough order-0 pages
+ * to work with, so we retry only if it looks like reclaim can help.
+ */
+ if (compaction_needs_reclaim(compact_result)) {
+ ret = compaction_zonelist_suitable(ac, order, alloc_flags);
+ goto out;
+ }
+
+ /*
* make sure the compaction wasn't deferred or didn't bail out early
* due to locks contention before we declare that we should give up.
- * But do not retry if the given zonelist is not suitable for
- * compaction.
+ * But the next retry should use a higher priority if allowed, so
+ * we don't just keep bailing out endlessly.
*/
if (compaction_withdrawn(compact_result)) {
- ret = compaction_zonelist_suitable(ac, order, alloc_flags);
- goto out;
+ goto check_priority;
}
/*
@@ -6638,9 +6647,11 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
- spin_lock_init(&pgdat->split_queue_lock);
- INIT_LIST_HEAD(&pgdat->split_queue);
- pgdat->split_queue_len = 0;
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+ spin_lock_init(&ds_queue->split_queue_lock);
+ INIT_LIST_HEAD(&ds_queue->split_queue);
+ ds_queue->split_queue_len = 0;
}
#else
static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
@@ -8196,7 +8207,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if (!hugepage_migration_supported(page_hstate(head)))
goto unmovable;
- skip_pages = (1 << compound_order(head)) - (page - head);
+ skip_pages = compound_nr(head) - (page - head);
iter += skip_pages - 1;
continue;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index addcbb2ae4e4..dee931184788 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -24,6 +24,9 @@ struct page_owner {
short last_migrate_reason;
gfp_t gfp_mask;
depot_stack_handle_t handle;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ depot_stack_handle_t free_handle;
+#endif
};
static bool page_owner_disabled = true;
@@ -102,19 +105,6 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
return (void *)page_ext + page_owner_ops.offset;
}
-void __reset_page_owner(struct page *page, unsigned int order)
-{
- int i;
- struct page_ext *page_ext;
-
- for (i = 0; i < (1 << order); i++) {
- page_ext = lookup_page_ext(page + i);
- if (unlikely(!page_ext))
- continue;
- __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
- }
-}
-
static inline bool check_recursive_alloc(unsigned long *entries,
unsigned int nr_entries,
unsigned long ip)
@@ -154,18 +144,50 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
return handle;
}
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
- depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
+void __reset_page_owner(struct page *page, unsigned int order)
{
+ int i;
+ struct page_ext *page_ext;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ depot_stack_handle_t handle = 0;
struct page_owner *page_owner;
- page_owner = get_page_owner(page_ext);
- page_owner->handle = handle;
- page_owner->order = order;
- page_owner->gfp_mask = gfp_mask;
- page_owner->last_migrate_reason = -1;
+ if (debug_pagealloc_enabled())
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+#endif
- __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ for (i = 0; i < (1 << order); i++) {
+ page_ext = lookup_page_ext(page + i);
+ if (unlikely(!page_ext))
+ continue;
+ __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (debug_pagealloc_enabled()) {
+ page_owner = get_page_owner(page_ext);
+ page_owner->free_handle = handle;
+ }
+#endif
+ }
+}
+
+static inline void __set_page_owner_handle(struct page *page,
+ struct page_ext *page_ext, depot_stack_handle_t handle,
+ unsigned int order, gfp_t gfp_mask)
+{
+ struct page_owner *page_owner;
+ int i;
+
+ for (i = 0; i < (1 << order); i++) {
+ page_owner = get_page_owner(page_ext);
+ page_owner->handle = handle;
+ page_owner->order = order;
+ page_owner->gfp_mask = gfp_mask;
+ page_owner->last_migrate_reason = -1;
+ __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);
+
+ page_ext = lookup_page_ext(page + i);
+ }
}
noinline void __set_page_owner(struct page *page, unsigned int order,
@@ -178,7 +200,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
return;
handle = save_stack(gfp_mask);
- __set_page_owner_handle(page_ext, handle, order, gfp_mask);
+ __set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -204,8 +226,11 @@ void __split_page_owner(struct page *page, unsigned int order)
page_owner = get_page_owner(page_ext);
page_owner->order = 0;
- for (i = 1; i < (1 << order); i++)
- __copy_page_owner(page, page + i);
+ for (i = 1; i < (1 << order); i++) {
+ page_ext = lookup_page_ext(page + i);
+ page_owner = get_page_owner(page_ext);
+ page_owner->order = 0;
+ }
}
void __copy_page_owner(struct page *oldpage, struct page *newpage)
@@ -235,6 +260,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
* the new page, which will be freed.
*/
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags);
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -294,7 +320,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (unlikely(!page_ext))
continue;
- if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
continue;
page_owner = get_page_owner(page_ext);
@@ -405,20 +431,36 @@ void __dump_page_owner(struct page *page)
mt = gfpflags_to_migratetype(gfp_mask);
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
- pr_alert("page_owner info is not active (free page?)\n");
+ pr_alert("page_owner info is not present (never set?)\n");
return;
}
+ if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
+ pr_alert("page_owner tracks the page as allocated\n");
+ else
+ pr_alert("page_owner tracks the page as freed\n");
+
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
+ page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+
handle = READ_ONCE(page_owner->handle);
if (!handle) {
- pr_alert("page_owner info is not active (free page?)\n");
- return;
+ pr_alert("page_owner allocation stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ stack_trace_print(entries, nr_entries, 0);
}
- nr_entries = stack_depot_fetch(handle, &entries);
- pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
- page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
- stack_trace_print(entries, nr_entries, 0);
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ handle = READ_ONCE(page_owner->free_handle);
+ if (!handle) {
+ pr_alert("page_owner free stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ pr_alert("page last free stack trace:\n");
+ stack_trace_print(entries, nr_entries, 0);
+ }
+#endif
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
@@ -481,9 +523,23 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;
+ /*
+ * Although we do have the info about past allocation of free
+ * pages, it's not relevant for current memory usage.
+ */
+ if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
+ continue;
+
page_owner = get_page_owner(page_ext);
/*
+ * Don't print "tail" pages of high-order allocations as that
+ * would inflate the stats.
+ */
+ if (!IS_ALIGNED(pfn, 1 << page_owner->order))
+ continue;
+
+ /*
* Access to page_ext->handle isn't synchronous so we should
* be careful to access it.
*/
@@ -562,7 +618,8 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
continue;
/* Found early allocated page */
- __set_page_owner_handle(page_ext, early_handle, 0, 0);
+ __set_page_owner_handle(page, page_ext, early_handle,
+ 0, 0);
count++;
}
cond_resched();
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 21d4f97cb49b..34b9181ee5d1 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -101,7 +101,7 @@ static void unpoison_page(struct page *page)
/*
* Page poisoning when enabled poisons each and every page
* that is freed to buddy. Thus no extra check is done to
- * see if a page was posioned.
+ * see if a page was poisoned.
*/
check_poison_mem(addr, PAGE_SIZE);
kunmap_atomic(addr);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 11df03e71288..eff4b4520c8d 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -153,8 +153,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address,
- PAGE_SIZE << compound_order(page));
+ pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
if (!pvmw->pte)
return false;
diff --git a/mm/quicklist.c b/mm/quicklist.c
deleted file mode 100644
index 5e98ac78e410..000000000000
--- a/mm/quicklist.c
+++ /dev/null
@@ -1,103 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Quicklist support.
- *
- * Quicklists are light weight lists of pages that have a defined state
- * on alloc and free. Pages must be in the quicklist specific defined state
- * (zero by default) when the page is freed. It seems that the initial idea
- * for such lists first came from Dave Miller and then various other people
- * improved on it.
- *
- * Copyright (C) 2007 SGI,
- * Christoph Lameter <cl@linux.com>
- * Generalized, added support for multiple lists and
- * constructors / destructors.
- */
-#include <linux/kernel.h>
-
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/quicklist.h>
-
-DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
-
-#define FRACTION_OF_NODE_MEM 16
-
-static unsigned long max_pages(unsigned long min_pages)
-{
- unsigned long node_free_pages, max;
- int node = numa_node_id();
- struct zone *zones = NODE_DATA(node)->node_zones;
- int num_cpus_on_node;
-
- node_free_pages =
-#ifdef CONFIG_ZONE_DMA
- zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
- zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) +
-#endif
- zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
-
- max = node_free_pages / FRACTION_OF_NODE_MEM;
-
- num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
- max /= num_cpus_on_node;
-
- return max(max, min_pages);
-}
-
-static long min_pages_to_free(struct quicklist *q,
- unsigned long min_pages, long max_free)
-{
- long pages_to_free;
-
- pages_to_free = q->nr_pages - max_pages(min_pages);
-
- return min(pages_to_free, max_free);
-}
-
-/*
- * Trim down the number of pages in the quicklist
- */
-void quicklist_trim(int nr, void (*dtor)(void *),
- unsigned long min_pages, unsigned long max_free)
-{
- long pages_to_free;
- struct quicklist *q;
-
- q = &get_cpu_var(quicklist)[nr];
- if (q->nr_pages > min_pages) {
- pages_to_free = min_pages_to_free(q, min_pages, max_free);
-
- while (pages_to_free > 0) {
- /*
- * We pass a gfp_t of 0 to quicklist_alloc here
- * because we will never call into the page allocator.
- */
- void *p = quicklist_alloc(nr, 0, NULL);
-
- if (dtor)
- dtor(p);
- free_page((unsigned long)p);
- pages_to_free--;
- }
- }
- put_cpu_var(quicklist);
-}
-
-unsigned long quicklist_total_size(void)
-{
- unsigned long count = 0;
- int cpu;
- struct quicklist *ql, *q;
-
- for_each_online_cpu(cpu) {
- ql = per_cpu(quicklist, cpu);
- for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
- count += q->nr_pages;
- }
- return count;
-}
-
diff --git a/mm/rmap.c b/mm/rmap.c
index 003377e24232..c1c4108ef2d3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -898,15 +898,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- min(vma->vm_end, address +
- (PAGE_SIZE << compound_order(page))));
+ min(vma->vm_end, address + page_size(page)));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
- unsigned long cstart;
int ret = 0;
- cstart = address = pvmw.address;
+ address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
@@ -933,7 +931,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
- cstart &= PMD_MASK;
ret = 1;
#else
/* unexpected pmd-mapped page? */
@@ -1192,8 +1189,10 @@ void page_add_file_rmap(struct page *page, bool compound)
}
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ if (PageSwapBacked(page))
+ __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __inc_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1232,8 +1231,10 @@ static void page_remove_file_rmap(struct page *page, bool compound)
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
goto out;
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ if (PageSwapBacked(page))
+ __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __dec_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
@@ -1286,7 +1287,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (nr) {
__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
- deferred_split_huge_page(page);
+ deferred_split_huge_page(page, nr);
}
}
@@ -1320,7 +1321,7 @@ void page_remove_rmap(struct page *page, bool compound)
clear_page_mlock(page);
if (PageTransCompound(page))
- deferred_split_huge_page(compound_head(page));
+ deferred_split_huge_page(compound_head(page), 1);
/*
* It would be tidy to reset the PageAnon mapping here,
@@ -1374,8 +1375,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address,
- min(vma->vm_end, address +
- (PAGE_SIZE << compound_order(page))));
+ min(vma->vm_end, address + page_size(page)));
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
@@ -1524,8 +1524,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
- int nr = 1 << compound_order(page);
- hugetlb_count_sub(nr, mm);
+ hugetlb_count_sub(compound_nr(page), mm);
set_huge_swap_pte_at(mm, address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
diff --git a/mm/shmem.c b/mm/shmem.c
index 575e1590faab..77d2df011c0e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -607,7 +607,7 @@ static int shmem_add_to_page_cache(struct page *page,
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
unsigned long i = 0;
- unsigned long nr = 1UL << compound_order(page);
+ unsigned long nr = compound_nr(page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -629,7 +629,7 @@ static int shmem_add_to_page_cache(struct page *page,
if (xas_error(&xas))
goto unlock;
next:
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
if (++i < nr) {
xas_next(&xas);
goto next;
@@ -1732,7 +1732,7 @@ unlock:
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * fault_mm and fault_type are only supplied by shmem_fault:
+ * vmf and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1882,7 +1882,7 @@ alloc_nohuge:
lru_cache_add_anon(page);
spin_lock_irq(&info->lock);
- info->alloced += 1 << compound_order(page);
+ info->alloced += compound_nr(page);
inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
@@ -1923,7 +1923,7 @@ clear:
struct page *head = compound_head(page);
int i;
- for (i = 0; i < (1 << compound_order(head)); i++) {
+ for (i = 0; i < compound_nr(head); i++) {
clear_highpage(head + i);
flush_dcache_page(head + i);
}
@@ -1950,7 +1950,7 @@ clear:
* Error recovery.
*/
unacct:
- shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
+ shmem_inode_unacct_blocks(inode, compound_nr(page));
if (PageTransHuge(page)) {
unlock_page(page);
diff --git a/mm/slab.h b/mm/slab.h
index 9057b8056b07..68e455f2b698 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -30,6 +30,69 @@ struct kmem_cache {
struct list_head list; /* List of all slab caches on the system */
};
+#else /* !CONFIG_SLOB */
+
+struct memcg_cache_array {
+ struct rcu_head rcu;
+ struct kmem_cache *entries[0];
+};
+
+/*
+ * This is the main placeholder for memcg-related information in kmem caches.
+ * Both the root cache and the child caches will have it. For the root cache,
+ * this will hold a dynamically allocated array large enough to hold
+ * information about the currently limited memcgs in the system. To allow the
+ * array to be accessed without taking any locks, on relocation we free the old
+ * version only after a grace period.
+ *
+ * Root and child caches hold different metadata.
+ *
+ * @root_cache: Common to root and child caches. NULL for root, pointer to
+ * the root cache for children.
+ *
+ * The following fields are specific to root caches.
+ *
+ * @memcg_caches: kmemcg ID indexed table of child caches. This table is
+ * used to index child cachces during allocation and cleared
+ * early during shutdown.
+ *
+ * @root_caches_node: List node for slab_root_caches list.
+ *
+ * @children: List of all child caches. While the child caches are also
+ * reachable through @memcg_caches, a child cache remains on
+ * this list until it is actually destroyed.
+ *
+ * The following fields are specific to child caches.
+ *
+ * @memcg: Pointer to the memcg this cache belongs to.
+ *
+ * @children_node: List node for @root_cache->children list.
+ *
+ * @kmem_caches_node: List node for @memcg->kmem_caches list.
+ */
+struct memcg_cache_params {
+ struct kmem_cache *root_cache;
+ union {
+ struct {
+ struct memcg_cache_array __rcu *memcg_caches;
+ struct list_head __root_caches_node;
+ struct list_head children;
+ bool dying;
+ };
+ struct {
+ struct mem_cgroup *memcg;
+ struct list_head children_node;
+ struct list_head kmem_caches_node;
+ struct percpu_ref refcnt;
+
+ void (*work_fn)(struct kmem_cache *);
+ union {
+ struct rcu_head rcu_head;
+ struct work_struct work;
+ };
+ };
+ };
+};
#endif /* CONFIG_SLOB */
#ifdef CONFIG_SLAB
@@ -174,6 +237,7 @@ int __kmem_cache_shrink(struct kmem_cache *);
void __kmemcg_cache_deactivate(struct kmem_cache *s);
void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
+void kmem_cache_shrink_all(struct kmem_cache *s);
struct seq_file;
struct file;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 807490fe217a..c29f03adca91 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -981,6 +981,43 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_shrink);
+/**
+ * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
+ * @s: The cache pointer
+ */
+void kmem_cache_shrink_all(struct kmem_cache *s)
+{
+ struct kmem_cache *c;
+
+ if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
+ kmem_cache_shrink(s);
+ return;
+ }
+
+ get_online_cpus();
+ get_online_mems();
+ kasan_cache_shrink(s);
+ __kmem_cache_shrink(s);
+
+ /*
+ * We have to take the slab_mutex to protect from the memcg list
+ * modification.
+ */
+ mutex_lock(&slab_mutex);
+ for_each_memcg_cache(c, s) {
+ /*
+ * Don't need to shrink deactivated memcg caches.
+ */
+ if (s->flags & SLAB_DEACTIVATED)
+ continue;
+ kasan_cache_shrink(c);
+ __kmem_cache_shrink(c);
+ }
+ mutex_unlock(&slab_mutex);
+ put_online_mems();
+ put_online_cpus();
+}
+
bool slab_is_available(void)
{
return slab_state >= UP;
@@ -993,10 +1030,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
unsigned int useroffset, unsigned int usersize)
{
int err;
+ unsigned int align = ARCH_KMALLOC_MINALIGN;
s->name = name;
s->size = s->object_size = size;
- s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+ /*
+ * For power of two sizes, guarantee natural alignment for kmalloc
+ * caches, regardless of SL*B debugging options.
+ */
+ if (is_power_of_2(size))
+ align = max(align, size);
+ s->align = calculate_alignment(flags, align, size);
+
s->useroffset = useroffset;
s->usersize = usersize;
@@ -1250,12 +1296,16 @@ void __init create_kmalloc_caches(slab_flags_t flags)
*/
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
- void *ret;
+ void *ret = NULL;
struct page *page;
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
- ret = page ? page_address(page) : NULL;
+ if (likely(page)) {
+ ret = page_address(page);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ }
ret = kasan_kmalloc_large(ret, size, flags);
/* As ret might get tagged, call kmemleak hook after KASAN. */
kmemleak_alloc(ret, size, 1, flags);
diff --git a/mm/slob.c b/mm/slob.c
index 7f421d0ca9ab..fa53e9f73893 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -190,7 +190,7 @@ static int slob_last(slob_t *s)
static void *slob_new_pages(gfp_t gfp, int order, int node)
{
- void *page;
+ struct page *page;
#ifdef CONFIG_NUMA
if (node != NUMA_NO_NODE)
@@ -202,14 +202,21 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
if (!page)
return NULL;
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
return page_address(page);
}
static void slob_free_pages(void *b, int order)
{
+ struct page *sp = virt_to_page(b);
+
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- free_pages((unsigned long)b, order);
+
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(sp, order);
}
/*
@@ -217,6 +224,7 @@ static void slob_free_pages(void *b, int order)
* @sp: Page to look in.
* @size: Size of the allocation.
* @align: Allocation alignment.
+ * @align_offset: Offset in the allocated block that will be aligned.
* @page_removed_from_list: Return parameter.
*
* Tries to find a chunk of memory at least @size bytes big within @page.
@@ -227,7 +235,7 @@ static void slob_free_pages(void *b, int order)
* true (set to false otherwise).
*/
static void *slob_page_alloc(struct page *sp, size_t size, int align,
- bool *page_removed_from_list)
+ int align_offset, bool *page_removed_from_list)
{
slob_t *prev, *cur, *aligned = NULL;
int delta = 0, units = SLOB_UNITS(size);
@@ -236,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
slobidx_t avail = slob_units(cur);
+ /*
+ * 'aligned' will hold the address of the slob block so that the
+ * address 'aligned'+'align_offset' is aligned according to the
+ * 'align' parameter. This is for kmalloc() which prepends the
+ * allocated block with its size, so that the block itself is
+ * aligned when needed.
+ */
if (align) {
- aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+ aligned = (slob_t *)
+ (ALIGN((unsigned long)cur + align_offset, align)
+ - align_offset);
delta = aligned - cur;
}
if (avail >= units + delta) { /* room enough? */
@@ -281,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
/*
* slob_alloc: entry point into the slob allocator.
*/
-static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
+ int align_offset)
{
struct page *sp;
struct list_head *slob_list;
@@ -312,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
if (sp->units < SLOB_UNITS(size))
continue;
- b = slob_page_alloc(sp, size, align, &page_removed_from_list);
+ b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
if (!b)
continue;
@@ -349,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
INIT_LIST_HEAD(&sp->slab_list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
- b = slob_page_alloc(sp, size, align, &_unused);
+ b = slob_page_alloc(sp, size, align, align_offset, &_unused);
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
}
@@ -451,7 +469,7 @@ static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
unsigned int *m;
- int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
void *ret;
gfp &= gfp_allowed_mask;
@@ -459,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
fs_reclaim_acquire(gfp);
fs_reclaim_release(gfp);
- if (size < PAGE_SIZE - align) {
+ if (size < PAGE_SIZE - minalign) {
+ int align = minalign;
+
+ /*
+ * For power of two sizes, guarantee natural alignment for
+ * kmalloc()'d objects.
+ */
+ if (is_power_of_2(size))
+ align = max(minalign, (int) size);
+
if (!size)
return ZERO_SIZE_PTR;
- m = slob_alloc(size + align, gfp, align, node);
+ m = slob_alloc(size + minalign, gfp, align, node, minalign);
if (!m)
return NULL;
*m = size;
- ret = (void *)m + align;
+ ret = (void *)m + minalign;
trace_kmalloc_node(caller, ret,
- size, size + align, gfp, node);
+ size, size + minalign, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -521,8 +548,13 @@ void kfree(const void *block)
int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
- } else
- __free_pages(sp, compound_order(sp));
+ } else {
+ unsigned int order = compound_order(sp);
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(sp, order);
+
+ }
}
EXPORT_SYMBOL(kfree);
@@ -539,7 +571,7 @@ size_t __ksize(const void *block)
sp = virt_to_page(block);
if (unlikely(!PageSlab(sp)))
- return PAGE_SIZE << compound_order(sp);
+ return page_size(sp);
align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
m = (unsigned int *)(block - align);
@@ -567,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
fs_reclaim_release(flags);
if (c->size < PAGE_SIZE) {
- b = slob_alloc(c->size, flags, c->align, node);
+ b = slob_alloc(c->size, flags, c->align, node, 0);
trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
diff --git a/mm/slub.c b/mm/slub.c
index 8834563cdb4b..c9856a9807f1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -829,7 +829,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
return 1;
start = page_address(page);
- length = PAGE_SIZE << compound_order(page);
+ length = page_size(page);
end = start + length;
remainder = length % s->size;
if (!remainder)
@@ -1074,13 +1074,14 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
init_tracking(s, object);
}
-static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
+static
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
{
if (!(s->flags & SLAB_POISON))
return;
metadata_access_enable();
- memset(addr, POISON_INUSE, PAGE_SIZE << order);
+ memset(addr, POISON_INUSE, page_size(page));
metadata_access_disable();
}
@@ -1340,8 +1341,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
-static inline void setup_page_debug(struct kmem_cache *s,
- void *addr, int order) {}
+static inline
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1639,7 +1640,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
void *start, *p, *next;
- int idx, order;
+ int idx;
bool shuffle;
flags &= gfp_allowed_mask;
@@ -1673,7 +1674,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page->objects = oo_objects(oo);
- order = compound_order(page);
page->slab_cache = s;
__SetPageSlab(page);
if (page_is_pfmemalloc(page))
@@ -1683,7 +1683,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
start = page_address(page);
- setup_page_debug(s, start, order);
+ setup_page_debug(s, page, start);
shuffle = shuffle_freelist(s, page);
@@ -3819,11 +3819,15 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
struct page *page;
void *ptr = NULL;
+ unsigned int order = get_order(size);
flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, get_order(size));
- if (page)
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
ptr = page_address(page);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ }
return kmalloc_large_node_hook(ptr, size, flags);
}
@@ -3930,7 +3934,7 @@ size_t __ksize(const void *object)
if (unlikely(!PageSlab(page))) {
WARN_ON(!PageCompound(page));
- return PAGE_SIZE << compound_order(page);
+ return page_size(page);
}
return slab_ksize(page->slab_cache);
@@ -3949,9 +3953,13 @@ void kfree(const void *x)
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
+ unsigned int order = compound_order(page);
+
BUG_ON(!PageCompound(page));
kfree_hook(object);
- __free_pages(page, compound_order(page));
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(page, order);
return;
}
slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
@@ -5298,7 +5306,7 @@ static ssize_t shrink_store(struct kmem_cache *s,
const char *buf, size_t length)
{
if (buf[0] == '1')
- kmem_cache_shrink(s);
+ kmem_cache_shrink_all(s);
else
return -EINVAL;
return length;
diff --git a/mm/sparse.c b/mm/sparse.c
index 72f010d9bff5..a1679024ab5c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -470,6 +470,12 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;
+static inline void __meminit sparse_buffer_free(unsigned long size)
+{
+ WARN_ON(!sparsemap_buf || size == 0);
+ memblock_free_early(__pa(sparsemap_buf), size);
+}
+
static void __init sparse_buffer_init(unsigned long size, int nid)
{
phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
@@ -486,7 +492,7 @@ static void __init sparse_buffer_fini(void)
unsigned long size = sparsemap_buf_end - sparsemap_buf;
if (sparsemap_buf && size > 0)
- memblock_free_early(__pa(sparsemap_buf), size);
+ sparse_buffer_free(size);
sparsemap_buf = NULL;
}
@@ -495,11 +501,15 @@ void * __meminit sparse_buffer_alloc(unsigned long size)
void *ptr = NULL;
if (sparsemap_buf) {
- ptr = PTR_ALIGN(sparsemap_buf, size);
+ ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
if (ptr + size > sparsemap_buf_end)
ptr = NULL;
- else
+ else {
+ /* Free redundant aligned space */
+ if ((unsigned long)(ptr - sparsemap_buf) > 0)
+ sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
sparsemap_buf = ptr + size;
+ }
}
return ptr;
}
@@ -867,7 +877,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
*/
page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
- ms = __pfn_to_section(start_pfn);
+ ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
section_mark_present(ms);
diff --git a/mm/swap.c b/mm/swap.c
index ae300397dfda..38c3fa4308e2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
@@ -71,12 +72,12 @@ static void __page_cache_release(struct page *page)
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
}
__ClearPageWaiters(page);
- mem_cgroup_uncharge(page);
}
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
+ mem_cgroup_uncharge(page);
free_unref_page(page);
}
@@ -515,7 +516,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
del_page_from_lru_list(page, lruvec, lru + active);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec, lru);
if (PageWriteback(page) || PageDirty(page)) {
/*
@@ -523,13 +523,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
* It can make readahead confusing. But race window
* is _really_ small and it's non-critical problem.
*/
+ add_page_to_lru_list(page, lruvec, lru);
SetPageReclaim(page);
} else {
/*
* The page's writeback ends up during pagevec
* We moves tha page into tail of inactive.
*/
- list_move_tail(&page->lru, &lruvec->lists[lru]);
+ add_page_to_lru_list_tail(page, lruvec, lru);
__count_vm_event(PGROTATED);
}
@@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
update_page_reclaim_stat(lruvec, file, 0);
}
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
void *arg)
@@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page)
}
}
+/*
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ get_page(page);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
/**
* mark_page_lazyfree - make an anon page lazyfree
* @page: page to deactivate
@@ -687,6 +728,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
@@ -844,17 +886,15 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
get_page(page_tail);
list_add_tail(&page_tail->lru, list);
} else {
- struct list_head *list_head;
/*
* Head page has not yet been counted, as an hpage,
* so we must account for each subpage individually.
*
- * Use the standard add function to put page_tail on the list,
- * but then correct its position so they all end up in order.
+ * Put page_tail on the list at the correct position
+ * so they all end up in order.
*/
- add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
- list_head = page_tail->lru.prev;
- list_move_tail(&page_tail->lru, list_head);
+ add_page_to_lru_list_tail(page_tail, lruvec,
+ page_lru(page_tail));
}
if (!PageUnevictable(page))
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8368621a0fc7..8e7ce9a9bc5e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -116,7 +116,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
- unsigned long i, nr = 1UL << compound_order(page);
+ unsigned long i, nr = compound_nr(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -133,7 +133,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
for (i = 0; i < nr; i++) {
VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
set_page_private(page + i, entry.val + i);
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
xas_next(&xas);
}
address_space->nrpages += nr;
@@ -168,7 +168,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, NULL);
- VM_BUG_ON_PAGE(entry != page + i, entry);
+ VM_BUG_ON_PAGE(entry != page, entry);
set_page_private(page + i, 0);
xas_next(&xas);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dab43523afdd..80445f4f111e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1880,8 +1880,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
mem_cgroup_commit_charge(page, memcg, true, false);
@@ -1890,6 +1888,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
/*
* Move the page to the active list so it is not
diff --git a/mm/util.c b/mm/util.c
index e6351a80f248..3ad6db9a722e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -16,6 +16,13 @@
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
+#include <linux/elf.h>
+#include <linux/elf-randomize.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/processor.h>
+#include <linux/sizes.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
@@ -293,7 +300,105 @@ int vma_is_stack_for_current(struct vm_area_struct *vma)
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
-#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
+#endif
+
+unsigned long randomize_stack_top(unsigned long stack_top)
+{
+ unsigned long random_variable = 0;
+
+ if (current->flags & PF_RANDOMIZE) {
+ random_variable = get_random_long();
+ random_variable &= STACK_RND_MASK;
+ random_variable <<= PAGE_SHIFT;
+ }
+#ifdef CONFIG_STACK_GROWSUP
+ return PAGE_ALIGN(stack_top) + random_variable;
+#else
+ return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
+#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ /* Is the current task 32bit ? */
+ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
+ return randomize_page(mm->brk, SZ_32M);
+
+ return randomize_page(mm->brk, SZ_1G);
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+ unsigned long rnd;
+
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ if (is_compat_task())
+ rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+ else
+#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+
+ return rnd << PAGE_SHIFT;
+}
+
+static int mmap_is_legacy(struct rlimit *rlim_stack)
+{
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
+
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
+ return 1;
+
+ return sysctl_legacy_va_layout;
+}
+
+/*
+ * Leave enough space between the mmap area and the stack to honour ulimit in
+ * the face of randomisation.
+ */
+#define MIN_GAP (SZ_128M)
+#define MAX_GAP (STACK_TOP / 6 * 5)
+
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+{
+ unsigned long gap = rlim_stack->rlim_cur;
+ unsigned long pad = stack_guard_gap;
+
+ /* Account for stack randomization if necessary */
+ if (current->flags & PF_RANDOMIZE)
+ pad += (STACK_RND_MASK << PAGE_SHIFT);
+
+ /* Values close to RLIM_INFINITY can overflow. */
+ if (gap + pad > gap)
+ gap += pad;
+
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+
+ return PAGE_ALIGN(STACK_TOP - gap - rnd);
+}
+
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
+ if (mmap_is_legacy(rlim_stack)) {
+ mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ } else {
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ }
+}
+#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -521,7 +626,7 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
- for (i = 0; i < (1 << compound_order(page)); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
@@ -783,3 +888,16 @@ out_mm:
out:
return res;
}
+
+int memcmp_pages(struct page *page1, struct page *page2)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_atomic(page1);
+ addr2 = kmap_atomic(page2);
+ ret = memcmp(addr1, addr2, PAGE_SIZE);
+ kunmap_atomic(addr2);
+ kunmap_atomic(addr1);
+ return ret;
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7ba11e12a11f..b8101030f79e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -329,8 +329,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
-#define VM_LAZY_FREE 0x02
-#define VM_VM_AREA 0x04
static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
@@ -398,9 +396,8 @@ compute_subtree_max_size(struct vmap_area *va)
get_subtree_max_size(va->rb_node.rb_right));
}
-RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb,
- struct vmap_area, rb_node, unsigned long, subtree_max_size,
- compute_subtree_max_size)
+RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
+ struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
@@ -1116,7 +1113,7 @@ retry:
va->va_start = addr;
va->va_end = addr + size;
- va->flags = 0;
+ va->vm = NULL;
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
@@ -1282,7 +1279,14 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
- __free_vmap_area(va);
+ /*
+ * Finally insert or merge lazily-freed area. It is
+ * detached and there is no need to "unlink" it from
+ * anything.
+ */
+ merge_or_add_vmap_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
+
atomic_long_sub(nr, &vmap_lazy_nr);
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
@@ -1324,6 +1328,10 @@ static void free_vmap_area_noflush(struct vmap_area *va)
{
unsigned long nr_lazy;
+ spin_lock(&vmap_area_lock);
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
PAGE_SHIFT, &vmap_lazy_nr);
@@ -1918,7 +1926,6 @@ void __init vmalloc_init(void)
if (WARN_ON_ONCE(!va))
continue;
- va->flags = VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
@@ -2016,7 +2023,6 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
- va->flags |= VM_VM_AREA;
spin_unlock(&vmap_area_lock);
}
@@ -2121,10 +2127,10 @@ struct vm_struct *find_vm_area(const void *addr)
struct vmap_area *va;
va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA)
- return va->vm;
+ if (!va)
+ return NULL;
- return NULL;
+ return va->vm;
}
/**
@@ -2143,14 +2149,12 @@ struct vm_struct *remove_vm_area(const void *addr)
might_sleep();
- va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA) {
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)addr);
+ if (va && va->vm) {
struct vm_struct *vm = va->vm;
- spin_lock(&vmap_area_lock);
va->vm = NULL;
- va->flags &= ~VM_VM_AREA;
- va->flags |= VM_LAZY_FREE;
spin_unlock(&vmap_area_lock);
kasan_free_shadow(vm);
@@ -2158,6 +2162,8 @@ struct vm_struct *remove_vm_area(const void *addr)
return vm;
}
+
+ spin_unlock(&vmap_area_lock);
return NULL;
}
@@ -2851,7 +2857,7 @@ long vread(char *buf, char *addr, unsigned long count)
if (!count)
break;
- if (!(va->flags & VM_VM_AREA))
+ if (!va->vm)
continue;
vm = va->vm;
@@ -2931,7 +2937,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
if (!count)
break;
- if (!(va->flags & VM_VM_AREA))
+ if (!va->vm)
continue;
vm = va->vm;
@@ -3450,6 +3456,22 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
}
}
+static void show_purge_info(struct seq_file *m)
+{
+ struct llist_node *head;
+ struct vmap_area *va;
+
+ head = READ_ONCE(vmap_purge_list.first);
+ if (head == NULL)
+ return;
+
+ llist_for_each_entry(va, head, purge_list) {
+ seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+ }
+}
+
static int s_show(struct seq_file *m, void *p)
{
struct vmap_area *va;
@@ -3458,14 +3480,13 @@ static int s_show(struct seq_file *m, void *p)
va = list_entry(p, struct vmap_area, list);
/*
- * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
- * behalf of vmap area is being tear down or vm_map_ram allocation.
+ * s_show can encounter race with remove_vm_area, !vm on behalf
+ * of vmap area is being tear down or vm_map_ram allocation.
*/
- if (!(va->flags & VM_VM_AREA)) {
- seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
+ if (!va->vm) {
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
(void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start,
- va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
+ va->va_end - va->va_start);
return 0;
}
@@ -3501,6 +3522,16 @@ static int s_show(struct seq_file *m, void *p)
show_numa_info(m, v);
seq_putc(m, '\n');
+
+ /*
+ * As a final step, dump "unpurged" areas. Note,
+ * that entire "/proc/vmallocinfo" output will not
+ * be address sorted, because the purge list is not
+ * sorted.
+ */
+ if (list_is_last(&va->list, &vmap_area_list))
+ show_purge_info(m);
+
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c77d1e3761a7..127e1e4cda50 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -171,11 +171,22 @@ int vm_swappiness = 60;
*/
unsigned long vm_total_pages;
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
-#ifdef CONFIG_MEMCG_KMEM
-
+#ifdef CONFIG_MEMCG
/*
* We allow subsystems to populate their shrinker-related
* LRU lists before register_shrinker_prepared() is called
@@ -227,30 +238,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
idr_remove(&shrinker_idr, id);
up_write(&shrinker_rwsem);
}
-#else /* CONFIG_MEMCG_KMEM */
-static int prealloc_memcg_shrinker(struct shrinker *shrinker)
-{
- return 0;
-}
-
-static void unregister_memcg_shrinker(struct shrinker *shrinker)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
-static void set_task_reclaim_state(struct task_struct *task,
- struct reclaim_state *rs)
-{
- /* Check for an overwrite */
- WARN_ON_ONCE(rs && task->reclaim_state);
-
- /* Check for the nulling of an already-nulled member */
- WARN_ON_ONCE(!rs && !task->reclaim_state);
-
- task->reclaim_state = rs;
-}
-#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
@@ -305,6 +293,15 @@ static bool memcg_congested(pg_data_t *pgdat,
}
#else
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+
static bool global_reclaim(struct scan_control *sc)
{
return true;
@@ -591,7 +588,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
return freed;
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
@@ -599,7 +596,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
unsigned long ret, freed = 0;
int i;
- if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+ if (!mem_cgroup_online(memcg))
return 0;
if (!down_read_trylock(&shrinker_rwsem))
@@ -625,6 +622,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
continue;
}
+ /* Call non-slab shrinkers even though kmem is disabled */
+ if (!memcg_kmem_enabled() &&
+ !(shrinker->flags & SHRINKER_NONSLAB))
+ continue;
+
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
clear_bit(i, map->map);
@@ -661,13 +663,13 @@ unlock:
up_read(&shrinker_rwsem);
return freed;
}
-#else /* CONFIG_MEMCG_KMEM */
+#else /* CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
return 0;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG */
/**
* shrink_slab - shrink slab caches
@@ -1121,7 +1123,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct scan_control *sc,
enum ttu_flags ttu_flags,
struct reclaim_stat *stat,
- bool force_reclaim)
+ bool ignore_references)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
@@ -1135,7 +1137,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct address_space *mapping;
struct page *page;
int may_enter_fs;
- enum page_references references = PAGEREF_RECLAIM_CLEAN;
+ enum page_references references = PAGEREF_RECLAIM;
bool dirty, writeback;
unsigned int nr_pages;
@@ -1149,7 +1151,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
VM_BUG_ON_PAGE(PageActive(page), page);
- nr_pages = 1 << compound_order(page);
+ nr_pages = compound_nr(page);
/* Account the number of base pages even though THP */
sc->nr_scanned += nr_pages;
@@ -1266,7 +1268,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
- if (!force_reclaim)
+ if (!ignore_references)
references = page_check_references(page, sc);
switch (references) {
@@ -1487,10 +1489,9 @@ free_it:
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
- if (unlikely(PageTransHuge(page))) {
- mem_cgroup_uncharge(page);
+ if (unlikely(PageTransHuge(page)))
(*get_compound_page_dtor(page))(page);
- } else
+ else
list_add(&page->lru, &free_pages);
continue;
@@ -1705,7 +1706,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
VM_BUG_ON_PAGE(!PageLRU(page), page);
- nr_pages = 1 << compound_order(page);
+ nr_pages = compound_nr(page);
total_scan += nr_pages;
if (page_zonenum(page) > sc->reclaim_idx) {
@@ -1911,7 +1912,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&pgdat->lru_lock);
} else
@@ -2145,6 +2145,62 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate, nr_rotated, sc->priority, file);
}
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+ int nid = -1;
+ unsigned long nr_reclaimed = 0;
+ LIST_HEAD(node_page_list);
+ struct reclaim_stat dummy_stat;
+ struct page *page;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ };
+
+ while (!list_empty(page_list)) {
+ page = lru_to_page(page_list);
+ if (nid == -1) {
+ nid = page_to_nid(page);
+ INIT_LIST_HEAD(&node_page_list);
+ }
+
+ if (nid == page_to_nid(page)) {
+ ClearPageActive(page);
+ list_move(&page->lru, &node_page_list);
+ continue;
+ }
+
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, 0,
+ &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+
+ nid = -1;
+ }
+
+ if (!list_empty(&node_page_list)) {
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, 0,
+ &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+ }
+
+ return nr_reclaimed;
+}
+
/*
* The inactive anon list should be small enough that the VM never has
* to do too much work.
@@ -2248,8 +2304,7 @@ enum scan_balance {
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
- struct scan_control *sc, unsigned long *nr,
- unsigned long *lru_pages)
+ struct scan_control *sc, unsigned long *nr)
{
int swappiness = mem_cgroup_swappiness(memcg);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
@@ -2400,20 +2455,72 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- *lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
- unsigned long size;
+ unsigned long lruvec_size;
unsigned long scan;
+ unsigned long protection;
+
+ lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ protection = mem_cgroup_protection(memcg,
+ sc->memcg_low_reclaim);
+
+ if (protection) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan = lruvec_size - lruvec_size * protection /
+ cgroup_size;
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decremeting
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ } else {
+ scan = lruvec_size;
+ }
+
+ scan >>= sc->priority;
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
/*
* If the cgroup's already been deleted, make sure to
* scrape out the remaining cache.
*/
if (!scan && !mem_cgroup_online(memcg))
- scan = min(size, SWAP_CLUSTER_MAX);
+ scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) {
case SCAN_EQUAL:
@@ -2433,7 +2540,7 @@ out:
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
+ lruvec_size = 0;
scan = 0;
}
break;
@@ -2442,7 +2549,6 @@ out:
BUG();
}
- *lru_pages += size;
nr[lru] = scan;
}
}
@@ -2451,7 +2557,7 @@ out:
* This is a basic per-node page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
- struct scan_control *sc, unsigned long *lru_pages)
+ struct scan_control *sc)
{
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
unsigned long nr[NR_LRU_LISTS];
@@ -2463,7 +2569,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
struct blk_plug plug;
bool scan_adjusted;
- get_scan_count(lruvec, memcg, sc, nr, lru_pages);
+ get_scan_count(lruvec, memcg, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2586,7 +2692,6 @@ static bool in_reclaim_compaction(struct scan_control *sc)
*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long nr_reclaimed,
- unsigned long nr_scanned,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
@@ -2597,40 +2702,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
if (!in_reclaim_compaction(sc))
return false;
- /* Consider stopping depending on scan and reclaim activity */
- if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
- /*
- * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
- * full LRU list has been scanned and we are still failing
- * to reclaim pages. This full LRU scan is potentially
- * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
- */
- if (!nr_reclaimed && !nr_scanned)
- return false;
- } else {
- /*
- * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
- * fail without consequence, stop if we failed to reclaim
- * any pages from the last SWAP_CLUSTER_MAX number of
- * pages that were scanned. This will return to the
- * caller faster at the risk reclaim/compaction and
- * the resulting allocation attempt fails
- */
- if (!nr_reclaimed)
- return false;
- }
-
/*
- * If we have not reclaimed enough pages for compaction and the
- * inactive lists are large enough, continue reclaiming
+ * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
+ * number of pages that were scanned. This will return to the caller
+ * with the risk reclaim/compaction and the resulting allocation attempt
+ * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
+ * allocations through requiring that the full LRU list has been scanned
+ * first, by assuming that zero delta of sc->nr_scanned means full LRU
+ * scan, but that approximation was wrong, and there were corner cases
+ * where always a non-zero amount of pages were scanned.
*/
- pages_for_compaction = compact_gap(sc->order);
- inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
- inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
- if (sc->nr_reclaimed < pages_for_compaction &&
- inactive_lru_pages > pages_for_compaction)
- return true;
+ if (!nr_reclaimed)
+ return false;
/* If compaction would go ahead or the allocation would succeed, stop */
for (z = 0; z <= sc->reclaim_idx; z++) {
@@ -2647,7 +2730,17 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
;
}
}
- return true;
+
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming
+ */
+ pages_for_compaction = compact_gap(sc->order);
+ inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_nr_swap_pages() > 0)
+ inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ return inactive_lru_pages > pages_for_compaction;
}
static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
@@ -2664,11 +2757,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
- struct mem_cgroup_reclaim_cookie reclaim = {
- .pgdat = pgdat,
- .priority = sc->priority,
- };
- unsigned long node_lru_pages = 0;
struct mem_cgroup *memcg;
memset(&sc->nr, 0, sizeof(sc->nr));
@@ -2676,9 +2764,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- memcg = mem_cgroup_iter(root, NULL, &reclaim);
+ memcg = mem_cgroup_iter(root, NULL, NULL);
do {
- unsigned long lru_pages;
unsigned long reclaimed;
unsigned long scanned;
@@ -2703,13 +2790,19 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
memcg_memory_event(memcg, MEMCG_LOW);
break;
case MEMCG_PROT_NONE:
+ /*
+ * All protection thresholds breached. We may
+ * still choose to vary the scan pressure
+ * applied based on by how much the cgroup in
+ * question has exceeded its protection
+ * thresholds (see get_scan_count).
+ */
break;
}
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
- shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
- node_lru_pages += lru_pages;
+ shrink_node_memcg(pgdat, memcg, sc);
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
sc->priority);
@@ -2719,21 +2812,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
- /*
- * Kswapd have to scan all memory cgroups to fulfill
- * the overall scan target for the node.
- *
- * Limit reclaim, on the other hand, only cares about
- * nr_to_reclaim pages to be reclaimed and it will
- * retry with decreasing priority if one round over the
- * whole hierarchy is not sufficient.
- */
- if (!current_is_kswapd() &&
- sc->nr_reclaimed >= sc->nr_to_reclaim) {
- mem_cgroup_iter_break(root, memcg);
- break;
- }
- } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
+ } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2810,7 +2889,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
wait_iff_congested(BLK_RW_ASYNC, HZ/10);
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
- sc->nr_scanned - nr_scanned, sc));
+ sc));
/*
* Kswapd gives up on balancing particular nodes after too
@@ -3233,14 +3312,15 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
};
- unsigned long lru_pages;
set_task_reclaim_state(current, &sc.reclaim_state);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
- sc.gfp_mask);
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(
+ cgroup_ino(memcg->css.cgroup),
+ sc.order,
+ sc.gfp_mask);
/*
* NOTE: Although we can get the priority field, using it
@@ -3249,9 +3329,11 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
+ shrink_node_memcg(pgdat, memcg, &sc);
- trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+ trace_mm_vmscan_memcg_softlimit_reclaim_end(
+ cgroup_ino(memcg->css.cgroup),
+ sc.nr_reclaimed);
set_task_reclaim_state(current, NULL);
*nr_scanned = sc.nr_scanned;
@@ -3291,7 +3373,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
- trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
+ trace_mm_vmscan_memcg_reclaim_begin(
+ cgroup_ino(memcg->css.cgroup),
+ 0, sc.gfp_mask);
psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
@@ -3301,7 +3385,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
- trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+ trace_mm_vmscan_memcg_reclaim_end(
+ cgroup_ino(memcg->css.cgroup),
+ nr_reclaimed);
set_task_reclaim_state(current, NULL);
return nr_reclaimed;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fd7e16ca6996..6afc892a148a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1158,6 +1158,8 @@ const char * const vmstat_text[] = {
"nr_shmem",
"nr_shmem_hugepages",
"nr_shmem_pmdmapped",
+ "nr_file_hugepages",
+ "nr_file_pmdmapped",
"nr_anon_transparent_hugepages",
"nr_unstable",
"nr_vmscan_write",
diff --git a/mm/z3fold.c b/mm/z3fold.c
index e31cd9bd4ed5..75b7962439ff 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1406,6 +1406,7 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
* should freak out.
*/
WARN(1, "Z3fold is experiencing kref problems\n");
+ z3fold_page_unlock(zhdr);
return false;
}
z3fold_page_unlock(zhdr);
diff --git a/mm/zpool.c b/mm/zpool.c
index a2dd9107857d..863669212070 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -239,6 +239,22 @@ const char *zpool_get_type(struct zpool *zpool)
}
/**
+ * zpool_malloc_support_movable() - Check if the zpool support
+ * allocate movable memory
+ * @zpool: The zpool to check
+ *
+ * This returns if the zpool support allocate movable memory.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: true if if the zpool support allocate movable memory, false if not
+ */
+bool zpool_malloc_support_movable(struct zpool *zpool)
+{
+ return zpool->driver->malloc_support_movable;
+}
+
+/**
* zpool_malloc() - Allocate memory
* @zpool: The zpool to allocate from.
* @size: The amount of memory to allocate.
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 08def3a0d200..646858efd468 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -443,15 +443,16 @@ static u64 zs_zpool_total_size(void *pool)
}
static struct zpool_driver zs_zpool_driver = {
- .type = "zsmalloc",
- .owner = THIS_MODULE,
- .create = zs_zpool_create,
- .destroy = zs_zpool_destroy,
- .malloc = zs_zpool_malloc,
- .free = zs_zpool_free,
- .map = zs_zpool_map,
- .unmap = zs_zpool_unmap,
- .total_size = zs_zpool_total_size,
+ .type = "zsmalloc",
+ .owner = THIS_MODULE,
+ .create = zs_zpool_create,
+ .destroy = zs_zpool_destroy,
+ .malloc_support_movable = true,
+ .malloc = zs_zpool_malloc,
+ .free = zs_zpool_free,
+ .map = zs_zpool_map,
+ .unmap = zs_zpool_unmap,
+ .total_size = zs_zpool_total_size,
};
MODULE_ALIAS("zpool-zsmalloc");
@@ -2412,7 +2413,9 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool->name)
goto err;
+#ifdef CONFIG_COMPACTION
init_waitqueue_head(&pool->migration_wait);
+#endif
if (create_cache(pool))
goto err;
diff --git a/mm/zswap.c b/mm/zswap.c
index 0e22744a76cb..08b6cefae5d8 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -997,6 +997,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
char *buf;
u8 *src, *dst;
struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
+ gfp_t gfp;
/* THP isn't supported */
if (PageTransHuge(page)) {
@@ -1070,9 +1071,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* store */
hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
- ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
- __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
- &handle);
+ gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ if (zpool_malloc_support_movable(entry->pool->zpool))
+ gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+ ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 93f382974bd0..fcdc66c39d99 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -206,14 +206,7 @@ static int xdp_umem_map_pages(struct xdp_umem *umem)
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
- unsigned int i;
-
- for (i = 0; i < umem->npgs; i++) {
- struct page *page = umem->pgs[i];
-
- set_page_dirty_lock(page);
- put_page(page);
- }
+ put_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
kfree(umem->pgs);
umem->pgs = NULL;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ee4428a892fa..f5a9a7224fc2 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -895,7 +895,7 @@ static int xsk_mmap(struct file *file, struct socket *sock,
/* Matches the smp_wmb() in xsk_init_queue */
smp_rmb();
qpg = virt_to_head_page(q->ring);
- if (size > (PAGE_SIZE << compound_order(qpg)))
+ if (size > page_size(qpg))
return -EINVAL;
pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 93a7edfe0f05..f4b6127ff469 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -605,6 +605,20 @@ foreach my $entry (keys %deprecated_apis) {
}
$deprecated_apis_search = "(?:${deprecated_apis_search})";
+our %deprecated_string_apis = (
+ "strcpy" => "stracpy or strscpy",
+ "strlcpy" => "stracpy or strscpy",
+ "strncpy" => "stracpy or strscpy - for non-NUL-terminated uses, strncpy dest should be __nonstring",
+);
+
+#Create a search pattern for all these strings apis to speed up a loop below
+our $deprecated_string_apis_search = "";
+foreach my $entry (keys %deprecated_string_apis) {
+ $deprecated_string_apis_search .= '|' if ($deprecated_string_apis_search ne "");
+ $deprecated_string_apis_search .= $entry;
+}
+$deprecated_string_apis_search = "(?:${deprecated_string_apis_search})";
+
our $mode_perms_world_writable = qr{
S_IWUGO |
S_IWOTH |
@@ -2725,8 +2739,10 @@ sub process {
($line =~ /^\s*(?:WARNING:|BUG:)/ ||
$line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
# timestamp
- $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
- # stack dump address
+ $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/) ||
+ $line =~ /^(?:\s+\w+:\s+[0-9a-fA-F]+){3,3}/ ||
+ $line =~ /^\s*\#\d+\s*\[[0-9a-fA-F]+\]\s*\w+ at [0-9a-fA-F]+/) {
+ # stack dump address styles
$commit_log_possible_stack_dump = 1;
}
@@ -2898,6 +2914,17 @@ sub process {
}
}
+# check for invalid commit id
+ if ($in_commit_log && $line =~ /(^fixes:|\bcommit)\s+([0-9a-f]{6,40})\b/i) {
+ my $id;
+ my $description;
+ ($id, $description) = git_commit_info($2, undef, undef);
+ if (!defined($id)) {
+ WARN("UNKNOWN_COMMIT_ID",
+ "Unknown commit id '$2', maybe rebased or not pulled?\n" . $herecurr);
+ }
+ }
+
# ignore non-hunk lines and lines being removed
next if (!$hunk_line || $line =~ /^-/);
@@ -3069,21 +3096,21 @@ sub process {
# check SPDX comment style for .[chsS] files
if ($realfile =~ /\.[chsS]$/ &&
$rawline =~ /SPDX-License-Identifier:/ &&
- $rawline !~ /^\+\s*\Q$comment\E\s*/) {
+ $rawline !~ m@^\+\s*\Q$comment\E\s*@) {
WARN("SPDX_LICENSE_TAG",
"Improper SPDX comment style for '$realfile', please use '$comment' instead\n" . $herecurr);
}
if ($comment !~ /^$/ &&
- $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) {
- WARN("SPDX_LICENSE_TAG",
- "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
+ $rawline !~ m@^\+\Q$comment\E SPDX-License-Identifier: @) {
+ WARN("SPDX_LICENSE_TAG",
+ "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
} elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) {
- my $spdx_license = $1;
- if (!is_SPDX_License_valid($spdx_license)) {
- WARN("SPDX_LICENSE_TAG",
- "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
- }
+ my $spdx_license = $1;
+ if (!is_SPDX_License_valid($spdx_license)) {
+ WARN("SPDX_LICENSE_TAG",
+ "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
+ }
}
}
}
@@ -4660,7 +4687,7 @@ sub process {
# closing brace should have a space following it when it has anything
# on the line
- if ($line =~ /}(?!(?:,|;|\)))\S/) {
+ if ($line =~ /}(?!(?:,|;|\)|\}))\S/) {
if (ERROR("SPACING",
"space required after that close brace '}'\n" . $herecurr) &&
$fix) {
@@ -5191,7 +5218,7 @@ sub process {
next if ($arg =~ /\.\.\./);
next if ($arg =~ /^type$/i);
my $tmp_stmt = $define_stmt;
- $tmp_stmt =~ s/\b(typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g;
+ $tmp_stmt =~ s/\b(sizeof|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g;
$tmp_stmt =~ s/\#+\s*$arg\b//g;
$tmp_stmt =~ s/\b$arg\s*\#\#//g;
my $use_cnt = () = $tmp_stmt =~ /\b$arg\b/g;
@@ -5873,6 +5900,18 @@ sub process {
"__aligned(size) is preferred over __attribute__((aligned(size)))\n" . $herecurr);
}
+# Check for __attribute__ section, prefer __section
+ if ($realfile !~ m@\binclude/uapi/@ &&
+ $line =~ /\b__attribute__\s*\(\s*\(.*_*section_*\s*\(\s*("[^"]*")/) {
+ my $old = substr($rawline, $-[1], $+[1] - $-[1]);
+ my $new = substr($old, 1, -1);
+ if (WARN("PREFER_SECTION",
+ "__section($new) is preferred over __attribute__((section($old)))\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*_*section_*\s*\(\s*\Q$old\E\s*\)\s*\)\s*\)/__section($new)/;
+ }
+ }
+
# Check for __attribute__ format(printf, prefer __printf
if ($realfile !~ m@\binclude/uapi/@ &&
$line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) {
@@ -6446,6 +6485,16 @@ sub process {
"Deprecated use of '$deprecated_api', prefer '$new_api' instead\n" . $herecurr);
}
+# check for string deprecated apis
+ if ($line =~ /\b($deprecated_string_apis_search)\b\s*\(/) {
+ my $deprecated_string_api = $1;
+ my $new_api = $deprecated_string_apis{$deprecated_string_api};
+ my $msg_level = \&WARN;
+ $msg_level = \&CHK if ($file);
+ &{$msg_level}("DEPRECATED_API",
+ "Deprecated use of '$deprecated_string_api', prefer '$new_api' instead\n" . $herecurr);
+ }
+
# check for various structs that are normally const (ops, kgdb, device_tree)
# and avoid what seem like struct definitions 'struct foo {'
if ($line !~ /\bconst\b/ &&
diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py
index 2f5b95f09fa0..34e40e96dee2 100644
--- a/scripts/gdb/linux/symbols.py
+++ b/scripts/gdb/linux/symbols.py
@@ -77,12 +77,12 @@ lx-symbols command."""
gdb.write("scanning for modules in {0}\n".format(path))
for root, dirs, files in os.walk(path):
for name in files:
- if name.endswith(".ko"):
+ if name.endswith(".ko") or name.endswith(".ko.debug"):
self.module_files.append(root + "/" + name)
self.module_files_updated = True
def _get_module_file(self, module_name):
- module_pattern = ".*/{0}\.ko$".format(
+ module_pattern = ".*/{0}\.ko(?:.debug)?$".format(
module_name.replace("_", r"[_\-]"))
for name in self.module_files:
if re.match(module_pattern, name) and os.path.exists(name):
diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h
index d83763a5327c..e03b1ea23e0e 100644
--- a/tools/include/linux/rbtree.h
+++ b/tools/include/linux/rbtree.h
@@ -31,25 +31,9 @@ struct rb_root {
struct rb_node *rb_node;
};
-/*
- * Leftmost-cached rbtrees.
- *
- * We do not cache the rightmost node based on footprint
- * size vs number of potential users that could benefit
- * from O(1) rb_last(). Just not worth it, users that want
- * this feature can always implement the logic explicitly.
- * Furthermore, users that want to cache both pointers may
- * find it a bit asymmetric, but that's ok.
- */
-struct rb_root_cached {
- struct rb_root rb_root;
- struct rb_node *rb_leftmost;
-};
-
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
#define RB_ROOT (struct rb_root) { NULL, }
-#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
@@ -71,12 +55,6 @@ extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
-extern void rb_insert_color_cached(struct rb_node *,
- struct rb_root_cached *, bool);
-extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
-/* Same as rb_first(), but O(1) */
-#define rb_first_cached(root) (root)->rb_leftmost
-
/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
@@ -84,8 +62,6 @@ extern struct rb_node *rb_next_postorder(const struct rb_node *);
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
-extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new,
- struct rb_root_cached *root);
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
@@ -129,4 +105,51 @@ static inline void rb_erase_init(struct rb_node *n, struct rb_root *root)
rb_erase(n, root);
RB_CLEAR_NODE(n);
}
+
+/*
+ * Leftmost-cached rbtrees.
+ *
+ * We do not cache the rightmost node based on footprint
+ * size vs number of potential users that could benefit
+ * from O(1) rb_last(). Just not worth it, users that want
+ * this feature can always implement the logic explicitly.
+ * Furthermore, users that want to cache both pointers may
+ * find it a bit asymmetric, but that's ok.
+ */
+struct rb_root_cached {
+ struct rb_root rb_root;
+ struct rb_node *rb_leftmost;
+};
+
+#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
+
+/* Same as rb_first(), but O(1) */
+#define rb_first_cached(root) (root)->rb_leftmost
+
+static inline void rb_insert_color_cached(struct rb_node *node,
+ struct rb_root_cached *root,
+ bool leftmost)
+{
+ if (leftmost)
+ root->rb_leftmost = node;
+ rb_insert_color(node, &root->rb_root);
+}
+
+static inline void rb_erase_cached(struct rb_node *node,
+ struct rb_root_cached *root)
+{
+ if (root->rb_leftmost == node)
+ root->rb_leftmost = rb_next(node);
+ rb_erase(node, &root->rb_root);
+}
+
+static inline void rb_replace_node_cached(struct rb_node *victim,
+ struct rb_node *new,
+ struct rb_root_cached *root)
+{
+ if (root->rb_leftmost == victim)
+ root->rb_leftmost = new;
+ rb_replace_node(victim, new, &root->rb_root);
+}
+
#endif /* __TOOLS_LINUX_PERF_RBTREE_H */
diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h
index ddd01006ece5..381aa948610d 100644
--- a/tools/include/linux/rbtree_augmented.h
+++ b/tools/include/linux/rbtree_augmented.h
@@ -32,17 +32,16 @@ struct rb_augment_callbacks {
void (*rotate)(struct rb_node *old, struct rb_node *new);
};
-extern void __rb_insert_augmented(struct rb_node *node,
- struct rb_root *root,
- bool newleft, struct rb_node **leftmost,
+extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+
/*
* Fixup the rbtree and update the augmented information when rebalancing.
*
* On insertion, the user must update the augmented information on the path
* leading to the inserted node, then call rb_link_node() as usual and
- * rb_augment_inserted() instead of the usual rb_insert_color() call.
- * If rb_augment_inserted() rebalances the rbtree, it will callback into
+ * rb_insert_augmented() instead of the usual rb_insert_color() call.
+ * If rb_insert_augmented() rebalances the rbtree, it will callback into
* a user provided function to update the augmented information on the
* affected subtrees.
*/
@@ -50,7 +49,7 @@ static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
- __rb_insert_augmented(node, root, false, NULL, augment->rotate);
+ __rb_insert_augmented(node, root, augment->rotate);
}
static inline void
@@ -58,45 +57,92 @@ rb_insert_augmented_cached(struct rb_node *node,
struct rb_root_cached *root, bool newleft,
const struct rb_augment_callbacks *augment)
{
- __rb_insert_augmented(node, &root->rb_root,
- newleft, &root->rb_leftmost, augment->rotate);
+ if (newleft)
+ root->rb_leftmost = node;
+ rb_insert_augmented(node, &root->rb_root, augment);
}
-#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \
- rbtype, rbaugmented, rbcompute) \
+/*
+ * Template for declaring augmented rbtree callbacks (generic case)
+ *
+ * RBSTATIC: 'static' or empty
+ * RBNAME: name of the rb_augment_callbacks structure
+ * RBSTRUCT: struct type of the tree nodes
+ * RBFIELD: name of struct rb_node field within RBSTRUCT
+ * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
+ * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data
+ */
+
+#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
+ RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \
static inline void \
-rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \
+RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \
{ \
while (rb != stop) { \
- rbstruct *node = rb_entry(rb, rbstruct, rbfield); \
- rbtype augmented = rbcompute(node); \
- if (node->rbaugmented == augmented) \
+ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \
+ if (RBCOMPUTE(node, true)) \
break; \
- node->rbaugmented = augmented; \
- rb = rb_parent(&node->rbfield); \
+ rb = rb_parent(&node->RBFIELD); \
} \
} \
static inline void \
-rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
+RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
- rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
- rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
- new->rbaugmented = old->rbaugmented; \
+ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
+ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
+ new->RBAUGMENTED = old->RBAUGMENTED; \
} \
static void \
-rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
+RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
{ \
- rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \
- rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \
- new->rbaugmented = old->rbaugmented; \
- old->rbaugmented = rbcompute(old); \
+ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
+ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
+ new->RBAUGMENTED = old->RBAUGMENTED; \
+ RBCOMPUTE(old, false); \
} \
-rbstatic const struct rb_augment_callbacks rbname = { \
- .propagate = rbname ## _propagate, \
- .copy = rbname ## _copy, \
- .rotate = rbname ## _rotate \
+RBSTATIC const struct rb_augment_callbacks RBNAME = { \
+ .propagate = RBNAME ## _propagate, \
+ .copy = RBNAME ## _copy, \
+ .rotate = RBNAME ## _rotate \
};
+/*
+ * Template for declaring augmented rbtree callbacks,
+ * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
+ *
+ * RBSTATIC: 'static' or empty
+ * RBNAME: name of the rb_augment_callbacks structure
+ * RBSTRUCT: struct type of the tree nodes
+ * RBFIELD: name of struct rb_node field within RBSTRUCT
+ * RBTYPE: type of the RBAUGMENTED field
+ * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
+ * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar
+ */
+
+#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \
+ RBTYPE, RBAUGMENTED, RBCOMPUTE) \
+static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \
+{ \
+ RBSTRUCT *child; \
+ RBTYPE max = RBCOMPUTE(node); \
+ if (node->RBFIELD.rb_left) { \
+ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \
+ if (child->RBAUGMENTED > max) \
+ max = child->RBAUGMENTED; \
+ } \
+ if (node->RBFIELD.rb_right) { \
+ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \
+ if (child->RBAUGMENTED > max) \
+ max = child->RBAUGMENTED; \
+ } \
+ if (exit && node->RBAUGMENTED == max) \
+ return true; \
+ node->RBAUGMENTED = max; \
+ return false; \
+} \
+RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
+ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
+
#define RB_RED 0
#define RB_BLACK 1
@@ -139,7 +185,6 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
- struct rb_node **leftmost,
const struct rb_augment_callbacks *augment)
{
struct rb_node *child = node->rb_right;
@@ -147,9 +192,6 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
struct rb_node *parent, *rebalance;
unsigned long pc;
- if (leftmost && node == *leftmost)
- *leftmost = rb_next(node);
-
if (!tmp) {
/*
* Case 1: node to erase has no more than 1 child (easy!)
@@ -249,8 +291,7 @@ static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
- struct rb_node *rebalance = __rb_erase_augmented(node, root,
- NULL, augment);
+ struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
if (rebalance)
__rb_erase_color(rebalance, root, augment->rotate);
}
@@ -259,11 +300,9 @@ static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
const struct rb_augment_callbacks *augment)
{
- struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root,
- &root->rb_leftmost,
- augment);
- if (rebalance)
- __rb_erase_color(rebalance, &root->rb_root, augment->rotate);
+ if (root->rb_leftmost == node)
+ root->rb_leftmost = rb_next(node);
+ rb_erase_augmented(node, &root->rb_root, augment);
}
#endif /* _TOOLS_LINUX_RBTREE_AUGMENTED_H */
diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c
index 804f145e3113..2548ff8c4d9c 100644
--- a/tools/lib/rbtree.c
+++ b/tools/lib/rbtree.c
@@ -83,14 +83,10 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
static __always_inline void
__rb_insert(struct rb_node *node, struct rb_root *root,
- bool newleft, struct rb_node **leftmost,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
- if (newleft)
- *leftmost = node;
-
while (true) {
/*
* Loop invariant: node is red.
@@ -436,34 +432,17 @@ static const struct rb_augment_callbacks dummy_callbacks = {
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
- __rb_insert(node, root, false, NULL, dummy_rotate);
+ __rb_insert(node, root, dummy_rotate);
}
void rb_erase(struct rb_node *node, struct rb_root *root)
{
struct rb_node *rebalance;
- rebalance = __rb_erase_augmented(node, root,
- NULL, &dummy_callbacks);
+ rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
if (rebalance)
____rb_erase_color(rebalance, root, dummy_rotate);
}
-void rb_insert_color_cached(struct rb_node *node,
- struct rb_root_cached *root, bool leftmost)
-{
- __rb_insert(node, &root->rb_root, leftmost,
- &root->rb_leftmost, dummy_rotate);
-}
-
-void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
-{
- struct rb_node *rebalance;
- rebalance = __rb_erase_augmented(node, &root->rb_root,
- &root->rb_leftmost, &dummy_callbacks);
- if (rebalance)
- ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate);
-}
-
/*
* Augmented rbtree manipulation functions.
*
@@ -472,10 +451,9 @@ void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
*/
void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
- bool newleft, struct rb_node **leftmost,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
- __rb_insert(node, root, newleft, leftmost, augment_rotate);
+ __rb_insert(node, root, augment_rotate);
}
/*
@@ -580,15 +558,6 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
__rb_change_child(victim, new, parent, root);
}
-void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new,
- struct rb_root_cached *root)
-{
- rb_replace_node(victim, new, &root->rb_root);
-
- if (root->rb_leftmost == victim)
- root->rb_leftmost = new;
-}
-
static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
{
for (;;) {
diff --git a/usr/Makefile b/usr/Makefile
index 6a89eb019275..e6f7cb2f81db 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -11,6 +11,9 @@ datafile_y = initramfs_data.cpio$(suffix_y)
datafile_d_y = .$(datafile_y).d
AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)"
+# clean rules do not have CONFIG_INITRAMFS_COMPRESSION. So clean up after all
+# possible compression formats.
+clean-files += initramfs_data.cpio*
# Generate builtin.o based on initramfs_data.o
obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o