summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2022-10-27 13:28:47 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2022-10-27 13:28:47 +1100
commit59855347bb4443814a886525983a7d092ea98bce (patch)
treed041c42ceffbfc96d6176f644afd455147652105
parent4173e2071d682b83b05d2f8bc5fdf8a538d6c406 (diff)
parent48b7c415997df9cb7ae08a3c509788fdda7b9ed4 (diff)
downloadlinux-59855347bb4443814a886525983a7d092ea98bce.tar.gz
linux-59855347bb4443814a886525983a7d092ea98bce.tar.xz
Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
-rw-r--r--.clang-format1
-rw-r--r--Documentation/admin-guide/blockdev/zram.rst64
-rw-r--r--Documentation/admin-guide/sysctl/kernel.rst1
-rw-r--r--Documentation/fault-injection/fault-injection.rst10
-rw-r--r--Documentation/filesystems/proc.rst17
-rw-r--r--Documentation/mm/balance.rst2
-rw-r--r--MAINTAINERS14
-rw-r--r--arch/alpha/include/asm/pgtable.h2
-rw-r--r--arch/arc/include/asm/pgtable-bits-arcv2.h2
-rw-r--r--arch/arc/kernel/ptrace.c2
-rw-r--r--arch/arm/include/asm/pgtable-nommu.h2
-rw-r--r--arch/arm/include/asm/pgtable.h4
-rw-r--r--arch/arm/kernel/machine_kexec.c2
-rw-r--r--arch/arm/kernel/ptrace.c8
-rw-r--r--arch/arm64/include/asm/pgtable.h2
-rw-r--r--arch/arm64/kernel/ptrace.c16
-rw-r--r--arch/arm64/mm/mmu.c47
-rw-r--r--arch/arm64/mm/pageattr.c3
-rw-r--r--arch/csky/include/asm/pgtable.h3
-rw-r--r--arch/hexagon/include/asm/page.h7
-rw-r--r--arch/hexagon/kernel/ptrace.c7
-rw-r--r--arch/ia64/include/asm/io.h4
-rw-r--r--arch/ia64/include/asm/kprobes.h2
-rw-r--r--arch/ia64/include/asm/pgtable.h16
-rw-r--r--arch/ia64/kernel/ptrace.c20
-rw-r--r--arch/ia64/kernel/sys_ia64.c6
-rw-r--r--arch/ia64/mm/hugetlbpage.c15
-rw-r--r--arch/loongarch/include/asm/pgtable.h2
-rw-r--r--arch/m68k/include/asm/pgtable_mm.h2
-rw-r--r--arch/m68k/include/asm/pgtable_no.h1
-rw-r--r--arch/microblaze/include/asm/pgtable.h3
-rw-r--r--arch/mips/include/asm/pgtable.h2
-rw-r--r--arch/mips/kernel/ptrace.c9
-rw-r--r--arch/nios2/include/asm/pgtable.h2
-rw-r--r--arch/nios2/include/asm/processor.h3
-rw-r--r--arch/nios2/kernel/ptrace.c6
-rw-r--r--arch/openrisc/include/asm/pgtable.h2
-rw-r--r--arch/openrisc/kernel/ptrace.c8
-rw-r--r--arch/parisc/include/asm/pgtable.h15
-rw-r--r--arch/parisc/kernel/pdt.c5
-rw-r--r--arch/parisc/kernel/ptrace.c15
-rw-r--r--arch/powerpc/include/asm/pgtable.h7
-rw-r--r--arch/powerpc/kernel/ptrace/ptrace-tm.c10
-rw-r--r--arch/powerpc/kernel/ptrace/ptrace-view.c15
-rw-r--r--arch/powerpc/kexec/file_load_64.c2
-rw-r--r--arch/powerpc/kexec/ranges.c8
-rw-r--r--arch/powerpc/kvm/book3s_hv_uvmem.c12
-rw-r--r--arch/powerpc/mm/hugetlbpage.c37
-rw-r--r--arch/riscv/include/asm/pgtable.h2
-rw-r--r--arch/s390/include/asm/pgtable.h2
-rw-r--r--arch/sh/include/asm/pgtable.h2
-rw-r--r--arch/sh/kernel/ptrace_32.c8
-rw-r--r--arch/sparc/include/asm/pgtable_32.h6
-rw-r--r--arch/sparc/kernel/ptrace_32.c9
-rw-r--r--arch/sparc/kernel/ptrace_64.c23
-rw-r--r--arch/sparc/mm/init_32.c3
-rw-r--r--arch/sparc/mm/init_64.c1
-rw-r--r--arch/um/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/pgtable_32.h9
-rw-r--r--arch/x86/include/asm/pgtable_64.h1
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c4
-rw-r--r--arch/x86/mm/init_64.c41
-rw-r--r--arch/xtensa/include/asm/pgtable.h2
-rw-r--r--drivers/Makefile2
-rw-r--r--drivers/acpi/numa/hmat.c7
-rw-r--r--drivers/base/memory.c38
-rw-r--r--drivers/block/zram/Kconfig55
-rw-r--r--drivers/block/zram/zcomp.c6
-rw-r--r--drivers/block/zram/zcomp.h2
-rw-r--r--drivers/block/zram/zram_drv.c471
-rw-r--r--drivers/block/zram/zram_drv.h16
-rw-r--r--drivers/dax/Kconfig5
-rw-r--r--drivers/dax/Makefile1
-rw-r--r--drivers/dax/bus.c9
-rw-r--r--drivers/dax/dax-private.h2
-rw-r--r--drivers/dax/device.c73
-rw-r--r--drivers/dax/mapping.c1089
-rw-r--r--drivers/dax/super.c10
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_migrate.c15
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_dmem.c12
-rw-r--r--drivers/iommu/tegra-smmu.c4
-rw-r--r--drivers/nvdimm/Kconfig3
-rw-r--r--drivers/nvdimm/pmem.c47
-rw-r--r--fs/coredump.c5
-rw-r--r--fs/dax.c1069
-rw-r--r--fs/debugfs/file.c28
-rw-r--r--fs/ext4/inode.c9
-rw-r--r--fs/ext4/super.c4
-rw-r--r--fs/fs_context.c17
-rw-r--r--fs/fs_parser.c16
-rw-r--r--fs/fuse/dax.c9
-rw-r--r--fs/hugetlbfs/inode.c39
-rw-r--r--fs/libfs.c22
-rw-r--r--fs/ocfs2/cluster/heartbeat.c38
-rw-r--r--fs/ocfs2/cluster/heartbeat.h2
-rw-r--r--fs/ocfs2/cluster/netdebug.c2
-rw-r--r--fs/ocfs2/cluster/nodemanager.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c6
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c19
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c30
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/stack_o2cb.c6
-rw-r--r--fs/proc/cmdline.c6
-rw-r--r--fs/proc/fd.c45
-rw-r--r--fs/proc/kcore.c33
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/xfs/xfs_file.c7
-rw-r--r--fs/xfs/xfs_inode.c4
-rw-r--r--include/linux/compiler-gcc.h21
-rw-r--r--include/linux/coredump.h1
-rw-r--r--include/linux/dax.h149
-rw-r--r--include/linux/debugfs.h19
-rw-r--r--include/linux/fs.h12
-rw-r--r--include/linux/fs_context.h3
-rw-r--r--include/linux/gfp_types.h12
-rw-r--r--include/linux/highmem.h24
-rw-r--r--include/linux/huge_mm.h26
-rw-r--r--include/linux/hugetlb.h119
-rw-r--r--include/linux/hugetlb_cgroup.h85
-rw-r--r--include/linux/init.h1
-rw-r--r--include/linux/kexec.h7
-rw-r--r--include/linux/memory-tiers.h1
-rw-r--r--include/linux/memory.h18
-rw-r--r--include/linux/memremap.h39
-rw-r--r--include/linux/minmax.h26
-rw-r--r--include/linux/mm.h95
-rw-r--r--include/linux/mm_types.h48
-rw-r--r--include/linux/mm_types_task.h13
-rw-r--r--include/linux/pagemap.h6
-rw-r--r--include/linux/pagewalk.h5
-rw-r--r--include/linux/percpu_counter.h1
-rw-r--r--include/linux/regset.h15
-rw-r--r--include/linux/sched.h3
-rw-r--r--include/linux/swap.h5
-rw-r--r--include/linux/swapops.h24
-rw-r--r--include/trace/events/fs_dax.h16
-rw-r--r--include/trace/events/kmem.h8
-rw-r--r--include/trace/events/mmflags.h1
-rw-r--r--include/trace/events/vmalloc.h123
-rw-r--r--init/main.c7
-rw-r--r--kernel/cgroup/cpuset.c7
-rw-r--r--kernel/fork.c16
-rw-r--r--kernel/kexec_core.c10
-rw-r--r--kernel/kexec_file.c2
-rw-r--r--kernel/panic.c5
-rw-r--r--kernel/sysctl.c1
-rw-r--r--lib/Kconfig31
-rw-r--r--lib/Kconfig.debug1
-rw-r--r--lib/Kconfig.kasan2
-rw-r--r--lib/debugobjects.c10
-rw-r--r--lib/fault-inject.c22
-rw-r--r--lib/llist.c4
-rw-r--r--lib/maple_tree.c2
-rw-r--r--lib/notifier-error-inject.c2
-rw-r--r--lib/oid_registry.c1
-rw-r--r--lib/test_hmm.c10
-rw-r--r--lib/test_printf.c8
-rw-r--r--mm/damon/Makefile6
-rw-r--r--mm/damon/core.c262
-rw-r--r--mm/damon/lru_sort.c70
-rw-r--r--mm/damon/modules-common.c42
-rw-r--r--mm/damon/modules-common.h3
-rw-r--r--mm/damon/reclaim.c72
-rw-r--r--mm/damon/sysfs-common.c107
-rw-r--r--mm/damon/sysfs-common.h24
-rw-r--r--mm/damon/sysfs.c172
-rw-r--r--mm/debug_vm_pgtable.c8
-rw-r--r--mm/filemap.c28
-rw-r--r--mm/folio-compat.c7
-rw-r--r--mm/gup.c222
-rw-r--r--mm/gup_test.c141
-rw-r--r--mm/gup_test.h12
-rw-r--r--mm/huge_memory.c72
-rw-r--r--mm/hugetlb.c328
-rw-r--r--mm/hugetlb_cgroup.c63
-rw-r--r--mm/internal.h14
-rw-r--r--mm/kasan/kasan.h8
-rw-r--r--mm/kasan/kasan_test.c148
-rw-r--r--mm/kasan/kasan_test_module.c60
-rw-r--r--mm/kasan/report.c31
-rw-r--r--mm/kasan/shadow.c2
-rw-r--r--mm/ksm.c80
-rw-r--r--mm/memcontrol.c32
-rw-r--r--mm/memory-failure.c161
-rw-r--r--mm/memory-tiers.c2
-rw-r--r--mm/memory.c116
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/memremap.c114
-rw-r--r--mm/migrate.c23
-rw-r--r--mm/mincore.c14
-rw-r--r--mm/mm_init.c8
-rw-r--r--mm/mmap.c6
-rw-r--r--mm/mprotect.c5
-rw-r--r--mm/page_alloc.c25
-rw-r--r--mm/page_ext.c2
-rw-r--r--mm/pagewalk.c27
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/shmem.c23
-rw-r--r--mm/slub.c7
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c27
-rw-r--r--mm/swap.h8
-rw-r--r--mm/swap_state.c28
-rw-r--r--mm/truncate.c30
-rw-r--r--mm/vmalloc.c20
-rw-r--r--mm/vmscan.c14
-rw-r--r--mm/workingset.c2
-rwxr-xr-xscripts/checkpatch.pl17
-rw-r--r--tools/perf/builtin-kmem.c1
-rw-r--r--tools/testing/radix-tree/maple.c5
-rw-r--r--tools/testing/selftests/damon/Makefile1
-rw-r--r--tools/testing/selftests/damon/lru_sort.sh41
-rw-r--r--tools/testing/selftests/damon/reclaim.sh42
-rw-r--r--tools/testing/selftests/proc/proc-uptime-002.c3
-rw-r--r--tools/testing/selftests/vm/.gitignore1
-rw-r--r--tools/testing/selftests/vm/Makefile27
-rw-r--r--tools/testing/selftests/vm/anon_cow.c1126
-rw-r--r--tools/testing/selftests/vm/check_config.sh31
-rw-r--r--tools/testing/selftests/vm/hugepage-mremap.c21
-rw-r--r--tools/testing/selftests/vm/hugetlb-madvise.c12
-rw-r--r--tools/testing/selftests/vm/ksm_functional_tests.c279
-rw-r--r--tools/testing/selftests/vm/ksm_tests.c76
-rw-r--r--tools/testing/selftests/vm/madv_populate.c8
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests.sh231
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c62
-rw-r--r--tools/testing/selftests/vm/vm_util.c25
-rw-r--r--tools/testing/selftests/vm/vm_util.h3
229 files changed, 6098 insertions, 3527 deletions
diff --git a/.clang-format b/.clang-format
index 1247d54f9e49..767651ddc50c 100644
--- a/.clang-format
+++ b/.clang-format
@@ -136,6 +136,7 @@ ForEachMacros:
- 'data__for_each_file'
- 'data__for_each_file_new'
- 'data__for_each_file_start'
+ - 'dax_for_each_folio'
- 'device_for_each_child_node'
- 'displayid_iter_for_each'
- 'dma_fence_array_for_each'
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst
index c73b16930449..010fb05a5999 100644
--- a/Documentation/admin-guide/blockdev/zram.rst
+++ b/Documentation/admin-guide/blockdev/zram.rst
@@ -401,6 +401,61 @@ budget in next setting is user's job.
If admin wants to measure writeback count in a certain period, they could
know it via /sys/block/zram0/bd_stat's 3rd column.
+recompression
+-------------
+
+With CONFIG_ZRAM_MULTI_COMP, zram can recompress idle/huge pages using an
+alternative (secondary) compression algorithm. The basic idea is that the
+alternative compression algorithm can provide a better compression ratio
+at the price of (potentially) slower compression/decompression speeds.
+The alternative compression algorithm can, for example, be more successful at
+compressing huge pages (those that the default algorithm failed to compress).
+Another application is idle page recompression: pages that are cold and
+sit in memory can be recompressed using a more effective algorithm and,
+hence, reduce zsmalloc memory usage.
+
+With CONFIG_ZRAM_MULTI_COMP, zram will set up two compression algorithms
+per-CPU: a primary and a secondary one. The primary zram compressor is
+explained in "3) Select compression algorithm"; the secondary algorithm is
+configured in a similar way, using the recomp_algorithm device attribute:
+
+Examples::
+
+ #show supported recompression algorithms
+ cat /sys/block/zramX/recomp_algorithm
+ zstd [lzo]
+
+ #select zstd recompression algorithm
+ echo zstd > /sys/block/zramX/recomp_algorithm
+
+Another device attribute that CONFIG_ZRAM_MULTI_COMP enables is recompress,
+which controls recompression:
+
+Examples::
+
+ #IDLE pages recompression is activated by `idle` mode
+ echo idle > /sys/block/zramX/recompress
+
+ #HUGE pages recompression is activated by `huge` mode
+ echo huge > /sys/block/zramX/recompress
+
+ #HUGE_IDLE pages recompression is activated by `huge_idle` mode
+ echo huge_idle > /sys/block/zramX/recompress
+
+The number of idle pages can be significant, so user-space can pass a size
+watermark value (in bytes) to the recompress knob, to filter out idle pages
+for recompression: zram will recompress only idle pages of equal or greater
+size::
+
+ #recompress idle pages larger than 3000 bytes
+ echo 3000 > /sys/block/zramX/recompress
+
+ #recompress idle pages larger than 2000 bytes
+ echo 2000 > /sys/block/zramX/recompress
+
+Recompression is mostly focused on idle pages (except for huge pages
+recompression), so it works better in conjunction with memory tracking.
+
memory tracking
===============
@@ -411,9 +466,10 @@ pages of the process with*pagemap.
If you enable the feature, you could see block state via
/sys/kernel/debug/zram/zram0/block_state". The output is as follows::
- 300 75.033841 .wh.
- 301 63.806904 s...
- 302 63.806919 ..hi
+ 300 75.033841 .wh..
+ 301 63.806904 s....
+ 302 63.806919 ..hi.
+ 303 62.801919 ....r
First column
zram's block index.
@@ -430,6 +486,8 @@ Third column
huge page
i:
idle page
+ r:
+ recompressed page (secondary compression algorithm)
First line of above example says 300th block is accessed at 75.033841sec
and the block's state is huge so it is written back to the backing
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index c9374009b076..d0291c334a90 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -176,6 +176,7 @@ core_pattern
%f executable filename
%E executable path
%c maximum size of core file by resource limit RLIMIT_CORE
+ %C CPU the task ran on
%<OTHER> both are dropped
======== ==========================================
diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst
index 17779a2772e5..5f6454b9dbd4 100644
--- a/Documentation/fault-injection/fault-injection.rst
+++ b/Documentation/fault-injection/fault-injection.rst
@@ -83,9 +83,7 @@ configuration of fault-injection capabilities.
- /sys/kernel/debug/fail*/times:
specifies how many times failures may happen at most. A value of -1
- means "no limit". Note, though, that this file only accepts unsigned
- values. So, if you want to specify -1, you better use 'printf' instead
- of 'echo', e.g.: $ printf %#x -1 > times
+ means "no limit".
- /sys/kernel/debug/fail*/space:
@@ -284,7 +282,7 @@ Application Examples
echo Y > /sys/kernel/debug/$FAILTYPE/task-filter
echo 10 > /sys/kernel/debug/$FAILTYPE/probability
echo 100 > /sys/kernel/debug/$FAILTYPE/interval
- printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times
+ echo -1 > /sys/kernel/debug/$FAILTYPE/times
echo 0 > /sys/kernel/debug/$FAILTYPE/space
echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
@@ -338,7 +336,7 @@ Application Examples
echo N > /sys/kernel/debug/$FAILTYPE/task-filter
echo 10 > /sys/kernel/debug/$FAILTYPE/probability
echo 100 > /sys/kernel/debug/$FAILTYPE/interval
- printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times
+ echo -1 > /sys/kernel/debug/$FAILTYPE/times
echo 0 > /sys/kernel/debug/$FAILTYPE/space
echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
@@ -369,7 +367,7 @@ Application Examples
echo N > /sys/kernel/debug/$FAILTYPE/task-filter
echo 100 > /sys/kernel/debug/$FAILTYPE/probability
echo 0 > /sys/kernel/debug/$FAILTYPE/interval
- printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times
+ echo -1 > /sys/kernel/debug/$FAILTYPE/times
echo 0 > /sys/kernel/debug/$FAILTYPE/space
echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 898c99eae8e4..ec6cfdf1796a 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -47,6 +47,7 @@ fixes/update part 1.1 Stefani Seibold <stefani@seibold.net> June 9 2009
3.10 /proc/<pid>/timerslack_ns - Task timerslack value
3.11 /proc/<pid>/patch_state - Livepatch patch operation state
3.12 /proc/<pid>/arch_status - Task architecture specific information
+ 3.13 /proc/<pid>/fd - List of symlinks to open files
4 Configuring procfs
4.1 Mount options
@@ -2149,6 +2150,22 @@ AVX512_elapsed_ms
the task is unlikely an AVX512 user, but depends on the workload and the
scheduling scenario, it also could be a false negative mentioned above.
+3.13 /proc/<pid>/fd - List of symlinks to open files
+-------------------------------------------------------
+This directory contains symbolic links which represent open files
+the process is maintaining. Example output::
+
+ lr-x------ 1 root root 64 Sep 20 17:53 0 -> /dev/null
+ l-wx------ 1 root root 64 Sep 20 17:53 1 -> /dev/null
+ lrwx------ 1 root root 64 Sep 20 17:53 10 -> 'socket:[12539]'
+ lrwx------ 1 root root 64 Sep 20 17:53 11 -> 'socket:[12540]'
+ lrwx------ 1 root root 64 Sep 20 17:53 12 -> 'socket:[12542]'
+
+The number of open files for the process is stored in the 'size' member
+of the stat() output for /proc/<pid>/fd for fast access.
+
+
+
Chapter 4: Configuring procfs
=============================
diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst
index 6a1fadf3e173..e38e9d83c1c7 100644
--- a/Documentation/mm/balance.rst
+++ b/Documentation/mm/balance.rst
@@ -6,7 +6,7 @@ Memory Balancing
Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
-Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
+Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as
well as for non __GFP_IO allocations.
The first reason why a caller may avoid reclaim is that the caller can not
diff --git a/MAINTAINERS b/MAINTAINERS
index 3bff657e0d28..77a3b42f3fc6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12121,7 +12121,7 @@ M: Alexey Kodanev <alexey.kodanev@oracle.com>
L: ltp@lists.linux.it (subscribers-only)
S: Maintained
W: http://linux-test-project.github.io/
-T: git git://github.com/linux-test-project/ltp.git
+T: git https://github.com/linux-test-project/ltp.git
LYNX 28G SERDES PHY DRIVER
M: Ioana Ciornei <ioana.ciornei@nxp.com>
@@ -13325,10 +13325,20 @@ F: include/linux/memory_hotplug.h
F: include/linux/mm.h
F: include/linux/mmzone.h
F: include/linux/pagewalk.h
-F: include/linux/vmalloc.h
F: mm/
F: tools/testing/selftests/vm/
+VMALLOC
+M: Andrew Morton <akpm@linux-foundation.org>
+R: Uladzislau Rezki <urezki@gmail.com>
+R: Christoph Hellwig <hch@infradead.org>
+L: linux-mm@kvack.org
+S: Maintained
+W: http://www.linux-mm.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+F: include/linux/vmalloc.h
+F: mm/vmalloc.c
+
MEMORY HOT(UN)PLUG
M: David Hildenbrand <david@redhat.com>
M: Oscar Salvador <osalvador@suse.de>
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 3ea9661c09ff..9e45f6735d5d 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -313,8 +313,6 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
#define pte_ERROR(e) \
printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
#define pmd_ERROR(e) \
diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h
index b23be557403e..515e82db519f 100644
--- a/arch/arc/include/asm/pgtable-bits-arcv2.h
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@@ -120,8 +120,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#include <asm/hugepage.h>
#endif
diff --git a/arch/arc/kernel/ptrace.c b/arch/arc/kernel/ptrace.c
index da7542cea0d8..2abdcd9b09e8 100644
--- a/arch/arc/kernel/ptrace.c
+++ b/arch/arc/kernel/ptrace.c
@@ -185,7 +185,7 @@ static int genregs_set(struct task_struct *target,
#define REG_IGNORE_ONE(LOC) \
if (!ret) \
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, \
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, \
offsetof(struct user_regs_struct, LOC), \
offsetof(struct user_regs_struct, LOC) + 4);
diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h
index d16aba48fa0a..25d8c7bb07e0 100644
--- a/arch/arm/include/asm/pgtable-nommu.h
+++ b/arch/arm/include/asm/pgtable-nommu.h
@@ -21,8 +21,6 @@
#define pgd_none(pgd) (0)
#define pgd_bad(pgd) (0)
#define pgd_clear(pgdp)
-#define kern_addr_valid(addr) (1)
-/* FIXME */
/*
* PMD_SHIFT determines the size of the area a second-level page table can map
* PGDIR_SHIFT determines what a third-level page table entry can map
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 78a532068fec..00954ab1a039 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -298,10 +298,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
*/
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-/* FIXME: this is not correct */
-#define kern_addr_valid(addr) (1)
-
/*
* We provide our own arch_get_unmapped_area to cope with VIPT caches.
*/
diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c
index f567032a09c0..a2e9ac763a9f 100644
--- a/arch/arm/kernel/machine_kexec.c
+++ b/arch/arm/kernel/machine_kexec.c
@@ -73,7 +73,7 @@ void machine_kexec_cleanup(struct kimage *image)
{
}
-void machine_crash_nonpanic_core(void *unused)
+static void machine_crash_nonpanic_core(void *unused)
{
struct pt_regs regs;
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index bfe88c6e60d5..2d8e2516906b 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -651,11 +651,9 @@ static int vfp_set(struct task_struct *target,
if (ret)
return ret;
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- user_fpregs_offset + sizeof(new_vfp.fpregs),
- user_fpscr_offset);
- if (ret)
- return ret;
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ user_fpregs_offset + sizeof(new_vfp.fpregs),
+ user_fpscr_offset);
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&new_vfp.fpscr,
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 71a1af42f0e8..4873c1d6e7d0 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1021,8 +1021,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
*/
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
-extern int kern_addr_valid(unsigned long addr);
-
#ifdef CONFIG_ARM64_MTE
#define __HAVE_ARCH_PREPARE_TO_SWAP
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index c2fb5755bbec..f3af3371280a 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -514,9 +514,7 @@ static int hw_break_set(struct task_struct *target,
/* Resource info and pad */
offset = offsetof(struct user_hwdebug_state, dbg_regs);
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset);
- if (ret)
- return ret;
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset);
/* (address, ctrl) registers */
limit = regset->n * regset->size;
@@ -543,11 +541,8 @@ static int hw_break_set(struct task_struct *target,
return ret;
offset += PTRACE_HBP_CTRL_SZ;
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- offset,
- offset + PTRACE_HBP_PAD_SZ);
- if (ret)
- return ret;
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ offset, offset + PTRACE_HBP_PAD_SZ);
offset += PTRACE_HBP_PAD_SZ;
idx++;
}
@@ -954,10 +949,7 @@ static int sve_set_common(struct task_struct *target,
start = end;
end = SVE_PT_SVE_FPSR_OFFSET(vq);
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- start, end);
- if (ret)
- goto out;
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, start, end);
/*
* Copy fpsr, and fpcr which must follow contiguously in
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 9a7c38965154..556154d821bf 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -814,53 +814,6 @@ void __init paging_init(void)
create_idmap();
}
-/*
- * Check whether a kernel address is valid (derived from arch/x86/).
- */
-int kern_addr_valid(unsigned long addr)
-{
- pgd_t *pgdp;
- p4d_t *p4dp;
- pud_t *pudp, pud;
- pmd_t *pmdp, pmd;
- pte_t *ptep, pte;
-
- addr = arch_kasan_reset_tag(addr);
- if ((((long)addr) >> VA_BITS) != -1UL)
- return 0;
-
- pgdp = pgd_offset_k(addr);
- if (pgd_none(READ_ONCE(*pgdp)))
- return 0;
-
- p4dp = p4d_offset(pgdp, addr);
- if (p4d_none(READ_ONCE(*p4dp)))
- return 0;
-
- pudp = pud_offset(p4dp, addr);
- pud = READ_ONCE(*pudp);
- if (pud_none(pud))
- return 0;
-
- if (pud_sect(pud))
- return pfn_valid(pud_pfn(pud));
-
- pmdp = pmd_offset(pudp, addr);
- pmd = READ_ONCE(*pmdp);
- if (pmd_none(pmd))
- return 0;
-
- if (pmd_sect(pmd))
- return pfn_valid(pmd_pfn(pmd));
-
- ptep = pte_offset_kernel(pmdp, addr);
- pte = READ_ONCE(*ptep);
- if (pte_none(pte))
- return 0;
-
- return pfn_valid(pte_pfn(pte));
-}
-
#ifdef CONFIG_MEMORY_HOTPLUG
static void free_hotplug_page_range(struct page *page, size_t size,
struct vmem_altmap *altmap)
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index d107c3d434e2..0a741a910a6a 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -201,8 +201,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
/*
* This function is used to determine if a linear map page has been marked as
- * not-valid. Walk the page table and check the PTE_VALID bit. This is based
- * on kern_addr_valid(), which almost does what we need.
+ * not-valid. Walk the page table and check the PTE_VALID bit.
*
* Because this is only called on the kernel linear map, p?d_sect() implies
* p?d_present(). When debug_pagealloc is enabled, sections mappings are
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index c3d9b92cbe61..77bc6caff2d2 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -249,9 +249,6 @@ extern void paging_init(void);
void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
pte_t *pte);
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-#define kern_addr_valid(addr) (1)
-
#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
remap_pfn_range(vma, vaddr, pfn, size, prot)
diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h
index 7cbf719c578e..d7d4f9fca327 100644
--- a/arch/hexagon/include/asm/page.h
+++ b/arch/hexagon/include/asm/page.h
@@ -131,13 +131,6 @@ static inline void clear_page(void *page)
#define page_to_virt(page) __va(page_to_phys(page))
-/*
- * For port to Hexagon Virtual Machine, MAYBE we check for attempts
- * to reference reserved HVM space, but in any case, the VM will be
- * protected.
- */
-#define kern_addr_valid(addr) (1)
-
#include <asm/mem-layout.h>
#include <asm-generic/memory_model.h>
/* XXX Todo: implement assembly-optimized version of getorder. */
diff --git a/arch/hexagon/kernel/ptrace.c b/arch/hexagon/kernel/ptrace.c
index 8975f9b4cedf..125f19995b76 100644
--- a/arch/hexagon/kernel/ptrace.c
+++ b/arch/hexagon/kernel/ptrace.c
@@ -115,10 +115,9 @@ static int genregs_set(struct task_struct *target,
/* Ignore the rest, if needed */
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- offsetof(struct user_regs_struct, pad1), -1);
-
- if (ret)
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ offsetof(struct user_regs_struct, pad1), -1);
+ else
return ret;
/*
diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h
index ce66dfc0e719..83a492c8d298 100644
--- a/arch/ia64/include/asm/io.h
+++ b/arch/ia64/include/asm/io.h
@@ -23,10 +23,6 @@
#include <asm/unaligned.h>
#include <asm/early_ioremap.h>
-/* We don't use IO slowdowns on the ia64, but.. */
-#define __SLOW_DOWN_IO do { } while (0)
-#define SLOW_DOWN_IO do { } while (0)
-
#define __IA64_UNCACHED_OFFSET RGN_BASE(RGN_UNCACHED)
/*
diff --git a/arch/ia64/include/asm/kprobes.h b/arch/ia64/include/asm/kprobes.h
index c5cf5e4fb338..9e956768946c 100644
--- a/arch/ia64/include/asm/kprobes.h
+++ b/arch/ia64/include/asm/kprobes.h
@@ -110,8 +110,6 @@ extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
extern int kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data);
-extern void invalidate_stacked_regs(void);
-extern void flush_register_stack(void);
extern void arch_remove_kprobe(struct kprobe *p);
#endif /* CONFIG_KPROBES */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 6925e28ae61d..01517a5e6778 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -182,22 +182,6 @@ ia64_phys_addr_valid (unsigned long addr)
}
/*
- * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel
- * memory. For the return value to be meaningful, ADDR must be >=
- * PAGE_OFFSET. This operation can be relatively expensive (e.g.,
- * require a hash-, or multi-level tree-lookup or something of that
- * sort) but it guarantees to return TRUE only if accessing the page
- * at that address does not cause an error. Note that there may be
- * addresses for which kern_addr_valid() returns FALSE even though an
- * access would not cause an error (e.g., this is typically true for
- * memory mapped I/O regions.
- *
- * XXX Need to implement this for IA-64.
- */
-#define kern_addr_valid(addr) (1)
-
-
-/*
* Now come the defines and routines to manage and access the three-level
* page table.
*/
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index ab8aeb34d1d9..4c41912c550f 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1481,12 +1481,10 @@ static void do_gpregs_set(struct unw_frame_info *info, void *arg)
return;
/* Skip r0 */
if (dst->pos < ELF_GR_OFFSET(1)) {
- dst->ret = user_regset_copyin_ignore(&dst->pos, &dst->count,
- &dst->u.set.kbuf,
- &dst->u.set.ubuf,
- 0, ELF_GR_OFFSET(1));
- if (dst->ret)
- return;
+ user_regset_copyin_ignore(&dst->pos, &dst->count,
+ &dst->u.set.kbuf, &dst->u.set.ubuf,
+ 0, ELF_GR_OFFSET(1));
+ dst->ret = 0;
}
while (dst->count && dst->pos < ELF_AR_END_OFFSET) {
@@ -1560,11 +1558,11 @@ static void do_fpregs_set(struct unw_frame_info *info, void *arg)
/* Skip pos 0 and 1 */
if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(2)) {
- dst->ret = user_regset_copyin_ignore(&dst->pos, &dst->count,
- &dst->u.set.kbuf,
- &dst->u.set.ubuf,
- 0, ELF_FP_OFFSET(2));
- if (dst->count == 0 || dst->ret)
+ user_regset_copyin_ignore(&dst->pos, &dst->count,
+ &dst->u.set.kbuf, &dst->u.set.ubuf,
+ 0, ELF_FP_OFFSET(2));
+ dst->ret = 0;
+ if (dst->count == 0)
return;
}
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
index 215bf3f8cb20..f6a502e8f02c 100644
--- a/arch/ia64/kernel/sys_ia64.c
+++ b/arch/ia64/kernel/sys_ia64.c
@@ -140,7 +140,7 @@ asmlinkage unsigned long
sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff)
{
addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
- if (!IS_ERR((void *) addr))
+ if (!IS_ERR_VALUE(addr))
force_successful_syscall_return();
return addr;
}
@@ -152,7 +152,7 @@ sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, lo
return -EINVAL;
addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
- if (!IS_ERR((void *) addr))
+ if (!IS_ERR_VALUE(addr))
force_successful_syscall_return();
return addr;
}
@@ -162,7 +162,7 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u
unsigned long new_addr)
{
addr = sys_mremap(addr, old_len, new_len, flags, new_addr);
- if (!IS_ERR((void *) addr))
+ if (!IS_ERR_VALUE(addr))
force_successful_syscall_return();
return addr;
}
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index f993cb36c062..380d2f3966c9 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -91,21 +91,6 @@ int prepare_hugepage_range(struct file *file,
return 0;
}
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
-{
- struct page *page;
- pte_t *ptep;
-
- if (REGION_NUMBER(addr) != RGN_HPAGE)
- return ERR_PTR(-EINVAL);
-
- ptep = huge_pte_offset(mm, addr, HPAGE_SIZE);
- if (!ptep || pte_none(*ptep))
- return NULL;
- page = pte_page(*ptep);
- page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
- return page;
-}
int pmd_huge(pmd_t pmd)
{
return 0;
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 946704bee599..fc70b7041b76 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -421,8 +421,6 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
__update_tlb(vma, address, (pte_t *)pmdp);
}
-#define kern_addr_valid(addr) (1)
-
static inline unsigned long pmd_pfn(pmd_t pmd)
{
return (pmd_val(pmd) & _PFN_MASK) >> _PFN_SHIFT;
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index 9b4e2fe2ac82..b93c41fe2067 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -145,8 +145,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
#endif /* !__ASSEMBLY__ */
-#define kern_addr_valid(addr) (1)
-
/* MMU-specific headers */
#ifdef CONFIG_SUN3
diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h
index bce5ca56c388..fed58da3a6b6 100644
--- a/arch/m68k/include/asm/pgtable_no.h
+++ b/arch/m68k/include/asm/pgtable_no.h
@@ -20,7 +20,6 @@
#define pgd_none(pgd) (0)
#define pgd_bad(pgd) (0)
#define pgd_clear(pgdp)
-#define kern_addr_valid(addr) (1)
#define pmd_offset(a, b) ((void *)0)
#define PAGE_NONE __pgprot(0)
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index ba348e997dbb..42f5988e998b 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -416,9 +416,6 @@ extern unsigned long iopa(unsigned long addr);
#define IOMAP_NOCACHE_NONSER 2
#define IOMAP_NO_COPYBACK 3
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-#define kern_addr_valid(addr) (1)
-
void do_page_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code);
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 6caec386ad2f..364a06033105 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -550,8 +550,6 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
__update_tlb(vma, address, pte);
}
-#define kern_addr_valid(addr) (1)
-
/*
* Allow physical addresses to be fixed up to help 36-bit peripherals.
*/
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 567aec4abac0..d9df543f7e2c 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -531,10 +531,11 @@ static int fpr_set(struct task_struct *target,
ptrace_setfcr31(target, fcr31);
}
- if (count > 0)
- err = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- fir_pos,
- fir_pos + sizeof(u32));
+ if (count > 0) {
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ fir_pos, fir_pos + sizeof(u32));
+ return 0;
+ }
return err;
}
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index b3d45e815295..ab793bc517f5 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -249,8 +249,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
#define __swp_entry_to_pte(swp) ((pte_t) { (swp).val })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
-#define kern_addr_valid(addr) (1)
-
extern void __init paging_init(void);
extern void __init mmu_init(void);
diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h
index 8916d93d5c2d..eb44130364a9 100644
--- a/arch/nios2/include/asm/processor.h
+++ b/arch/nios2/include/asm/processor.h
@@ -50,9 +50,6 @@ struct thread_struct {
unsigned long kpsr;
};
-#define INIT_MMAP \
- { &init_mm, (0), (0), __pgprot(0x0), VM_READ | VM_WRITE | VM_EXEC }
-
# define INIT_THREAD { \
.kregs = NULL, \
.ksp = 0, \
diff --git a/arch/nios2/kernel/ptrace.c b/arch/nios2/kernel/ptrace.c
index cd62f310778b..9221c15972e6 100644
--- a/arch/nios2/kernel/ptrace.c
+++ b/arch/nios2/kernel/ptrace.c
@@ -54,7 +54,7 @@ static int genregs_set(struct task_struct *target,
#define REG_IGNORE_RANGE(START, END) \
if (!ret) \
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, \
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, \
START * 4, (END * 4) + 4);
#define REG_IN_ONE(PTR, LOC) \
@@ -80,8 +80,8 @@ static int genregs_set(struct task_struct *target,
REG_IN_ONE(&regs->ra, PTR_RA);
REG_IN_ONE(&regs->ea, PTR_PC); /* use ea for PC */
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- PTR_STATUS * 4, -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ PTR_STATUS * 4, -1);
return ret;
}
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index dcae8aea132f..6477c17b3062 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -395,8 +395,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
typedef pte_t *pte_addr_t;
#endif /* __ASSEMBLY__ */
diff --git a/arch/openrisc/kernel/ptrace.c b/arch/openrisc/kernel/ptrace.c
index b971740fc2aa..85ace93fc251 100644
--- a/arch/openrisc/kernel/ptrace.c
+++ b/arch/openrisc/kernel/ptrace.c
@@ -66,10 +66,9 @@ static int genregs_set(struct task_struct *target,
int ret;
/* ignore r0 */
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, 4);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, 4);
/* r1 - r31 */
- if (!ret)
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
regs->gpr+1, 4, 4*32);
/* PC */
if (!ret)
@@ -80,8 +79,7 @@ static int genregs_set(struct task_struct *target,
* the Supervision register
*/
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- 4*33, -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 4*33, -1);
return ret;
}
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index ecd028854469..bd09a44cfb2d 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -23,21 +23,6 @@
#include <asm/processor.h>
#include <asm/cache.h>
-/*
- * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel
- * memory. For the return value to be meaningful, ADDR must be >=
- * PAGE_OFFSET. This operation can be relatively expensive (e.g.,
- * require a hash-, or multi-level tree-lookup or something of that
- * sort) but it guarantees to return TRUE only if accessing the page
- * at that address does not cause an error. Note that there may be
- * addresses for which kern_addr_valid() returns FALSE even though an
- * access would not cause an error (e.g., this is typically true for
- * memory mapped I/O regions.
- *
- * XXX Need to implement this for parisc.
- */
-#define kern_addr_valid(addr) (1)
-
/* This is for the serialization of PxTLB broadcasts. At least on the N class
* systems, only one PxTLB inter processor broadcast can be active at any one
* time on the Merced bus. */
diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c
index e391b175f5ec..80943a00e245 100644
--- a/arch/parisc/kernel/pdt.c
+++ b/arch/parisc/kernel/pdt.c
@@ -18,8 +18,7 @@
#include <linux/kthread.h>
#include <linux/initrd.h>
#include <linux/pgtable.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/mm.h>
#include <asm/pdc.h>
#include <asm/pdcpat.h>
@@ -232,7 +231,7 @@ void __init pdc_pdt_init(void)
/* mark memory page bad */
memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(addr >> PAGE_SHIFT);
}
}
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index 96ef6a6b66e5..69c62933e952 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -424,8 +424,9 @@ static int fpr_set(struct task_struct *target,
ubuf = u;
pos *= sizeof(reg);
count *= sizeof(reg);
- return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- ELF_NFPREG * sizeof(reg), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ ELF_NFPREG * sizeof(reg), -1);
+ return 0;
}
#define RI(reg) (offsetof(struct user_regs_struct,reg) / sizeof(long))
@@ -543,8 +544,9 @@ static int gpr_set(struct task_struct *target,
ubuf = u;
pos *= sizeof(reg);
count *= sizeof(reg);
- return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- ELF_NGREG * sizeof(reg), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ ELF_NGREG * sizeof(reg), -1);
+ return 0;
}
static const struct user_regset native_regsets[] = {
@@ -606,8 +608,9 @@ static int gpr32_set(struct task_struct *target,
ubuf = u;
pos *= sizeof(reg);
count *= sizeof(reg);
- return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- ELF_NGREG * sizeof(reg), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ ELF_NGREG * sizeof(reg), -1);
+ return 0;
}
/*
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 283f40d05a4d..9972626ddaf6 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -81,13 +81,6 @@ void poking_init(void);
extern unsigned long ioremap_bot;
extern const pgprot_t protection_map[16];
-/*
- * kern_addr_valid is intended to indicate whether an address is a valid
- * kernel address. Most 32-bit archs define it as always true (like this)
- * but most 64-bit archs actually perform a test. What should we do here?
- */
-#define kern_addr_valid(addr) (1)
-
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_large(pmd) 0
#endif
diff --git a/arch/powerpc/kernel/ptrace/ptrace-tm.c b/arch/powerpc/kernel/ptrace/ptrace-tm.c
index 44045363a903..210ea834e603 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-tm.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-tm.c
@@ -170,9 +170,9 @@ int tm_cgpr_set(struct task_struct *target, const struct user_regset *regset,
(PT_MAX_PUT_REG + 1) * sizeof(reg));
if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- (PT_MAX_PUT_REG + 1) * sizeof(reg),
- PT_TRAP * sizeof(reg));
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ (PT_MAX_PUT_REG + 1) * sizeof(reg),
+ PT_TRAP * sizeof(reg));
if (!ret && count > 0) {
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &reg,
@@ -183,8 +183,8 @@ int tm_cgpr_set(struct task_struct *target, const struct user_regset *regset,
}
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- (PT_TRAP + 1) * sizeof(reg), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ (PT_TRAP + 1) * sizeof(reg), -1);
return ret;
}
diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c
index 076d867412c7..2087a785f05f 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-view.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-view.c
@@ -267,9 +267,9 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset,
(PT_MAX_PUT_REG + 1) * sizeof(reg));
if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- (PT_MAX_PUT_REG + 1) * sizeof(reg),
- PT_TRAP * sizeof(reg));
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ (PT_MAX_PUT_REG + 1) * sizeof(reg),
+ PT_TRAP * sizeof(reg));
if (!ret && count > 0) {
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &reg,
@@ -280,8 +280,8 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset,
}
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- (PT_TRAP + 1) * sizeof(reg), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ (PT_TRAP + 1) * sizeof(reg), -1);
return ret;
}
@@ -706,8 +706,9 @@ int gpr32_set_common(struct task_struct *target,
ubuf = u;
pos *= sizeof(reg);
count *= sizeof(reg);
- return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- (PT_TRAP + 1) * sizeof(reg), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ (PT_TRAP + 1) * sizeof(reg), -1);
+ return 0;
Efault:
user_read_access_end();
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 349a781cea0b..60e12b716d3c 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -35,7 +35,7 @@ struct umem_info {
/* usable memory ranges to look up */
unsigned int nr_ranges;
- const struct crash_mem_range *ranges;
+ const struct range *ranges;
};
const struct kexec_file_ops * const kexec_file_loaders[] = {
diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c
index 563e9989a5bf..5fc53a5fcfdf 100644
--- a/arch/powerpc/kexec/ranges.c
+++ b/arch/powerpc/kexec/ranges.c
@@ -33,7 +33,7 @@
static inline unsigned int get_max_nr_ranges(size_t size)
{
return ((size - sizeof(struct crash_mem)) /
- sizeof(struct crash_mem_range));
+ sizeof(struct range));
}
/**
@@ -51,7 +51,7 @@ static inline size_t get_mem_rngs_size(struct crash_mem *mem_rngs)
return 0;
size = (sizeof(struct crash_mem) +
- (mem_rngs->max_nr_ranges * sizeof(struct crash_mem_range)));
+ (mem_rngs->max_nr_ranges * sizeof(struct range)));
/*
* Memory is allocated in size multiple of MEM_RANGE_CHUNK_SZ.
@@ -98,7 +98,7 @@ static int __add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
*/
static void __merge_memory_ranges(struct crash_mem *mem_rngs)
{
- struct crash_mem_range *ranges;
+ struct range *ranges;
int i, idx;
if (!mem_rngs)
@@ -123,7 +123,7 @@ static void __merge_memory_ranges(struct crash_mem *mem_rngs)
/* cmp_func_t callback to sort ranges with sort() */
static int rngcmp(const void *_x, const void *_y)
{
- const struct crash_mem_range *x = _x, *y = _y;
+ const struct range *x = _x, *y = _y;
if (x->start > y->start)
return 1;
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index e2f11f9c3f2a..2ea59396f608 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -689,12 +689,14 @@ unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm)
*/
static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
{
- struct page *dpage = NULL;
+ struct dev_pagemap *pgmap = &kvmppc_uvmem_pgmap;
unsigned long bit, uvmem_pfn;
struct kvmppc_uvmem_page_pvt *pvt;
unsigned long pfn_last, pfn_first;
+ struct folio *folio;
+ struct page *dpage;
- pfn_first = kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT;
+ pfn_first = pgmap->range.start >> PAGE_SHIFT;
pfn_last = pfn_first +
(range_len(&kvmppc_uvmem_pgmap.range) >> PAGE_SHIFT);
@@ -716,9 +718,11 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
pvt->gpa = gpa;
pvt->kvm = kvm;
- dpage = pfn_to_page(uvmem_pfn);
+ folio = pgmap_request_folio(pgmap,
+ pfn_to_pgmap_offset(pgmap, uvmem_pfn), 0);
+ dpage = &folio->page;
dpage->zone_device_data = pvt;
- zone_device_page_init(dpage);
+ lock_page(dpage);
return dpage;
out_clear:
spin_lock(&kvmppc_uvmem_bitmap_lock);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 5852a86d990d..f1ba8d1e8c1a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -506,43 +506,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
} while (addr = next, addr != end);
}
-struct page *follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd,
- int flags, int pdshift)
-{
- pte_t *ptep;
- spinlock_t *ptl;
- struct page *page = NULL;
- unsigned long mask;
- int shift = hugepd_shift(hpd);
- struct mm_struct *mm = vma->vm_mm;
-
-retry:
- /*
- * hugepage directory entries are protected by mm->page_table_lock
- * Use this instead of huge_pte_lockptr
- */
- ptl = &mm->page_table_lock;
- spin_lock(ptl);
-
- ptep = hugepte_offset(hpd, address, pdshift);
- if (pte_present(*ptep)) {
- mask = (1UL << shift) - 1;
- page = pte_page(*ptep);
- page += ((address & mask) >> PAGE_SHIFT);
- if (flags & FOLL_GET)
- get_page(page);
- } else {
- if (is_hugetlb_entry_migration(*ptep)) {
- spin_unlock(ptl);
- __migration_entry_wait(mm, ptep, ptl);
- goto retry;
- }
- }
- spin_unlock(ptl);
- return page;
-}
-
bool __init arch_hugetlb_valid_size(unsigned long size)
{
int shift = __ffs(size);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 7ec936910a96..c7993bdf749f 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -801,8 +801,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
#endif /* !CONFIG_MMU */
-#define kern_addr_valid(addr) (1) /* FIXME */
-
extern char _start[];
extern void *_dtb_early_va;
extern uintptr_t _dtb_early_pa;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index f1cb9391190d..e1db07211818 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1773,8 +1773,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
extern int vmem_add_mapping(unsigned long start, unsigned long size);
extern void vmem_remove_mapping(unsigned long start, unsigned long size);
extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc);
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 6fb9ec54cf9b..3ce30becf6df 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -92,8 +92,6 @@ static inline unsigned long phys_addr_mask(void)
typedef pte_t *pte_addr_t;
-#define kern_addr_valid(addr) (1)
-
#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT)))
struct vm_area_struct;
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index d417988d9770..36f50ad81e83 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -157,8 +157,8 @@ static int genregs_set(struct task_struct *target,
offsetof(struct pt_regs, pc),
sizeof(struct pt_regs));
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- sizeof(struct pt_regs), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ sizeof(struct pt_regs), -1);
return ret;
}
@@ -229,8 +229,8 @@ static int dspregs_set(struct task_struct *target,
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, regs,
0, sizeof(struct pt_dspregs));
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- sizeof(struct pt_dspregs), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ sizeof(struct pt_dspregs), -1);
return ret;
}
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 8ff549004fac..5acc05b572e6 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -368,12 +368,6 @@ __get_iospace (unsigned long addr)
}
}
-extern unsigned long *sparc_valid_addr_bitmap;
-
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-#define kern_addr_valid(addr) \
- (test_bit(__pa((unsigned long)(addr))>>20, sparc_valid_addr_bitmap))
-
/*
* For sparc32&64, the pfn in io_remap_pfn_range() carries <iospace> in
* its high 4 bits. These macros/functions put it there or get it from there.
diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c
index e7db48acb838..c273ccebea46 100644
--- a/arch/sparc/kernel/ptrace_32.c
+++ b/arch/sparc/kernel/ptrace_32.c
@@ -158,8 +158,9 @@ static int genregs32_set(struct task_struct *target,
35 * sizeof(u32), 36 * sizeof(u32));
if (ret || !count)
return ret;
- return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- 36 * sizeof(u32), 38 * sizeof(u32));
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 36 * sizeof(u32),
+ 38 * sizeof(u32));
+ return 0;
}
static int fpregs32_get(struct task_struct *target,
@@ -203,8 +204,8 @@ static int fpregs32_set(struct task_struct *target,
33 * sizeof(u32),
34 * sizeof(u32));
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- 34 * sizeof(u32), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ 34 * sizeof(u32), -1);
return ret;
}
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index 86a7eb5c27ba..4deba5b6eddb 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -332,8 +332,8 @@ static int genregs64_set(struct task_struct *target,
}
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- 36 * sizeof(u64), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ 36 * sizeof(u64), -1);
return ret;
}
@@ -406,8 +406,8 @@ static int fpregs64_set(struct task_struct *target,
task_thread_info(target)->fpsaved[0] = fprs;
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- 35 * sizeof(u64), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ 35 * sizeof(u64), -1);
return ret;
}
@@ -473,10 +473,8 @@ static int setregs64_set(struct task_struct *target,
15 * sizeof(u64));
if (ret)
return ret;
- ret =user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- 15 * sizeof(u64), 16 * sizeof(u64));
- if (ret)
- return ret;
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ 15 * sizeof(u64), 16 * sizeof(u64));
/* TSTATE */
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&tstate,
@@ -670,8 +668,9 @@ finish:
pos *= sizeof(reg);
count *= sizeof(reg);
- return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- 38 * sizeof(reg), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ 38 * sizeof(reg), -1);
+ return 0;
}
static int fpregs32_get(struct task_struct *target,
@@ -737,8 +736,8 @@ static int fpregs32_set(struct task_struct *target,
task_thread_info(target)->fpsaved[0] = fprs;
if (!ret)
- ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
- 34 * sizeof(u32), -1);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+ 34 * sizeof(u32), -1);
return ret;
}
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index d88e774c8eb4..9c0ea457bdf0 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -37,8 +37,7 @@
#include "mm_32.h"
-unsigned long *sparc_valid_addr_bitmap;
-EXPORT_SYMBOL(sparc_valid_addr_bitmap);
+static unsigned long *sparc_valid_addr_bitmap;
unsigned long phys_base;
EXPORT_SYMBOL(phys_base);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index d6faee23c77d..04f9db0c3111 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1667,7 +1667,6 @@ bool kern_addr_valid(unsigned long addr)
return pfn_valid(pte_pfn(*pte));
}
-EXPORT_SYMBOL(kern_addr_valid);
static unsigned long __ref kernel_map_hugepud(unsigned long vstart,
unsigned long vend,
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index 66bc3f99d9be..4e3052f2671a 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -298,8 +298,6 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr);
((swp_entry_t) { pte_val(pte_mkuptodate(pte)) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
-#define kern_addr_valid(addr) (1)
-
/* Clear a kernel PTE and flush it from the TLB */
#define kpte_clear_flush(ptep, vaddr) \
do { \
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 7c9c968a42ef..7d4ad8907297 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -48,15 +48,6 @@ do { \
#endif /* !__ASSEMBLY__ */
/*
- * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM
- */
-#ifdef CONFIG_FLATMEM
-#define kern_addr_valid(addr) (1)
-#else
-#define kern_addr_valid(kaddr) (0)
-#endif
-
-/*
* This is used to calculate the .brk reservation for initial pagetables.
* Enough space is reserved to allocate pagetables sufficient to cover all
* of LOWMEM_PAGES, which is an upper bound on the size of the direct map of
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e479491da8d5..7929327abe00 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -240,7 +240,6 @@ static inline void native_pgd_clear(pgd_t *pgd)
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
-extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
#define HAVE_ARCH_UNMAPPED_AREA
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 1ec20807de1e..6225c525372d 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -268,7 +268,7 @@ static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
unsigned long addr,
unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *entry;
entry = xa_load(&encl->page_array, PFN_DOWN(addr));
@@ -502,7 +502,7 @@ static void sgx_vma_open(struct vm_area_struct *vma)
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
unsigned long end, unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *page;
unsigned long count = 0;
int ret = 0;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3f040c6e5d13..e8db4edd7cc9 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1416,47 +1416,6 @@ void mark_rodata_ro(void)
debug_checkwx();
}
-int kern_addr_valid(unsigned long addr)
-{
- unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- if (above != 0 && above != -1UL)
- return 0;
-
- pgd = pgd_offset_k(addr);
- if (pgd_none(*pgd))
- return 0;
-
- p4d = p4d_offset(pgd, addr);
- if (!p4d_present(*p4d))
- return 0;
-
- pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
- return 0;
-
- if (pud_large(*pud))
- return pfn_valid(pud_pfn(*pud));
-
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- return 0;
-
- if (pmd_large(*pmd))
- return pfn_valid(pmd_pfn(*pmd));
-
- pte = pte_offset_kernel(pmd, addr);
- if (pte_none(*pte))
- return 0;
-
- return pfn_valid(pte_pfn(*pte));
-}
-
/*
* Block size is the minimum amount of memory which can be hotplugged or
* hotremoved. It must be power of two and must be equal or larger than
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index 54f577c13afa..5b5484d707b2 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -386,8 +386,6 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
#else
-#define kern_addr_valid(addr) (1)
-
extern void update_mmu_cache(struct vm_area_struct * vma,
unsigned long address, pte_t *ptep);
diff --git a/drivers/Makefile b/drivers/Makefile
index bdf1c66141c9..9beeee520073 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/
obj-$(CONFIG_PARPORT) += parport/
obj-y += base/ block/ misc/ mfd/ nfc/
obj-$(CONFIG_LIBNVDIMM) += nvdimm/
-obj-$(CONFIG_DAX) += dax/
+obj-y += dax/
obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
obj-$(CONFIG_NUBUS) += nubus/
obj-y += cxl/
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 23f49a2f4d14..139e3b41653e 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -767,11 +767,6 @@ static int hmat_callback(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block hmat_callback_nb = {
- .notifier_call = hmat_callback,
- .priority = 2,
-};
-
static __init void hmat_free_structures(void)
{
struct memory_target *target, *tnext;
@@ -854,7 +849,7 @@ static __init int hmat_init(void)
hmat_register_targets();
/* Keep the table and structures if the notifier may use them */
- if (!register_hotmemory_notifier(&hmat_callback_nb))
+ if (!hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI))
return 0;
out_put:
hmat_free_structures();
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 9aa0da991cfb..fe98fb8d94e5 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -175,6 +175,15 @@ int memory_notify(unsigned long val, void *v)
return blocking_notifier_call_chain(&memory_chain, val, v);
}
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+static unsigned long memblk_nr_poison(struct memory_block *mem);
+#else
+static inline unsigned long memblk_nr_poison(struct memory_block *mem)
+{
+ return 0;
+}
+#endif
+
static int memory_block_online(struct memory_block *mem)
{
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
@@ -183,6 +192,9 @@ static int memory_block_online(struct memory_block *mem)
struct zone *zone;
int ret;
+ if (memblk_nr_poison(mem))
+ return -EHWPOISON;
+
zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
start_pfn, nr_pages);
@@ -864,6 +876,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
continue;
+ num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
unregister_memory_block_under_nodes(mem);
remove_memory_block(mem);
}
@@ -1164,3 +1177,28 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
}
return ret;
}
+
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+void memblk_nr_poison_inc(unsigned long pfn)
+{
+ const unsigned long block_id = pfn_to_block_id(pfn);
+ struct memory_block *mem = find_memory_block_by_id(block_id);
+
+ if (mem)
+ atomic_long_inc(&mem->nr_hwpoison);
+}
+
+void memblk_nr_poison_sub(unsigned long pfn, long i)
+{
+ const unsigned long block_id = pfn_to_block_id(pfn);
+ struct memory_block *mem = find_memory_block_by_id(block_id);
+
+ if (mem)
+ atomic_long_sub(i, &mem->nr_hwpoison);
+}
+
+static unsigned long memblk_nr_poison(struct memory_block *mem)
+{
+ return atomic_long_read(&mem->nr_hwpoison);
+}
+#endif
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index d4100b0c083e..076a76cd1664 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -78,3 +78,58 @@ config ZRAM_MEMORY_TRACKING
/sys/kernel/debug/zram/zramX/block_state.
See Documentation/admin-guide/blockdev/zram.rst for more information.
+
+config ZRAM_MULTI_COMP
+ bool "Enable multiple per-CPU compression streams"
+ depends on ZRAM
+ help
+ This will enable per-CPU multi-compression streams, so that ZRAM
+ can re-compress IDLE/huge pages, using a potentially slower but
+ more effective compression algorithm. Note that IDLE page support
+ requires ZRAM_MEMORY_TRACKING.
+
+ echo TIMEOUT > /sys/block/zramX/idle
+ echo SIZE > /sys/block/zramX/recompress
+
+ The SIZE parameter (in bytes) sets the object size watermark: idle
+ objects smaller than this size will not get recompressed.
+
+choice
+ prompt "Default zram recompression algorithm"
+ default ZRAM_DEF_RECOMP_ZSTD
+ depends on ZRAM && ZRAM_MULTI_COMP
+
+config ZRAM_DEF_RECOMP_LZORLE
+ bool "lzo-rle"
+ depends on CRYPTO_LZO
+
+config ZRAM_DEF_RECOMP_ZSTD
+ bool "zstd"
+ depends on CRYPTO_ZSTD
+
+config ZRAM_DEF_RECOMP_LZ4
+ bool "lz4"
+ depends on CRYPTO_LZ4
+
+config ZRAM_DEF_RECOMP_LZO
+ bool "lzo"
+ depends on CRYPTO_LZO
+
+config ZRAM_DEF_RECOMP_LZ4HC
+ bool "lz4hc"
+ depends on CRYPTO_LZ4HC
+
+config ZRAM_DEF_RECOMP_842
+ bool "842"
+ depends on CRYPTO_842
+
+endchoice
+
+config ZRAM_DEF_RECOMP
+ string
+ default "lzo-rle" if ZRAM_DEF_RECOMP_LZORLE
+ default "zstd" if ZRAM_DEF_RECOMP_ZSTD
+ default "lz4" if ZRAM_DEF_RECOMP_LZ4
+ default "lzo" if ZRAM_DEF_RECOMP_LZO
+ default "lz4hc" if ZRAM_DEF_RECOMP_LZ4HC
+ default "842" if ZRAM_DEF_RECOMP_842
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 0916de952e09..55af4efd7983 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -206,7 +206,7 @@ void zcomp_destroy(struct zcomp *comp)
* case of allocation error, or any other error potentially
* returned by zcomp_init().
*/
-struct zcomp *zcomp_create(const char *compress)
+struct zcomp *zcomp_create(const char *alg)
{
struct zcomp *comp;
int error;
@@ -216,14 +216,14 @@ struct zcomp *zcomp_create(const char *compress)
* is not loaded yet. We must do it here, otherwise we are about to
* call /sbin/modprobe under CPU hot-plug lock.
*/
- if (!zcomp_available_algorithm(compress))
+ if (!zcomp_available_algorithm(alg))
return ERR_PTR(-EINVAL);
comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
if (!comp)
return ERR_PTR(-ENOMEM);
- comp->name = compress;
+ comp->name = alg;
error = zcomp_init(comp);
if (error) {
kfree(comp);
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index 40f6420f4b2e..cdefdef93da8 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -27,7 +27,7 @@ int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node);
ssize_t zcomp_available_show(const char *comp, char *buf);
bool zcomp_available_algorithm(const char *comp);
-struct zcomp *zcomp_create(const char *comp);
+struct zcomp *zcomp_create(const char *alg);
void zcomp_destroy(struct zcomp *comp);
struct zcomp_strm *zcomp_stream_get(struct zcomp *comp);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 966aab902d19..364323713393 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -41,7 +41,12 @@ static DEFINE_IDR(zram_index_idr);
static DEFINE_MUTEX(zram_index_mutex);
static int zram_major;
-static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
+static const char *default_comp_algs[ZRAM_MAX_ZCOMPS] = {
+ CONFIG_ZRAM_DEF_COMP,
+#ifdef CONFIG_ZRAM_MULTI_COMP
+ CONFIG_ZRAM_DEF_RECOMP,
+#endif
+};
/* Module params (documentation at end) */
static unsigned int num_devices = 1;
@@ -188,16 +193,13 @@ static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
static inline void update_used_max(struct zram *zram,
const unsigned long pages)
{
- unsigned long old_max, cur_max;
-
- old_max = atomic_long_read(&zram->stats.max_used_pages);
+ unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages);
do {
- cur_max = old_max;
- if (pages > cur_max)
- old_max = atomic_long_cmpxchg(
- &zram->stats.max_used_pages, cur_max, pages);
- } while (old_max != cur_max);
+ if (cur_max >= pages)
+ return;
+ } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages,
+ &cur_max, pages));
}
static inline void zram_fill_page(void *ptr, unsigned long len,
@@ -753,8 +755,12 @@ static ssize_t writeback_store(struct device *dev,
zram_clear_flag(zram, index, ZRAM_IDLE);
zram_slot_unlock(zram, index);
/*
- * Return last IO error unless every IO were
- * not suceeded.
+ * BIO errors are not fatal, we continue and simply
+ * attempt to writeback the remaining objects (pages).
+ * At the same time we need to signal user-space that
+ * some writes (at least one, but also could be all of
+ * them) were not successful and we do so by returning
+ * the most recent BIO error.
*/
ret = err;
continue;
@@ -920,13 +926,14 @@ static ssize_t read_block_state(struct file *file, char __user *buf,
ts = ktime_to_timespec64(zram->table[index].ac_time);
copied = snprintf(kbuf + written, count,
- "%12zd %12lld.%06lu %c%c%c%c\n",
+ "%12zd %12lld.%06lu %c%c%c%c%c\n",
index, (s64)ts.tv_sec,
ts.tv_nsec / NSEC_PER_USEC,
zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.',
zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
- zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.');
+ zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.',
+ zram_test_flag(zram, index, ZRAM_RECOMP) ? 'r' : '.');
if (count <= copied) {
zram_slot_unlock(zram, index);
@@ -1000,47 +1007,114 @@ static ssize_t max_comp_streams_store(struct device *dev,
return len;
}
-static ssize_t comp_algorithm_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static void comp_algorithm_set(struct zram *zram, u32 idx, const char *alg)
{
- size_t sz;
- struct zram *zram = dev_to_zram(dev);
+ bool default_alg = false;
+ int i;
+
+ /* Do not kfree() algs that we didn't allocate, IOW the default ones */
+ for (i = 0; i < ZRAM_MAX_ZCOMPS; i++) {
+ if (zram->comp_algs[idx] == default_comp_algs[i]) {
+ default_alg = true;
+ break;
+ }
+ }
+
+ if (!default_alg)
+ kfree(zram->comp_algs[idx]);
+ zram->comp_algs[idx] = alg;
+}
+
+static ssize_t __comp_algorithm_show(struct zram *zram, u32 idx, char *buf)
+{
+ ssize_t sz;
down_read(&zram->init_lock);
- sz = zcomp_available_show(zram->compressor, buf);
+ sz = zcomp_available_show(zram->comp_algs[idx], buf);
up_read(&zram->init_lock);
return sz;
}
-static ssize_t comp_algorithm_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+static int __comp_algorithm_store(struct zram *zram, u32 idx, const char *buf)
{
- struct zram *zram = dev_to_zram(dev);
- char compressor[ARRAY_SIZE(zram->compressor)];
+ char *compressor;
size_t sz;
- strscpy(compressor, buf, sizeof(compressor));
+ sz = strlen(buf);
+ if (sz >= CRYPTO_MAX_ALG_NAME)
+ return -E2BIG;
+
+ compressor = kstrdup(buf, GFP_KERNEL);
+ if (!compressor)
+ return -ENOMEM;
+
/* ignore trailing newline */
- sz = strlen(compressor);
if (sz > 0 && compressor[sz - 1] == '\n')
compressor[sz - 1] = 0x00;
- if (!zcomp_available_algorithm(compressor))
+ if (!zcomp_available_algorithm(compressor)) {
+ kfree(compressor);
return -EINVAL;
+ }
down_write(&zram->init_lock);
if (init_done(zram)) {
up_write(&zram->init_lock);
+ kfree(compressor);
pr_info("Can't change algorithm for initialized device\n");
return -EBUSY;
}
- strcpy(zram->compressor, compressor);
+ comp_algorithm_set(zram, idx, compressor);
up_write(&zram->init_lock);
- return len;
+ return 0;
}
+/*
+ * sysfs show handler for the primary compression algorithm: delegates to
+ * __comp_algorithm_show() for the ZRAM_PRIMARY_ZCOMP slot.
+ */
+static ssize_t comp_algorithm_show(struct device *dev,
+				   struct device_attribute *attr,
+				   char *buf)
+{
+	return __comp_algorithm_show(dev_to_zram(dev), ZRAM_PRIMARY_ZCOMP, buf);
+}
+
+/*
+ * sysfs store handler for the primary compression algorithm: hands @buf to
+ * __comp_algorithm_store() for the ZRAM_PRIMARY_ZCOMP slot.
+ *
+ * Returns @len when the helper returns 0, otherwise the helper's error code.
+ */
+static ssize_t comp_algorithm_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf,
+				    size_t len)
+{
+	int err = __comp_algorithm_store(dev_to_zram(dev), ZRAM_PRIMARY_ZCOMP,
+					 buf);
+
+	if (err)
+		return err;
+	return len;
+}
+
+#ifdef CONFIG_ZRAM_MULTI_COMP
+/*
+ * sysfs show handler for the secondary (recompression) algorithm:
+ * delegates to __comp_algorithm_show() for the ZRAM_SECONDARY_ZCOMP slot.
+ */
+static ssize_t recomp_algorithm_show(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	return __comp_algorithm_show(dev_to_zram(dev), ZRAM_SECONDARY_ZCOMP,
+				     buf);
+}
+
+/*
+ * sysfs store handler for the secondary (recompression) algorithm: hands
+ * @buf to __comp_algorithm_store() for the ZRAM_SECONDARY_ZCOMP slot.
+ *
+ * Returns @len when the helper returns 0, otherwise the helper's error code.
+ */
+static ssize_t recomp_algorithm_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf,
+				      size_t len)
+{
+	int err = __comp_algorithm_store(dev_to_zram(dev),
+					 ZRAM_SECONDARY_ZCOMP, buf);
+
+	if (err)
+		return err;
+	return len;
+}
+#endif
+
static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
@@ -1210,6 +1284,12 @@ static void zram_free_page(struct zram *zram, size_t index)
atomic64_dec(&zram->stats.huge_pages);
}
+ if (zram_test_flag(zram, index, ZRAM_RECOMP))
+ zram_clear_flag(zram, index, ZRAM_RECOMP);
+
+ if (zram_test_flag(zram, index, ZRAM_RECOMP_SKIP))
+ zram_clear_flag(zram, index, ZRAM_RECOMP_SKIP);
+
if (zram_test_flag(zram, index, ZRAM_WB)) {
zram_clear_flag(zram, index, ZRAM_WB);
free_block_bdev(zram, zram_get_element(zram, index));
@@ -1242,32 +1322,38 @@ out:
~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
}
-static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
- struct bio *bio, bool partial_io)
+/*
+ * Read the page backing slot @index from the writeback device through
+ * read_from_bdev(), targeting the whole of @page.
+ *
+ * Corresponding ZRAM slot should be unlocked.
+ */
+static int zram_read_from_writeback(struct zram *zram, struct page *page,
+				    u32 index, struct bio *bio,
+				    bool partial_io)
+{
+	struct bio_vec bvec = {
+		.bv_page	= page,
+		.bv_len		= PAGE_SIZE,
+		.bv_offset	= 0,
+	};
+
+	return read_from_bdev(zram, &bvec, zram_get_element(zram, index),
+			      bio, partial_io);
+}
+
+/*
+ * Reads (decompresses if needed) a page from zspool (zsmalloc).
+ * Corresponding ZRAM slot should be locked.
+ */
+static int zram_read_from_zspool(struct zram *zram, struct page *page,
+ u32 index)
{
struct zcomp_strm *zstrm;
unsigned long handle;
unsigned int size;
void *src, *dst;
+ u32 idx;
int ret;
- zram_slot_lock(zram, index);
- if (zram_test_flag(zram, index, ZRAM_WB)) {
- struct bio_vec bvec;
-
- zram_slot_unlock(zram, index);
- /* A null bio means rw_page was used, we must fallback to bio */
- if (!bio)
- return -EOPNOTSUPP;
-
- bvec.bv_page = page;
- bvec.bv_len = PAGE_SIZE;
- bvec.bv_offset = 0;
- return read_from_bdev(zram, &bvec,
- zram_get_element(zram, index),
- bio, partial_io);
- }
-
handle = zram_get_handle(zram, index);
if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
unsigned long value;
@@ -1277,14 +1363,18 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
mem = kmap_atomic(page);
zram_fill_page(mem, PAGE_SIZE, value);
kunmap_atomic(mem);
- zram_slot_unlock(zram, index);
return 0;
}
size = zram_get_obj_size(zram, index);
- if (size != PAGE_SIZE)
- zstrm = zcomp_stream_get(zram->comp);
+ if (size != PAGE_SIZE) {
+ idx = ZRAM_PRIMARY_ZCOMP;
+ if (zram_test_flag(zram, index, ZRAM_RECOMP))
+ idx = ZRAM_SECONDARY_ZCOMP;
+
+ zstrm = zcomp_stream_get(zram->comps[idx]);
+ }
src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
if (size == PAGE_SIZE) {
@@ -1296,20 +1386,43 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
dst = kmap_atomic(page);
ret = zcomp_decompress(zstrm, src, size, dst);
kunmap_atomic(dst);
- zcomp_stream_put(zram->comp);
+ zcomp_stream_put(zram->comps[idx]);
}
zs_unmap_object(zram->mem_pool, handle);
- zram_slot_unlock(zram, index);
+ return ret;
+}
+
+static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
+ struct bio *bio, bool partial_io)
+{
+ int ret;
+
+ zram_slot_lock(zram, index);
+ if (!zram_test_flag(zram, index, ZRAM_WB)) {
+ /* Slot should be locked throughout the function call */
+ ret = zram_read_from_zspool(zram, page, index);
+ zram_slot_unlock(zram, index);
+ } else {
+ /* Slot should be unlocked before the function call */
+ zram_slot_unlock(zram, index);
+
+ /* A null bio means rw_page was used, we must fallback to bio */
+ if (!bio)
+ return -EOPNOTSUPP;
+
+ ret = zram_read_from_writeback(zram, page, index, bio,
+ partial_io);
+ }
/* Should NEVER happen. Return bio error if it does. */
- if (WARN_ON(ret))
+ if (WARN_ON(ret < 0))
pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
return ret;
}
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
- u32 index, int offset, struct bio *bio)
+ u32 index, int offset, struct bio *bio)
{
int ret;
struct page *page;
@@ -1363,13 +1476,13 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
kunmap_atomic(mem);
compress_again:
- zstrm = zcomp_stream_get(zram->comp);
+ zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_ZCOMP]);
src = kmap_atomic(page);
ret = zcomp_compress(zstrm, src, &comp_len);
kunmap_atomic(src);
if (unlikely(ret)) {
- zcomp_stream_put(zram->comp);
+ zcomp_stream_put(zram->comps[ZRAM_PRIMARY_ZCOMP]);
pr_err("Compression failed! err=%d\n", ret);
zs_free(zram->mem_pool, handle);
return ret;
@@ -1390,19 +1503,19 @@ compress_again:
* if we have a 'non-null' handle here then we are coming
* from the slow path and handle has already been allocated.
*/
- if (IS_ERR((void *)handle))
+ if (IS_ERR_VALUE(handle))
handle = zs_malloc(zram->mem_pool, comp_len,
__GFP_KSWAPD_RECLAIM |
__GFP_NOWARN |
__GFP_HIGHMEM |
__GFP_MOVABLE);
- if (IS_ERR((void *)handle)) {
- zcomp_stream_put(zram->comp);
+ if (IS_ERR_VALUE(handle)) {
+ zcomp_stream_put(zram->comps[ZRAM_PRIMARY_ZCOMP]);
atomic64_inc(&zram->stats.writestall);
handle = zs_malloc(zram->mem_pool, comp_len,
GFP_NOIO | __GFP_HIGHMEM |
__GFP_MOVABLE);
- if (IS_ERR((void *)handle))
+ if (IS_ERR_VALUE(handle))
return PTR_ERR((void *)handle);
if (comp_len != PAGE_SIZE)
@@ -1414,14 +1527,14 @@ compress_again:
* zstrm buffer back. It is necessary that the dereferencing
* of the zstrm variable below occurs correctly.
*/
- zstrm = zcomp_stream_get(zram->comp);
+ zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_ZCOMP]);
}
alloced_pages = zs_get_total_pages(zram->mem_pool);
update_used_max(zram, alloced_pages);
if (zram->limit_pages && alloced_pages > zram->limit_pages) {
- zcomp_stream_put(zram->comp);
+ zcomp_stream_put(zram->comps[ZRAM_PRIMARY_ZCOMP]);
zs_free(zram->mem_pool, handle);
return -ENOMEM;
}
@@ -1435,7 +1548,7 @@ compress_again:
if (comp_len == PAGE_SIZE)
kunmap_atomic(src);
- zcomp_stream_put(zram->comp);
+ zcomp_stream_put(zram->comps[ZRAM_PRIMARY_ZCOMP]);
zs_unmap_object(zram->mem_pool, handle);
atomic64_add(comp_len, &zram->stats.compr_data_size);
out:
@@ -1504,6 +1617,182 @@ out:
return ret;
}
+#ifdef CONFIG_ZRAM_MULTI_COMP
+/*
+ * Re-compress the object stored in slot @index using the secondary
+ * compression algorithm (which is potentially more effective).
+ *
+ * The object is first read into the scratch @page with
+ * zram_read_from_zspool() and then compressed with
+ * zram->comps[ZRAM_SECONDARY_ZCOMP]. On success the old zsmalloc object is
+ * released and the slot is switched over to the new one and flagged
+ * ZRAM_RECOMP. Objects smaller than @size_watermark bytes are left alone.
+ *
+ * Corresponding ZRAM slot should be locked.
+ *
+ * Returns 0 when the object was recompressed or intentionally kept as is
+ * (below the watermark, or the secondary algorithm did not save memory),
+ * -EINVAL when the slot has no handle, or the error reported by the read,
+ * the compression or zs_malloc().
+ */
+static int zram_recompress(struct zram *zram, u32 index, struct page *page,
+			   int size_watermark)
+{
+	unsigned long handle_prev;
+	unsigned long handle_next;
+	unsigned int comp_len_next;
+	unsigned int comp_len_prev;
+	struct zcomp_strm *zstrm;
+	void *src, *dst;
+	int ret;
+
+	handle_prev = zram_get_handle(zram, index);
+	if (!handle_prev)
+		return -EINVAL;
+
+	comp_len_prev = zram_get_obj_size(zram, index);
+	/*
+	 * Do not recompress objects that are already "small enough".
+	 */
+	if (comp_len_prev < size_watermark)
+		return 0;
+
+	ret = zram_read_from_zspool(zram, page, index);
+	if (ret)
+		return ret;
+
+	zstrm = zcomp_stream_get(zram->comps[ZRAM_SECONDARY_ZCOMP]);
+	src = kmap_atomic(page);
+	ret = zcomp_compress(zstrm, src, &comp_len_next);
+	kunmap_atomic(src);
+
+	/*
+	 * Either a compression error or we failed to compress the object
+	 * in a way that will save us memory. Mark the object so that we
+	 * don't attempt to re-compress it again (RECOMP_SKIP).
+	 *
+	 * The error code is tested first so that comp_len_next is only
+	 * consulted after a successful compression.
+	 */
+	if (ret ||
+	    comp_len_next >= huge_class_size ||
+	    comp_len_next >= comp_len_prev) {
+		zram_set_flag(zram, index, ZRAM_RECOMP_SKIP);
+		zram_clear_flag(zram, index, ZRAM_IDLE);
+		zcomp_stream_put(zram->comps[ZRAM_SECONDARY_ZCOMP]);
+		return ret;
+	}
+
+	/*
+	 * No direct reclaim (slow path) for handle allocation and no
+	 * re-compression attempt (unlike in __zram_bvec_write()) since
+	 * we already have stored that object in zsmalloc. If we cannot
+	 * alloc memory for recompressed object then we bail out and
+	 * simply keep the old (existing) object in zsmalloc.
+	 */
+	handle_next = zs_malloc(zram->mem_pool, comp_len_next,
+				__GFP_KSWAPD_RECLAIM |
+				__GFP_NOWARN |
+				__GFP_HIGHMEM |
+				__GFP_MOVABLE);
+	if (IS_ERR_VALUE(handle_next)) {
+		zcomp_stream_put(zram->comps[ZRAM_SECONDARY_ZCOMP]);
+		return PTR_ERR((void *)handle_next);
+	}
+
+	/* Copy the compressed data out before the stream is released. */
+	dst = zs_map_object(zram->mem_pool, handle_next, ZS_MM_WO);
+	memcpy(dst, zstrm->buffer, comp_len_next);
+	zcomp_stream_put(zram->comps[ZRAM_SECONDARY_ZCOMP]);
+
+	zs_unmap_object(zram->mem_pool, handle_next);
+
+	/* Drop the old object, then point the slot at the new one. */
+	zram_free_page(zram, index);
+	zram_set_handle(zram, index, handle_next);
+	zram_set_obj_size(zram, index, comp_len_next);
+
+	zram_set_flag(zram, index, ZRAM_RECOMP);
+	atomic64_add(comp_len_next, &zram->stats.compr_data_size);
+	atomic64_inc(&zram->stats.pages_stored);
+
+	return 0;
+}
+
+#define RECOMPRESS_IDLE (1 << 0)
+#define RECOMPRESS_HUGE (1 << 1)
+
+/*
+ * sysfs store handler for the "recompress" attribute.
+ *
+ * @buf selects the candidate slots:
+ *   "idle"      - slots flagged ZRAM_IDLE
+ *   "huge"      - slots flagged ZRAM_HUGE
+ *   "huge_idle" - slots flagged both ZRAM_IDLE and ZRAM_HUGE
+ *   <integer>   - ZRAM_IDLE slots, with the integer used as the object size
+ *                 watermark (in bytes) passed to zram_recompress()
+ *
+ * Slots that are written back, under writeback, same-filled, already
+ * recompressed or marked ZRAM_RECOMP_SKIP are never touched.
+ *
+ * Returns @len when the walk completes, the kstrtoint() error for
+ * unparsable input, -EINVAL for a watermark outside 0..PAGE_SIZE or an
+ * uninitialised device, -ENOMEM when the scratch page cannot be allocated,
+ * or the first error returned by zram_recompress() (which stops the walk).
+ */
+static ssize_t recompress_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	unsigned long nr_pages;
+	unsigned long index;
+	struct page *page;
+	ssize_t ret;
+	int mode, size_watermark = 0;
+
+	if (sysfs_streq(buf, "idle")) {
+		mode = RECOMPRESS_IDLE;
+	} else if (sysfs_streq(buf, "huge")) {
+		mode = RECOMPRESS_HUGE;
+	} else if (sysfs_streq(buf, "huge_idle")) {
+		mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
+	} else {
+		/*
+		 * We will re-compress only idle objects equal or greater
+		 * in size than watermark.
+		 */
+		ret = kstrtoint(buf, 10, &size_watermark);
+		if (ret)
+			return ret;
+		mode = RECOMPRESS_IDLE;
+	}
+
+	/* Reject negative values explicitly rather than via int promotion. */
+	if (size_watermark < 0 || size_watermark > PAGE_SIZE)
+		return -EINVAL;
+
+	down_read(&zram->init_lock);
+	if (!init_done(zram)) {
+		ret = -EINVAL;
+		goto release_init_lock;
+	}
+
+	/*
+	 * Sample the device size only while init_lock is held, so that the
+	 * slot range walked below matches the currently initialised device.
+	 */
+	nr_pages = zram->disksize >> PAGE_SHIFT;
+
+	/* Scratch page that receives each object's uncompressed data. */
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		ret = -ENOMEM;
+		goto release_init_lock;
+	}
+
+	ret = len;
+	for (index = 0; index < nr_pages; index++) {
+		int err = 0;
+
+		zram_slot_lock(zram, index);
+
+		if (!zram_allocated(zram, index))
+			goto next;
+
+		if ((mode & RECOMPRESS_IDLE) &&
+		    !zram_test_flag(zram, index, ZRAM_IDLE))
+			goto next;
+
+		if ((mode & RECOMPRESS_HUGE) &&
+		    !zram_test_flag(zram, index, ZRAM_HUGE))
+			goto next;
+
+		if (zram_test_flag(zram, index, ZRAM_WB) ||
+		    zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
+		    zram_test_flag(zram, index, ZRAM_SAME) ||
+		    zram_test_flag(zram, index, ZRAM_RECOMP) ||
+		    zram_test_flag(zram, index, ZRAM_RECOMP_SKIP))
+			goto next;
+
+		err = zram_recompress(zram, index, page, size_watermark);
+next:
+		zram_slot_unlock(zram, index);
+		if (err) {
+			ret = err;
+			break;
+		}
+
+		cond_resched();
+	}
+
+	__free_page(page);
+
+release_init_lock:
+	up_read(&zram->init_lock);
+	return ret;
+}
+#endif
+
/*
* zram_bio_discard - handler on discard request
* @index: physical block index in PAGE_SIZE units
@@ -1710,6 +1999,20 @@ out:
return ret;
}
+/*
+ * Clear every entry of @zram->comps[] and destroy the backend it held.
+ * Entries that were NULL or an error pointer are only cleared.
+ */
+static void zram_destroy_comps(struct zram *zram)
+{
+	u32 i = 0;
+
+	while (i < ZRAM_MAX_ZCOMPS) {
+		struct zcomp *victim = zram->comps[i];
+
+		zram->comps[i++] = NULL;
+		if (!IS_ERR_OR_NULL(victim))
+			zcomp_destroy(victim);
+	}
+}
+
static void zram_reset_device(struct zram *zram)
{
down_write(&zram->init_lock);
@@ -1727,11 +2030,15 @@ static void zram_reset_device(struct zram *zram)
/* I/O operation under all of CPU are done so let's free */
zram_meta_free(zram, zram->disksize);
zram->disksize = 0;
+ zram_destroy_comps(zram);
memset(&zram->stats, 0, sizeof(zram->stats));
- zcomp_destroy(zram->comp);
- zram->comp = NULL;
reset_bdev(zram);
+ comp_algorithm_set(zram, ZRAM_PRIMARY_ZCOMP,
+ default_comp_algs[ZRAM_PRIMARY_ZCOMP]);
+ if (IS_ENABLED(CONFIG_ZRAM_MULTI_COMP))
+ comp_algorithm_set(zram, ZRAM_SECONDARY_ZCOMP,
+ default_comp_algs[ZRAM_SECONDARY_ZCOMP]);
up_write(&zram->init_lock);
}
@@ -1742,6 +2049,7 @@ static ssize_t disksize_store(struct device *dev,
struct zcomp *comp;
struct zram *zram = dev_to_zram(dev);
int err;
+ u32 idx;
disksize = memparse(buf, NULL);
if (!disksize)
@@ -1760,22 +2068,25 @@ static ssize_t disksize_store(struct device *dev,
goto out_unlock;
}
- comp = zcomp_create(zram->compressor);
- if (IS_ERR(comp)) {
- pr_err("Cannot initialise %s compressing backend\n",
- zram->compressor);
- err = PTR_ERR(comp);
- goto out_free_meta;
- }
+ for (idx = 0; idx < ZRAM_MAX_ZCOMPS; idx++) {
+ comp = zcomp_create(zram->comp_algs[idx]);
+ if (IS_ERR(comp)) {
+ pr_err("Cannot initialise %s compressing backend\n",
+ zram->comp_algs[idx]);
+ err = PTR_ERR(comp);
+ goto out_free_comps;
+ }
- zram->comp = comp;
+ zram->comps[idx] = comp;
+ }
zram->disksize = disksize;
set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
up_write(&zram->init_lock);
return len;
-out_free_meta:
+out_free_comps:
+ zram_destroy_comps(zram);
zram_meta_free(zram, disksize);
out_unlock:
up_write(&zram->init_lock);
@@ -1860,6 +2171,10 @@ static DEVICE_ATTR_WO(writeback);
static DEVICE_ATTR_RW(writeback_limit);
static DEVICE_ATTR_RW(writeback_limit_enable);
#endif
+#ifdef CONFIG_ZRAM_MULTI_COMP
+static DEVICE_ATTR_RW(recomp_algorithm);
+static DEVICE_ATTR_WO(recompress);
+#endif
static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@@ -1883,6 +2198,10 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_bd_stat.attr,
#endif
&dev_attr_debug_stat.attr,
+#ifdef CONFIG_ZRAM_MULTI_COMP
+ &dev_attr_recomp_algorithm.attr,
+ &dev_attr_recompress.attr,
+#endif
NULL,
};
@@ -1962,7 +2281,11 @@ static int zram_add(void)
if (ret)
goto out_cleanup_disk;
- strscpy(zram->compressor, default_compressor, sizeof(zram->compressor));
+ zram->comp_algs[ZRAM_PRIMARY_ZCOMP] =
+ default_comp_algs[ZRAM_PRIMARY_ZCOMP];
+ if (IS_ENABLED(CONFIG_ZRAM_MULTI_COMP))
+ zram->comp_algs[ZRAM_SECONDARY_ZCOMP] =
+ default_comp_algs[ZRAM_SECONDARY_ZCOMP];
zram_debugfs_register(zram);
pr_info("Added device: %s\n", zram->disk->disk_name);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index a2bda53020fd..09b9ceb5dfa3 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -49,6 +49,8 @@ enum zram_pageflags {
ZRAM_UNDER_WB, /* page is under writeback */
ZRAM_HUGE, /* Incompressible page */
ZRAM_IDLE, /* not accessed page since last idle marking */
+ ZRAM_RECOMP, /* page was recompressed */
+ ZRAM_RECOMP_SKIP, /* secondary algorithm cannot compress this page */
__NR_ZRAM_PAGEFLAGS,
};
@@ -89,10 +91,20 @@ struct zram_stats {
#endif
};
+/*
+ * Indices into the per-device comps[] and comp_algs[] arrays, and the
+ * number of entries in them. Without CONFIG_ZRAM_MULTI_COMP there is a
+ * single slot and the secondary index aliases the primary one.
+ */
+#ifdef CONFIG_ZRAM_MULTI_COMP
+#define ZRAM_PRIMARY_ZCOMP 0
+#define ZRAM_SECONDARY_ZCOMP 1
+#define ZRAM_MAX_ZCOMPS 2
+#else
+#define ZRAM_PRIMARY_ZCOMP 0
+#define ZRAM_SECONDARY_ZCOMP 0
+#define ZRAM_MAX_ZCOMPS 1
+#endif
+
struct zram {
struct zram_table_entry *table;
struct zs_pool *mem_pool;
- struct zcomp *comp;
+ struct zcomp *comps[ZRAM_MAX_ZCOMPS];
struct gendisk *disk;
/* Prevent concurrent execution of device init */
struct rw_semaphore init_lock;
@@ -107,7 +119,7 @@ struct zram {
* we can store in a disk.
*/
u64 disksize; /* bytes */
- char compressor[CRYPTO_MAX_ALG_NAME];
+ const char *comp_algs[ZRAM_MAX_ZCOMPS];
/*
* zram is claimed so open request will be failed
*/
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 5fdf269a822e..2eddd32c51f4 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,14 +1,15 @@
# SPDX-License-Identifier: GPL-2.0-only
menuconfig DAX
- tristate "DAX: direct access to differentiated memory"
+ bool "DAX: direct access to differentiated memory"
+ depends on MMU
select SRCU
- default m if NVDIMM_DAX
if DAX
config DEV_DAX
tristate "Device DAX: direct access mapping device"
depends on TRANSPARENT_HUGEPAGE
+ depends on !FS_DAX_LIMITED
help
Support raw access to differentiated (persistence, bandwidth,
latency...) memory via an mmap(2) capable character
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 90a56ca3b345..3546bca7adbf 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
dax-y := super.o
dax-y += bus.o
+dax-y += mapping.o
device_dax-y := device.o
dax_pmem-y := pmem.o
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 1dad813ee4a6..f2a8b8c3776f 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -382,9 +382,16 @@ void kill_dev_dax(struct dev_dax *dev_dax)
{
struct dax_device *dax_dev = dev_dax->dax_dev;
struct inode *inode = dax_inode(dax_dev);
+ struct address_space *mapping = inode->i_mapping;
kill_dax(dax_dev);
- unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+
+ /*
+ * The dax device inode can outlive the next reuse of the memory
+ * fronted by this device, force it idle now.
+ */
+ dax_break_layouts(mapping, 0, ULONG_MAX >> PAGE_SHIFT);
+ truncate_inode_pages(mapping, 0);
/*
* Dynamic dax region have the pgmap allocated via dev_kzalloc()
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 1c974b7caae6..19076f9d5c51 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -15,6 +15,7 @@ struct dax_device *inode_dax(struct inode *inode);
struct inode *dax_inode(struct dax_device *dax_dev);
int dax_bus_init(void);
void dax_bus_exit(void);
+void dax_mapping_init(void);
/**
* struct dax_region - mapping infrastructure for dax devices
@@ -87,6 +88,7 @@ static inline struct dax_mapping *to_dax_mapping(struct device *dev)
}
phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, unsigned long size);
+int dev_dax_probe(struct dev_dax *dev_dax);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool dax_align_valid(unsigned long align)
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 5494d745ced5..022d4ba9c336 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -73,38 +73,15 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
return -1;
}
-static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
- unsigned long fault_size)
-{
- unsigned long i, nr_pages = fault_size / PAGE_SIZE;
- struct file *filp = vmf->vma->vm_file;
- struct dev_dax *dev_dax = filp->private_data;
- pgoff_t pgoff;
-
- /* mapping is only set on the head */
- if (dev_dax->pgmap->vmemmap_shift)
- nr_pages = 1;
-
- pgoff = linear_page_index(vmf->vma,
- ALIGN(vmf->address, fault_size));
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
-
- page = compound_head(page);
- if (page->mapping)
- continue;
-
- page->mapping = filp->f_mapping;
- page->index = pgoff + i;
- }
-}
-
static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf)
{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
+ vm_fault_t ret;
+ void *entry;
pfn_t pfn;
unsigned int fault_size = PAGE_SIZE;
@@ -128,7 +105,16 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- dax_set_mapping(vmf, pfn, fault_size);
+ entry = dax_grab_mapping_entry(&xas, mapping, 0);
+ if (is_dax_err(entry))
+ return dax_err_to_vmfault(entry);
+
+ ret = dax_insert_entry(&xas, vmf, &entry, pfn, 0);
+
+ dax_unlock_entry(&xas, entry);
+
+ if (ret)
+ return ret;
return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}
@@ -136,10 +122,14 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf)
{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
+ XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
+ vm_fault_t ret;
pgoff_t pgoff;
+ void *entry;
pfn_t pfn;
unsigned int fault_size = PMD_SIZE;
@@ -171,7 +161,16 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- dax_set_mapping(vmf, pfn, fault_size);
+ entry = dax_grab_mapping_entry(&xas, mapping, PMD_ORDER);
+ if (is_dax_err(entry))
+ return dax_err_to_vmfault(entry);
+
+ ret = dax_insert_entry(&xas, vmf, &entry, pfn, DAX_PMD);
+
+ dax_unlock_entry(&xas, entry);
+
+ if (ret)
+ return ret;
return vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}
@@ -180,10 +179,14 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
struct vm_fault *vmf)
{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pud_addr = vmf->address & PUD_MASK;
+ XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
struct device *dev = &dev_dax->dev;
phys_addr_t phys;
+ vm_fault_t ret;
pgoff_t pgoff;
+ void *entry;
pfn_t pfn;
unsigned int fault_size = PUD_SIZE;
@@ -216,7 +219,16 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
- dax_set_mapping(vmf, pfn, fault_size);
+	/* Use the same error helpers as the PTE and PMD fault handlers. */
+	entry = dax_grab_mapping_entry(&xas, mapping, PUD_ORDER);
+	if (is_dax_err(entry))
+		return dax_err_to_vmfault(entry);
+
+ ret = dax_insert_entry(&xas, vmf, &entry, pfn, DAX_PUD);
+
+ dax_unlock_entry(&xas, entry);
+
+ if (ret)
+ return ret;
return vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
}
@@ -494,3 +506,4 @@ MODULE_LICENSE("GPL v2");
module_init(dax_init);
module_exit(dax_exit);
MODULE_ALIAS_DAX_DEVICE(0);
+MODULE_IMPORT_NS(DAX);
diff --git a/drivers/dax/mapping.c b/drivers/dax/mapping.c
new file mode 100644
index 000000000000..b885c75e2dfb
--- /dev/null
+++ b/drivers/dax/mapping.c
@@ -0,0 +1,1089 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Direct Access mapping infrastructure split from fs/dax.c
+ * Copyright (c) 2013-2014 Intel Corporation
+ * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
+ * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/rmap.h>
+#include <linux/pfn_t.h>
+#include <linux/sizes.h>
+#include <linux/pagemap.h>
+#include <linux/huge_mm.h>
+
+#include "dax-private.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/fs_dax.h>
+
+/*
+ * We choose 4096 entries (1 << DAX_WAIT_TABLE_BITS) - same as per-zone page
+ * wait tables. wait_table is a hashed set of wait queues: dax_entry_waitqueue()
+ * selects the slot by hashing the xarray pointer xor'ed with the entry index
+ * down to DAX_WAIT_TABLE_BITS bits.
+ */
+#define DAX_WAIT_TABLE_BITS 12
+#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+
+static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+
+/* Initialise every wait queue head in the hashed DAX wait table. */
+void __init dax_mapping_init(void)
+{
+	int slot = 0;
+
+	while (slot < DAX_WAIT_TABLE_ENTRIES) {
+		init_waitqueue_head(&wait_table[slot]);
+		slot++;
+	}
+}
+
+/* Return the pfn stored above the low DAX_SHIFT flag bits of a value entry. */
+static unsigned long dax_to_pfn(void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return v >> DAX_SHIFT;
+}
+
+/* Build an xarray value entry from @pfn and the DAX_MASK subset of @flags. */
+static void *dax_make_entry(pfn_t pfn, unsigned long flags)
+{
+	unsigned long flag_bits = flags & DAX_MASK;
+	unsigned long pfn_bits = pfn_t_to_pfn(pfn) << DAX_SHIFT;
+
+	return xa_mk_value(flag_bits | pfn_bits);
+}
+
+/* True when the DAX_LOCKED flag is set in @entry. */
+static bool dax_is_locked(void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return (v & DAX_LOCKED) != 0;
+}
+
+/* True when the DAX_ZAP flag is set in @entry. */
+static bool dax_is_zapped(void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return (v & DAX_ZAP) != 0;
+}
+
+/* Page order of @entry: PUD_ORDER, PMD_ORDER, or 0 when neither flag is set. */
+static unsigned int dax_entry_order(void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	if (v & DAX_PUD)
+		return PUD_ORDER;
+
+	return (v & DAX_PMD) ? PMD_ORDER : 0;
+}
+
+/* Non-zero (the raw DAX_PMD bit) when @entry is flagged as a PMD entry. */
+static unsigned long dax_is_pmd_entry(void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return v & DAX_PMD;
+}
+
+/* Non-zero (the raw DAX_PUD bit) when @entry is flagged as a PUD entry. */
+static unsigned long dax_is_pud_entry(void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return v & DAX_PUD;
+}
+
+/* True when @entry carries neither the DAX_PMD nor the DAX_PUD flag. */
+static bool dax_is_pte_entry(void *entry)
+{
+	unsigned long huge_bits = xa_to_value(entry) & (DAX_PMD|DAX_PUD);
+
+	return huge_bits == 0;
+}
+
+/* Non-zero when the DAX_ZERO_PAGE flag is set in @entry. */
+static int dax_is_zero_entry(void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return v & DAX_ZERO_PAGE;
+}
+
+/* Non-zero when the DAX_EMPTY flag is set in @entry. */
+static int dax_is_empty_entry(void *entry)
+{
+	unsigned long v = xa_to_value(entry);
+
+	return v & DAX_EMPTY;
+}
+
+/*
+ * get_unlocked_entry() hands back XA_RETRY_ENTRY when the entry it found
+ * has a smaller order than the one requested; detect that marker here.
+ */
+static bool dax_is_conflict(void *entry)
+{
+	void *conflict_marker = XA_RETRY_ENTRY;
+
+	return entry == conflict_marker;
+}
+
+/*
+ * DAX page cache entry locking
+ *
+ * struct exceptional_entry_key names the entry a waiter sleeps on and a
+ * waker targets: wake_exceptional_entry_func() wakes a waiter only when
+ * both fields of the two keys are equal.
+ */
+struct exceptional_entry_key {
+ struct xarray *xa; /* xarray the entry lives in (xas->xa) */
+ pgoff_t entry_start; /* entry index; aligned to the PMD start for PMD entries */
+};
+
+struct wait_exceptional_entry_queue {
+ wait_queue_entry_t wait; /* queued on a wait_table slot; .func is wake_exceptional_entry_func */
+ struct exceptional_entry_key key; /* entry this waiter is waiting for */
+};
+
+/**
+ * enum dax_wake_mode - waitqueue wakeup behaviour
+ * @WAKE_ALL: wake all waiters in the waitqueue
+ * @WAKE_NEXT: wake only the first waiter in the waitqueue
+ *
+ * dax_wake_entry() translates the mode into the nr_exclusive argument of
+ * __wake_up(): 0 for WAKE_ALL, 1 for WAKE_NEXT.
+ */
+enum dax_wake_mode {
+ WAKE_ALL,
+ WAKE_NEXT,
+};
+
+/*
+ * Fill in @key for @entry at the index held in @xas and return the hashed
+ * wait queue that waiters and wakers of that entry must share.
+ */
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, void *entry,
+					      struct exceptional_entry_key *key)
+{
+	unsigned long index = xas->xa_index;
+	unsigned long hash;
+
+	/*
+	 * A huge entry covers many offsets. Align the 'index' that we use
+	 * for the wait queue to the start of the PMD or PUD so that every
+	 * offset covered by the entry yields the same key and maps to the
+	 * same wait queue. Without this a waiter that looked the entry up
+	 * at one offset is never matched by a wake-up issued for another
+	 * offset of the same entry.
+	 */
+	if (dax_is_pud_entry(entry))
+		index &= ~PG_PUD_COLOUR;
+	else if (dax_is_pmd_entry(entry))
+		index &= ~PG_PMD_COLOUR;
+	key->xa = xas->xa;
+	key->entry_start = index;
+
+	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
+	return wait_table + hash;
+}
+
+/*
+ * Wake callback installed by the DAX entry waiters: only wake (and
+ * auto-remove) the waiter whose key matches the key passed to __wake_up().
+ */
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
+				       unsigned int mode, int sync, void *keyp)
+{
+	struct wait_exceptional_entry_queue *waiter =
+		container_of(wait, struct wait_exceptional_entry_queue, wait);
+	struct exceptional_entry_key *wanted = keyp;
+
+	if (wanted->xa == waiter->key.xa &&
+	    wanted->entry_start == waiter->key.entry_start)
+		return autoremove_wake_function(wait, mode, sync, NULL);
+
+	return 0;
+}
+
+/*
+ * Wake waiters of the entry at the index in @xas. @entry need not still be
+ * what is stored there; it only tells us whether the entry at this index
+ * used to be a PMD entry, which determines the wait queue key.
+ */
+static void dax_wake_entry(struct xa_state *xas, void *entry,
+			   enum dax_wake_mode mode)
+{
+	struct exceptional_entry_key key;
+	wait_queue_head_t *wq = dax_entry_waitqueue(xas, entry, &key);
+	int nr_exclusive = 1;
+
+	if (mode == WAKE_ALL)
+		nr_exclusive = 0;
+
+	/*
+	 * The locked-entry check and prepare_to_wait_exclusive() both happen
+	 * under the i_pages lock, as does entry handling in our callers.
+	 * Every task that could have seen our entry locked is therefore
+	 * already on the waitqueue, so an inactive queue means nobody waits.
+	 */
+	if (!waitqueue_active(wq))
+		return;
+
+	__wake_up(wq, TASK_NORMAL, nr_exclusive, &key);
+}
+
+/*
+ * Look up entry in page cache, wait for it to become unlocked if it
+ * is a DAX entry and return it. The caller must subsequently call
+ * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
+ * if it did. The entry returned may have a larger order than @order.
+ * If @order is larger than the order of the entry found in i_pages, this
+ * function returns a dax_is_conflict entry.
+ *
+ * NULL is returned when nothing is stored at the index; a non-value entry
+ * is returned as-is after a one-time warning. While the entry is locked the
+ * caller sleeps in TASK_UNINTERRUPTIBLE as an exclusive waiter: the i_pages
+ * lock is dropped around schedule() and retaken before the lookup is retried.
+ *
+ * Must be called with the i_pages lock held.
+ */
+static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
+{
+ void *entry;
+ struct wait_exceptional_entry_queue ewait;
+ wait_queue_head_t *wq;
+
+ init_wait(&ewait.wait);
+ ewait.wait.func = wake_exceptional_entry_func;
+
+ for (;;) {
+ entry = xas_find_conflict(xas);
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+ return entry;
+ if (dax_entry_order(entry) < order)
+ return XA_RETRY_ENTRY;
+ if (!dax_is_locked(entry))
+ return entry;
+
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+ prepare_to_wait_exclusive(wq, &ewait.wait,
+ TASK_UNINTERRUPTIBLE);
+ xas_unlock_irq(xas);
+ xas_reset(xas);
+ schedule();
+ finish_wait(wq, &ewait.wait);
+ xas_lock_irq(xas);
+ }
+}
+
+/*
+ * The only thing keeping the address space around is the i_pages lock
+ * (it's cycled in clear_inode() after removing the entries from i_pages)
+ * After we call xas_unlock_irq(), we cannot touch xas->xa.
+ *
+ * Called with the i_pages lock held and returns with it released. The
+ * function sleeps once and does not re-check the entry afterwards; callers
+ * in this file restart their lookup from scratch.
+ */
+static void wait_entry_unlocked(struct xa_state *xas, void *entry) __releases(xas)
+{
+ struct wait_exceptional_entry_queue ewait;
+ wait_queue_head_t *wq;
+
+ init_wait(&ewait.wait);
+ ewait.wait.func = wake_exceptional_entry_func;
+
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+ /*
+ * Unlike get_unlocked_entry() there is no guarantee that this
+ * path ever successfully retrieves an unlocked entry before an
+ * inode dies. Perform a non-exclusive wait in case this path
+ * never successfully performs its own wake up.
+ */
+ prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
+ xas_unlock_irq(xas);
+ schedule();
+ finish_wait(wq, &ewait.wait);
+}
+
+/*
+ * Counterpart of get_unlocked_entry() for callers that did not lock the
+ * entry: pass the wake-up on to other waiters. Nothing is done for a NULL
+ * or conflict result.
+ */
+static void put_unlocked_entry(struct xa_state *xas, void *entry,
+			       enum dax_wake_mode mode)
+{
+	if (!entry || dax_is_conflict(entry))
+		return;
+
+	dax_wake_entry(xas, entry, mode);
+}
+
+/*
+ * We used the xa_state to get the entry, but then we locked the entry and
+ * dropped the xa_lock, so we know the xa_state is stale and must be reset
+ * before use.
+ *
+ * @entry is the unlocked form to store: it must not carry DAX_LOCKED, while
+ * the value it replaces is expected to (both are checked with WARN_ON).
+ * The i_pages lock is taken and released internally, then one waiter for
+ * this entry is woken (WAKE_NEXT).
+ */
+void dax_unlock_entry(struct xa_state *xas, void *entry)
+{
+ void *old;
+
+ WARN_ON(dax_is_locked(entry));
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ old = xas_store(xas, entry);
+ xas_unlock_irq(xas);
+ WARN_ON(!dax_is_locked(old));
+ dax_wake_entry(xas, entry, WAKE_NEXT);
+}
+EXPORT_SYMBOL_NS_GPL(dax_unlock_entry, DAX);
+
+/*
+ * Store @entry with DAX_LOCKED set at the position described by @xas.
+ *
+ * Return: The entry stored at this location before it was locked.
+ */
+static void *dax_lock_entry(struct xa_state *xas, void *entry)
+{
+	void *locked = xa_mk_value(xa_to_value(entry) | DAX_LOCKED);
+
+	return xas_store(xas, locked);
+}
+
+/*
+ * Number of bytes of real pages behind @entry: 0 for zero-page and empty
+ * entries, otherwise PMD_SIZE, PUD_SIZE or PAGE_SIZE according to its flags.
+ */
+static unsigned long dax_entry_size(void *entry)
+{
+	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
+		return 0;
+	if (dax_is_pmd_entry(entry))
+		return PMD_SIZE;
+	if (dax_is_pud_entry(entry))
+		return PUD_SIZE;
+
+	return PAGE_SIZE;
+}
+
+/*
+ * Return the @idx'th folio behind @entry, or NULL when the entry has no
+ * pages or @idx is past the last folio.
+ *
+ * Until fsdax constructs compound folios it needs to be prepared to
+ * support multiple folios per entry where each folio is a single page
+ */
+static struct folio *dax_entry_to_folio(void *entry, int idx)
+{
+	unsigned long size = dax_entry_size(entry);
+	struct page *first;
+	struct folio *head;
+
+	if (size == 0)
+		return NULL;
+
+	first = pfn_to_page(dax_to_pfn(entry));
+	head = page_folio(first);
+
+	/*
+	 * Are there multiple folios per entry, and has the iterator
+	 * passed the end of that set?
+	 */
+	if (idx >= size / folio_size(head))
+		return NULL;
+
+	VM_WARN_ON_ONCE(!IS_ALIGNED(size, folio_size(head)));
+
+	return page_folio(first + idx);
+}
+
+/*
+ * Iterate through all folios associated with a given entry
+ *
+ * @i is the loop counter (starts at 0) and @folio the cursor; the loop ends
+ * when dax_entry_to_folio() returns NULL. Both are assigned by the macro and
+ * @entry is evaluated on every iteration, so pass plain variables.
+ */
+#define dax_for_each_folio(entry, folio, i) \
+ for (i = 0, folio = dax_entry_to_folio(entry, i); folio; \
+ folio = dax_entry_to_folio(entry, ++i))
+
+/* True when @mapping is the PAGE_MAPPING_DAX_COW marker rather than a pointer. */
+static bool dax_mapping_is_cow(struct address_space *mapping)
+{
+	unsigned long raw = (unsigned long)mapping;
+
+	return raw == PAGE_MAPPING_DAX_COW;
+}
+
+/*
+ * Mark @folio as shared: set folio->mapping to the PAGE_MAPPING_DAX_COW
+ * marker and bump folio->index, which serves as the share count.
+ */
+static void dax_mapping_set_cow(struct folio *folio)
+{
+	if ((uintptr_t)folio->mapping == PAGE_MAPPING_DAX_COW) {
+		/* Already shared: just account for one more user. */
+		folio->index++;
+		return;
+	}
+
+	/*
+	 * Reset the index if the folio was already mapped
+	 * regularly before.
+	 */
+	if (folio->mapping)
+		folio->index = 1;
+	folio->mapping = (void *)PAGE_MAPPING_DAX_COW;
+	folio->index++;
+}
+
+/* Return the dev_pagemap recorded in the first page of @folio. */
+static struct dev_pagemap *folio_pgmap(struct folio *folio)
+{
+	struct page *head = folio_page(folio, 0);
+
+	return head->pgmap;
+}
+
+/*
+ * Associate every folio backing @entry with @mapping.
+ *
+ * When it is called in dax_insert_entry(), DAX_COW in @flags indicates
+ * whether this entry is shared by multiple files. If so, folio->mapping is
+ * set to PAGE_MAPPING_DAX_COW and folio->index is used as a share count.
+ * Otherwise each folio is requested from its dev_pagemap and receives
+ * @mapping plus its page offset in the file as folio->index.
+ *
+ * Return: 0 on success (always with CONFIG_FS_DAX_LIMITED), or
+ * VM_FAULT_SIGBUS when pgmap_request_folio() does not hand back the folio.
+ */
+static vm_fault_t dax_associate_entry(void *entry,
+				      struct address_space *mapping,
+				      struct vm_fault *vmf, unsigned long flags)
+{
+	unsigned long size = dax_entry_size(entry), index;
+	struct folio *folio;
+	int i;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return 0;
+
+	/*
+	 * Folio 0 sits at the start of the entry, so 'index' must be the
+	 * page offset of the entry's first page: round the fault address
+	 * *down* to the entry size. ALIGN() rounds up and would produce the
+	 * offset of the following PMD/PUD range for any fault address that
+	 * is not already aligned. A size of 0 (zero/empty entry) yields
+	 * index 0, which is never used because the loop has no folios.
+	 */
+	index = linear_page_index(vmf->vma, vmf->address & ~(size - 1));
+	dax_for_each_folio(entry, folio, i) {
+		if (flags & DAX_COW) {
+			dax_mapping_set_cow(folio);
+		} else {
+			struct dev_pagemap *pgmap = folio_pgmap(folio);
+			unsigned long pfn = page_to_pfn(&folio->page);
+
+			WARN_ON_ONCE(folio->mapping);
+			if (folio !=
+			    pgmap_request_folio(pgmap,
+						pfn_to_pgmap_offset(pgmap, pfn),
+						folio_order(folio)))
+				return VM_FAULT_SIGBUS;
+			folio->mapping = mapping;
+			folio->index = index + i;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Undo dax_associate_entry() for every folio behind @entry. A shared (CoW)
+ * folio only drops one share and stays marked while shares remain; any
+ * other folio has its mapping and index cleared. @trunc enables the extra
+ * sanity warnings for the truncate path.
+ */
+static void dax_disassociate_entry(void *entry, struct address_space *mapping,
+				   bool trunc)
+{
+	struct folio *folio;
+	int i;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	dax_for_each_folio(entry, folio, i) {
+		if (!dax_mapping_is_cow(folio->mapping)) {
+			WARN_ON_ONCE(trunc && !dax_is_zapped(entry));
+			WARN_ON_ONCE(trunc && !dax_folio_idle(folio));
+			WARN_ON_ONCE(folio->mapping && folio->mapping != mapping);
+		} else if (folio->index-- > 0) {
+			/* keep the CoW flag if this folio is still shared */
+			continue;
+		}
+		folio->mapping = NULL;
+		folio->index = 0;
+	}
+}
+
+/*
+ * dax_lock_page - Lock the DAX entry corresponding to a page
+ * @page: The page whose entry we want to lock
+ *
+ * The lookup runs under rcu_read_lock() and is restarted when page->mapping
+ * changes while the i_pages lock is being taken, or after waiting for a
+ * locked entry. For a character-device (device-dax) inode nothing is locked
+ * and the cookie is ~0UL.
+ *
+ * Context: Process context.
+ * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
+ * not be locked.
+ */
+dax_entry_t dax_lock_page(struct page *page)
+{
+ XA_STATE(xas, NULL, 0);
+ void *entry;
+
+ /* Ensure page->mapping isn't freed while we look at it */
+ rcu_read_lock();
+ for (;;) {
+ struct address_space *mapping = READ_ONCE(page->mapping);
+
+ entry = NULL;
+ if (!mapping || !dax_mapping(mapping))
+ break;
+
+ /*
+ * In the device-dax case there's no need to lock, a
+ * struct dev_pagemap pin is sufficient to keep the
+ * inode alive, and we assume we have dev_pagemap pin
+ * otherwise we would not have a valid pfn_to_page()
+ * translation.
+ */
+ entry = (void *)~0UL;
+ if (S_ISCHR(mapping->host->i_mode))
+ break;
+
+ xas.xa = &mapping->i_pages;
+ xas_lock_irq(&xas);
+ if (mapping != page->mapping) {
+ xas_unlock_irq(&xas);
+ continue;
+ }
+ xas_set(&xas, page->index);
+ entry = xas_load(&xas);
+ if (dax_is_locked(entry)) {
+ rcu_read_unlock();
+ wait_entry_unlocked(&xas, entry);
+ rcu_read_lock();
+ continue;
+ }
+ dax_lock_entry(&xas, entry);
+ xas_unlock_irq(&xas);
+ break;
+ }
+ rcu_read_unlock();
+ return (dax_entry_t)entry;
+}
+
+/*
+ * Release the entry lock taken by dax_lock_page(). Character-device
+ * (device-dax) inodes were never locked, so there is nothing to undo.
+ */
+void dax_unlock_page(struct page *page, dax_entry_t cookie)
+{
+	struct address_space *mapping = page->mapping;
+	XA_STATE(xas, &mapping->i_pages, page->index);
+
+	if (!S_ISCHR(mapping->host->i_mode))
+		dax_unlock_entry(&xas, (void *)cookie);
+}
+
+/*
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
+ * @mapping: the file's mapping whose entry we want to lock
+ * @index: the offset within this file
+ * @page: output the dax page corresponding to this dax entry
+ *
+ * @page is written only when a real entry was found and locked. When the
+ * slot is empty or holds a zero/empty entry, nothing is locked and the
+ * special cookie ~0UL is returned, which dax_unlock_mapping_entry() ignores.
+ *
+ * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
+ * could not be locked.
+ */
+dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
+ struct page **page)
+{
+ XA_STATE(xas, NULL, 0);
+ void *entry;
+
+ rcu_read_lock();
+ for (;;) {
+ entry = NULL;
+ if (!dax_mapping(mapping))
+ break;
+
+ xas.xa = &mapping->i_pages;
+ xas_lock_irq(&xas);
+ xas_set(&xas, index);
+ entry = xas_load(&xas);
+ if (dax_is_locked(entry)) {
+ rcu_read_unlock();
+ wait_entry_unlocked(&xas, entry);
+ rcu_read_lock();
+ continue;
+ }
+ if (!entry || dax_is_zero_entry(entry) ||
+ dax_is_empty_entry(entry)) {
+ /*
+ * Because we are looking for entry from file's mapping
+ * and index, so the entry may not be inserted for now,
+ * or even a zero/empty entry. We don't think this is
+ * an error case. So, return a special value and do
+ * not output @page.
+ */
+ entry = (void *)~0UL;
+ } else {
+ *page = pfn_to_page(dax_to_pfn(entry));
+ dax_lock_entry(&xas, entry);
+ }
+ xas_unlock_irq(&xas);
+ break;
+ }
+ rcu_read_unlock();
+ return (dax_entry_t)entry;
+}
+
+/*
+ * Release the entry lock taken by dax_lock_mapping_entry(). The special
+ * cookie ~0UL means no entry was locked, so it is ignored.
+ */
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
+			      dax_entry_t cookie)
+{
+	XA_STATE(xas, &mapping->i_pages, index);
+
+	if (cookie != ~0UL)
+		dax_unlock_entry(&xas, (void *)cookie);
+}
+
+/*
+ * Find page cache entry at given index. If it is a DAX entry, return it
+ * with the entry locked. If the page cache doesn't contain an entry at
+ * that index, add a locked empty entry.
+ *
+ * When requesting an entry with size DAX_PMD, dax_grab_mapping_entry() will
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
+ * This will happen if there are any PTE entries within the PMD range
+ * that we are requesting.
+ *
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
+ * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
+ * insertion will fail if it finds any PTE entries already in the tree, and a
+ * PTE insertion will cause an existing PMD entry to be unmapped and
+ * downgraded to PTE entries. This happens for both PMD zero pages as
+ * well as PMD empty entries.
+ *
+ * The exception to this downgrade path is for PMD entries that have
+ * real storage backing them. We will leave these real PMD entries in
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
+ *
+ * The code below treats PUD-order entries the same way: @order selects
+ * DAX_PUD or DAX_PMD for a newly created empty entry, and an order-0 request
+ * that finds a zero or empty PUD entry downgrades it using the PUD colour
+ * and page count. After a downgrade the xa_state is set back to the index
+ * the caller passed in.
+ *
+ * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
+ * persistent memory the benefit is doubtful. We can add that later if we can
+ * show it helps.
+ *
+ * On error, this function does not return an ERR_PTR. Instead it returns
+ * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
+ * overlap with xarray value entries.
+ */
+void *dax_grab_mapping_entry(struct xa_state *xas,
+ struct address_space *mapping, unsigned int order)
+{
+ unsigned long index = xas->xa_index;
+ bool size_downgrade; /* splitting entry into PTE entries? */
+ void *entry;
+
+retry:
+ size_downgrade = false;
+ xas_lock_irq(xas);
+ entry = get_unlocked_entry(xas, order);
+
+ if (entry) {
+ if (dax_is_conflict(entry))
+ goto fallback;
+ if (!xa_is_value(entry)) {
+ xas_set_err(xas, -EIO);
+ goto out_unlock;
+ }
+
+ if (order == 0) {
+ if (!dax_is_pte_entry(entry) &&
+ (dax_is_zero_entry(entry) ||
+ dax_is_empty_entry(entry))) {
+ size_downgrade = true;
+ }
+ }
+ }
+
+ if (size_downgrade) {
+ unsigned long colour, nr;
+
+ if (dax_is_pmd_entry(entry)) {
+ colour = PG_PMD_COLOUR;
+ nr = PG_PMD_NR;
+ } else {
+ colour = PG_PUD_COLOUR;
+ nr = PG_PUD_NR;
+ }
+
+ /*
+ * Make sure 'entry' remains valid while we drop
+ * the i_pages lock.
+ */
+ dax_lock_entry(xas, entry);
+
+ /*
+ * Besides huge zero pages the only other thing that gets
+ * downgraded are empty entries which don't need to be
+ * unmapped.
+ */
+ if (dax_is_zero_entry(entry)) {
+ xas_unlock_irq(xas);
+ unmap_mapping_pages(mapping, xas->xa_index & ~colour,
+ nr, false);
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ }
+
+ dax_disassociate_entry(entry, mapping, false);
+ xas_store(xas, NULL); /* undo the PMD join */
+ dax_wake_entry(xas, entry, WAKE_ALL);
+ mapping->nrpages -= nr;
+ entry = NULL;
+ xas_set(xas, index);
+ }
+
+ if (entry) {
+ dax_lock_entry(xas, entry);
+ } else {
+ unsigned long flags = DAX_EMPTY;
+
+ if (order == PUD_SHIFT - PAGE_SHIFT)
+ flags |= DAX_PUD;
+ else if (order == PMD_SHIFT - PAGE_SHIFT)
+ flags |= DAX_PMD;
+ entry = dax_make_entry(pfn_to_pfn_t(0), flags);
+ dax_lock_entry(xas, entry);
+ if (xas_error(xas))
+ goto out_unlock;
+ mapping->nrpages += 1UL << order;
+ }
+
+out_unlock:
+ xas_unlock_irq(xas);
+ if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
+ goto retry;
+ if (xas->xa_node == XA_ERROR(-ENOMEM))
+ return vmfault_to_dax_err(VM_FAULT_OOM);
+ if (xas_error(xas))
+ return vmfault_to_dax_err(VM_FAULT_SIGBUS);
+ return entry;
+fallback:
+ xas_unlock_irq(xas);
+ return vmfault_to_dax_err(VM_FAULT_FALLBACK);
+}
+EXPORT_SYMBOL_NS_GPL(dax_grab_mapping_entry, DAX);
+
+/*
+ * Store @entry with DAX_ZAP set at the position described by @xas and
+ * return what xas_store() reports as the previous content.
+ */
+static void *dax_zap_entry(struct xa_state *xas, void *entry)
+{
+	void *zapped = xa_mk_value(xa_to_value(entry) | DAX_ZAP);
+
+	return xas_store(xas, zapped);
+}
+
+/*
+ * Zap @entry if that has not happened yet (dropping one reference per
+ * folio and setting DAX_ZAP), then report whether any folio is still busy.
+ *
+ * Return NULL if the entry is zapped and all pages in the entry are
+ * idle, otherwise return the non-idle page in the entry
+ */
+static struct page *dax_zap_pages(struct xa_state *xas, void *entry)
+{
+	struct page *busy = NULL;
+	struct folio *folio;
+	bool first_zap;
+	int i;
+
+	if (dax_entry_size(entry) == 0)
+		return NULL;
+
+	first_zap = !dax_is_zapped(entry);
+
+	dax_for_each_folio(entry, folio, i) {
+		if (first_zap)
+			folio_put(folio);
+		/* Only the first non-idle folio is reported. */
+		if (busy)
+			continue;
+		if (!dax_folio_idle(folio))
+			busy = folio_page(folio, 0);
+	}
+
+	if (first_zap)
+		dax_zap_entry(xas, entry);
+
+	return busy;
+}
+
+/**
+ * dax_zap_mappings_range - find first pinned page in @mapping
+ * @mapping: address space to scan for a page with ref count > 1
+ * @start: Starting offset. Page containing 'start' is included.
+ * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
+ * pages from 'start' till the end of file are included.
+ *
+ * DAX requires ZONE_DEVICE mapped pages. These pages are never
+ * 'onlined' to the page allocator so they are considered idle when
+ * page->count == 1. A filesystem uses this interface to determine if
+ * any page in the mapping is busy, i.e. for DMA, or other
+ * get_user_pages() usages.
+ *
+ * It is expected that the filesystem is holding locks to block the
+ * establishment of new mappings in this address_space. I.e. it expects
+ * to be able to run unmap_mapping_range() and subsequently not race
+ * mapping_mapped() becoming true.
+ *
+ * The range is unmapped first, then every entry in it is passed to
+ * dax_zap_pages(); the scan stops at the first entry that reports a busy
+ * page. The i_pages lock is dropped for cond_resched() every XA_CHECK_SCHED
+ * entries.
+ *
+ * Return: the first non-idle page found, or NULL if there is none, if
+ * @mapping is not a DAX mapping, or if CONFIG_FS_DAX_LIMITED is enabled.
+ */
+struct page *dax_zap_mappings_range(struct address_space *mapping, loff_t start,
+ loff_t end)
+{
+ void *entry;
+ unsigned int scanned = 0;
+ struct page *page = NULL;
+ pgoff_t start_idx = start >> PAGE_SHIFT;
+ pgoff_t end_idx;
+ XA_STATE(xas, &mapping->i_pages, start_idx);
+
+ /*
+ * In the 'limited' case get_user_pages() for dax is disabled.
+ */
+ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+ return NULL;
+
+ if (!dax_mapping(mapping))
+ return NULL;
+
+ /* If end == LLONG_MAX, all pages from start to till end of file */
+ if (end == LLONG_MAX)
+ end_idx = ULONG_MAX;
+ else
+ end_idx = end >> PAGE_SHIFT;
+ /*
+ * If we race get_user_pages_fast() here either we'll see the
+ * elevated page count in the iteration and wait, or
+ * get_user_pages_fast() will see that the page it took a reference
+ * against is no longer mapped in the page tables and bail to the
+ * get_user_pages() slow path. The slow path is protected by
+ * pte_lock() and pmd_lock(). New references are not taken without
+ * holding those locks, and unmap_mapping_pages() will not zero the
+ * pte or pmd without holding the respective lock, so we are
+ * guaranteed to either see new references or prevent new
+ * references from being established.
+ */
+ unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
+
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, entry, end_idx) {
+ if (WARN_ON_ONCE(!xa_is_value(entry)))
+ continue;
+ if (unlikely(dax_is_locked(entry)))
+ entry = get_unlocked_entry(&xas, 0);
+ if (entry)
+ page = dax_zap_pages(&xas, entry);
+ put_unlocked_entry(&xas, entry, WAKE_NEXT);
+ if (page)
+ break;
+ if (++scanned % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
+ }
+ xas_unlock_irq(&xas);
+ return page;
+}
+EXPORT_SYMBOL_GPL(dax_zap_mappings_range);
+
+/*
+ * Whole-file variant of dax_zap_mappings_range(): scan from offset 0 with
+ * LLONG_MAX as the end, which that function treats as "till end of file".
+ */
+struct page *dax_zap_mappings(struct address_space *mapping)
+{
+	const loff_t whole_file_end = LLONG_MAX;
+
+	return dax_zap_mappings_range(mapping, 0, whole_file_end);
+}
+EXPORT_SYMBOL_GPL(dax_zap_mappings);
+
+static int __dax_invalidate_entry(struct address_space *mapping,
+ pgoff_t index, bool trunc)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+ int ret = 0; /* becomes 1 only when an entry was actually removed */
+ void *entry;
+
+ xas_lock_irq(&xas);
+ entry = get_unlocked_entry(&xas, 0); /* sleeps until a locked entry is released */
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+ goto out;
+ if (!trunc && (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
+ xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
+ goto out; /* without @trunc, dirty or to-write entries are kept */
+ dax_disassociate_entry(entry, mapping, trunc);
+ xas_store(&xas, NULL);
+ mapping->nrpages -= 1UL << dax_entry_order(entry);
+ ret = 1;
+out:
+ put_unlocked_entry(&xas, entry, WAKE_ALL); /* no-op for a NULL entry */
+ xas_unlock_irq(&xas);
+ return ret;
+}
+
+/*
+ * Zap the mappings in [@index, @end] and wait indefinitely for all pins
+ * to drop; the alternative to waiting is a potential use-after-free
+ * scenario.
+ */
+void dax_break_layouts(struct address_space *mapping, pgoff_t index,
+		       pgoff_t end)
+{
+	struct inode *inode = mapping->host;
+	struct page *busy;
+
+	/*
+	 * To do this without filesystem locks, the inode needs to be
+	 * unreferenced, or device-dax.
+	 */
+	WARN_ON(atomic_read(&inode->i_count) && !S_ISCHR(inode->i_mode));
+
+	for (;;) {
+		busy = dax_zap_mappings_range(mapping, index << PAGE_SHIFT,
+					      end << PAGE_SHIFT);
+		if (!busy)
+			break;
+		wait_var_event(busy, dax_page_idle(busy));
+	}
+}
+
+/*
+ * Delete DAX entry at @index from @mapping. Wait for it
+ * to be unlocked before deleting it. When the mapping is exiting, pins on
+ * the range are waited for first via dax_break_layouts().
+ *
+ * Returns 1 when an entry was removed; 0 (with a one-time warning) if not.
+ */
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	int removed;
+
+	if (mapping_exiting(mapping))
+		dax_break_layouts(mapping, index, index + 1);
+
+	removed = __dax_invalidate_entry(mapping, index, true);
+
+	/*
+	 * This gets called from truncate / punch_hole path. As such, the caller
+	 * must hold locks protecting against concurrent modifications of the
+	 * page cache (usually fs-private i_mmap_sem for writing). Since the
+	 * caller has seen a DAX entry for this index, we better find it
+	 * at that index as well...
+	 */
+	WARN_ON_ONCE(removed == 0);
+
+	return removed;
+}
+
+/*
+ * Invalidate DAX entry if it is clean. Returns 1 if the entry at @index was
+ * removed, 0 if there was none or it carries a dirty / to-write mark.
+ */
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index)
+{
+	const bool trunc = false;
+
+	return __dax_invalidate_entry(mapping, index, trunc);
+}
+
+/*
+ * By this point dax_grab_mapping_entry() has ensured that we have a locked
+ * entry of the appropriate size so we don't have to worry about downgrading
+ * PMDs to PTEs. If we happen to be trying to insert a PTE and there is a PMD
+ * already in the tree, we will skip the insertion and just dirty the PMD as
+ * appropriate.
+ *
+ * @pentry points at the locked entry on entry. On success it is updated to
+ * the entry that now represents the mapping (the new entry if it was swapped
+ * in, otherwise unchanged) and 0 is returned. If dax_associate_entry() fails
+ * its vm_fault_t is returned and *@pentry is left untouched. The i_pages
+ * lock is taken and released internally; the entry stays locked.
+ */
+vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
+ void **pentry, pfn_t pfn, unsigned long flags)
+{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ void *new_entry = dax_make_entry(pfn, flags);
+ bool dirty = flags & DAX_DIRTY;
+ bool cow = flags & DAX_COW;
+ void *entry = *pentry;
+ vm_fault_t ret = 0;
+
+ if (dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+ if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
+ unsigned long index = xas->xa_index;
+ /* we are replacing a zero page with block mapping */
+ if (dax_is_pud_entry(entry))
+ unmap_mapping_pages(mapping, index & ~PG_PUD_COLOUR,
+ PG_PUD_NR, false);
+ else if (dax_is_pmd_entry(entry))
+ unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
+ PG_PMD_NR, false);
+ else /* pte entry */
+ unmap_mapping_pages(mapping, index, 1, false);
+ }
+
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ void *old;
+
+ dax_disassociate_entry(entry, mapping, false);
+ ret = dax_associate_entry(new_entry, mapping, vmf, flags);
+ if (ret)
+ goto out;
+ /*
+ * Only swap our new entry into the page cache if the current
+ * entry is a zero page or an empty entry. If a normal PTE or
+ * PMD entry is already in the cache, we leave it alone. This
+ * means that if we are trying to insert a PTE and the
+ * existing entry is a PMD, we will just leave the PMD in the
+ * tree and dirty it if necessary.
+ */
+ old = dax_lock_entry(xas, new_entry);
+ WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
+ DAX_LOCKED));
+ entry = new_entry;
+ } else {
+ xas_load(xas); /* Walk the xa_state */
+ }
+
+ if (dirty)
+ xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
+
+ if (cow)
+ xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
+
+ *pentry = entry;
+out:
+ xas_unlock_irq(xas);
+
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(dax_insert_entry, DAX);
+
+int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
+ struct address_space *mapping, void *entry) __must_hold(xas)
+{
+ unsigned long pfn, index, count, end;
+ long ret = 0; /* stays 0 unless the re-read entry is empty/zero (-EIO) */
+ struct vm_area_struct *vma;
+
+ /*
+ * A page got tagged dirty in DAX mapping? Something is seriously
+ * wrong.
+ */
+ if (WARN_ON(!xa_is_value(entry)))
+ return -EIO;
+
+ if (unlikely(dax_is_locked(entry))) {
+ void *old_entry = entry;
+
+ entry = get_unlocked_entry(xas, 0);
+
+ /* Entry got punched out / reallocated? */
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+ goto put_unlocked;
+ /*
+ * Entry got reallocated elsewhere? No need to writeback.
+ * We have to compare pfns as we must not bail out due to
+ * difference in lockbit or entry type.
+ */
+ if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
+ goto put_unlocked;
+ if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+ dax_is_zero_entry(entry))) {
+ ret = -EIO;
+ goto put_unlocked;
+ }
+
+ /* Another fsync thread may have already done this entry */
+ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
+ goto put_unlocked;
+ }
+
+ /* Lock the entry to serialize with page faults */
+ dax_lock_entry(xas, entry);
+
+ /*
+ * We can clear the tag now but we have to be careful so that concurrent
+ * dax_writeback_one() calls for the same index cannot finish before we
+ * actually flush the caches. This is achieved as the calls will look
+ * at the entry only under the i_pages lock and once they do that
+ * they will see the entry locked and wait for it to unlock.
+ */
+ xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irq(xas);
+
+ /*
+ * If dax_writeback_mapping_range() was given a wbc->range_start
+ * in the middle of a PMD, the 'index' we use needs to be
+ * aligned to the start of the PMD.
+ * This allows us to flush for PMD_SIZE and not have to worry about
+ * partial PMD writebacks.
+ */
+ pfn = dax_to_pfn(entry);
+ count = 1UL << dax_entry_order(entry);
+ index = xas->xa_index & ~(count - 1);
+ end = index + count - 1;
+
+ /* Walk all mappings of a given index of a file and writeprotect them */
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
+ pfn_mkclean_range(pfn, count, index, vma);
+ cond_resched();
+ }
+ i_mmap_unlock_read(mapping);
+
+ dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
+ /*
+ * After we have flushed the cache, we can clear the dirty tag. There
+ * cannot be new dirty data in the pfn after the flush has completed as
+ * the pfn mappings are writeprotected and fault waits for mapping
+ * entry lock.
+ *
+ * The i_pages lock is retaken here and is still held when the function
+ * returns, as it also is on the put_unlocked path below. Storing the
+ * unlocked 'entry' is what releases the entry lock.
+ */
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ xas_store(xas, entry);
+ xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
+ dax_wake_entry(xas, entry, WAKE_NEXT);
+
+ trace_dax_writeback_one(mapping->host, index, count);
+ return ret;
+
+ put_unlocked:
+ put_unlocked_entry(xas, entry, WAKE_NEXT);
+ return ret;
+}
+
+/*
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pfn: PFN to insert
+ * @order: Order of entry to insert.
+ *
+ * This function inserts a writeable PTE or PMD entry into the page tables
+ * for an mmaped DAX file. It also marks the page cache entry as dirty.
+ * The code also handles @order == PUD_ORDER by inserting a PUD entry; any
+ * other order yields VM_FAULT_FALLBACK (as does PMD_ORDER when
+ * CONFIG_FS_DAX_PMD is not set).
+ *
+ * Return: VM_FAULT_NOPAGE if the page cache entry is gone, conflicts with
+ * @order, or is a huge entry while @order is 0; otherwise the result of the
+ * page table insertion.
+ */
+vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn,
+ unsigned int order)
+{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+ void *entry;
+ vm_fault_t ret;
+
+ xas_lock_irq(&xas);
+ entry = get_unlocked_entry(&xas, order);
+ /* Did we race with someone splitting entry or so? */
+ if (!entry || dax_is_conflict(entry) ||
+ (order == 0 && !dax_is_pte_entry(entry))) {
+ put_unlocked_entry(&xas, entry, WAKE_NEXT);
+ xas_unlock_irq(&xas);
+ trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
+ VM_FAULT_NOPAGE);
+ return VM_FAULT_NOPAGE;
+ }
+ xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
+ dax_lock_entry(&xas, entry);
+ xas_unlock_irq(&xas);
+ if (order == 0)
+ ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+#ifdef CONFIG_FS_DAX_PMD
+ else if (order == PMD_ORDER)
+ ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
+#endif
+ else if (order == PUD_ORDER)
+ ret = vmf_insert_pfn_pud(vmf, pfn, FAULT_FLAG_WRITE);
+ else
+ ret = VM_FAULT_FALLBACK;
+ dax_unlock_entry(&xas, entry);
+ trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
+ return ret;
+}
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index da4438f3188c..866bd0c0ebee 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -42,13 +42,13 @@ static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;
-int dax_read_lock(void)
+int dax_read_lock(void) __acquires(&dax_srcu)
{
return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);
-void dax_read_unlock(int id)
+void dax_read_unlock(int id) __releases(&dax_srcu)
{
srcu_read_unlock(&dax_srcu, id);
}
@@ -475,7 +475,7 @@ EXPORT_SYMBOL_GPL(put_dax);
/**
* dax_holder() - obtain the holder of a dax device
* @dax_dev: a dax_device instance
-
+ *
* Return: the holder's data which represents the holder if registered,
* otherwise NULL.
*/
@@ -564,6 +564,8 @@ static int __init dax_core_init(void)
if (rc)
return rc;
+ dax_mapping_init();
+
rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax");
if (rc)
goto err_chrdev;
@@ -590,5 +592,5 @@ static void __exit dax_core_exit(void)
MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
-subsys_initcall(dax_core_init);
+fs_initcall(dax_core_init);
module_exit(dax_core_exit);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 8ef31d687ef3..4728be161828 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -255,7 +255,7 @@ static int amdgpu_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_str
* becoming writable and makes is_cow_mapping(vm_flags) false.
*/
if (is_cow_mapping(vma->vm_flags) &&
- !(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
+ !(vma->vm_flags & VM_ACCESS_FLAGS))
vma->vm_flags &= ~VM_MAYWRITE;
return drm_gem_ttm_mmap(obj, vma);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index d4e6de2a6bf6..20d6b2578927 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -215,15 +215,18 @@ svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr)
return (addr + adev->kfd.dev->pgmap.range.start) >> PAGE_SHIFT;
}
-static void
-svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
+static void svm_migrate_get_vram_page(struct dev_pagemap *pgmap,
+ struct svm_range *prange,
+ unsigned long pfn)
{
+ struct folio *folio;
struct page *page;
- page = pfn_to_page(pfn);
+ folio = pgmap_request_folio(pgmap, pfn_to_pgmap_offset(pgmap, pfn), 0);
+ page = &folio->page;
svm_range_bo_ref(prange->svm_bo);
page->zone_device_data = prange->svm_bo;
- zone_device_page_init(page);
+ lock_page(page);
}
static void
@@ -297,6 +300,7 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
struct migrate_vma *migrate, struct dma_fence **mfence,
dma_addr_t *scratch)
{
+ struct kfd_dev *kfddev = adev->kfd.dev;
uint64_t npages = migrate->npages;
struct device *dev = adev->dev;
struct amdgpu_res_cursor cursor;
@@ -324,7 +328,8 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
dst[i] = cursor.start + (j << PAGE_SHIFT);
migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]);
- svm_migrate_get_vram_page(prange, migrate->dst[i]);
+ svm_migrate_get_vram_page(&kfddev->pgmap, prange,
+ migrate->dst[i]);
migrate->dst[i] = migrate_pfn(migrate->dst[i]);
spage = migrate_pfn_to_page(migrate->src[i]);
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 789857faa048..91aebf3e007b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -307,6 +307,9 @@ static struct page *
nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
{
struct nouveau_dmem_chunk *chunk;
+ struct dev_pagemap *pgmap;
+ struct folio *folio;
+ unsigned long pfn;
struct page *page = NULL;
int ret;
@@ -316,16 +319,21 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
drm->dmem->free_pages = page->zone_device_data;
chunk = nouveau_page_to_chunk(page);
chunk->callocated++;
+ pfn = page_to_pfn(page);
spin_unlock(&drm->dmem->lock);
} else {
spin_unlock(&drm->dmem->lock);
ret = nouveau_dmem_chunk_alloc(drm, &page);
if (ret)
return NULL;
+ chunk = nouveau_page_to_chunk(page);
+ pfn = page_to_pfn(page);
}
- zone_device_page_init(page);
- return page;
+ pgmap = &chunk->pagemap;
+ folio = pgmap_request_folio(pgmap, pfn_to_pgmap_offset(pgmap, pfn), 0);
+ lock_page(&folio->page);
+ return &folio->page;
}
static void
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 5b1af40221ec..a86e5c8da1b1 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as,
* allocate page in a sleeping context if GFP flags permit. Hence
* spinlock needs to be unlocked and re-locked after allocation.
*/
- if (!(gfp & __GFP_ATOMIC))
+ if (gfp & __GFP_DIRECT_RECLAIM)
spin_unlock_irqrestore(&as->lock, *flags);
page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO);
- if (!(gfp & __GFP_ATOMIC))
+ if (gfp & __GFP_DIRECT_RECLAIM)
spin_lock_irqsave(&as->lock, *flags);
/*
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 5a29046e3319..24bdc87a4b99 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -19,7 +19,7 @@ if LIBNVDIMM
config BLK_DEV_PMEM
tristate "PMEM: Persistent memory block device support"
default LIBNVDIMM
- select DAX
+ select DAX if MMU
select ND_BTT if BTT
select ND_PFN if NVDIMM_PFN
help
@@ -78,6 +78,7 @@ config NVDIMM_DAX
bool "NVDIMM DAX: Raw access to persistent memory"
default LIBNVDIMM
depends on NVDIMM_PFN
+ depends on DAX
help
Support raw device dax access to a persistent memory
namespace. For environments that want to hard partition
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 96e6e9a5f235..3c63dc2cdc81 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -468,6 +468,32 @@ static const struct dev_pagemap_ops fsdax_pagemap_ops = {
.memory_failure = pmem_pagemap_memory_failure,
};
+static int setup_dax(struct pmem_device *pmem, struct gendisk *disk,
+ struct nd_region *nd_region)
+{
+ struct dax_device *dax_dev;
+ int rc;
+
+ dax_dev = alloc_dax(pmem, &pmem_dax_ops);
+ if (IS_ERR(dax_dev))
+ return PTR_ERR(dax_dev);
+ if (!dax_dev)
+ return 0;
+ set_dax_nocache(dax_dev);
+ set_dax_nomc(dax_dev);
+ if (is_nvdimm_sync(nd_region))
+ set_dax_synchronous(dax_dev);
+ rc = dax_add_host(dax_dev, disk);
+ if (rc) {
+ kill_dax(dax_dev);
+ put_dax(dax_dev);
+ return rc;
+ }
+ dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
+ pmem->dax_dev = dax_dev;
+ return 0;
+}
+
static int pmem_attach_disk(struct device *dev,
struct nd_namespace_common *ndns)
{
@@ -477,7 +503,6 @@ static int pmem_attach_disk(struct device *dev,
struct resource *res = &nsio->res;
struct range bb_range;
struct nd_pfn *nd_pfn = NULL;
- struct dax_device *dax_dev;
struct nd_pfn_sb *pfn_sb;
struct pmem_device *pmem;
struct request_queue *q;
@@ -578,24 +603,13 @@ static int pmem_attach_disk(struct device *dev,
nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_range);
disk->bb = &pmem->bb;
- dax_dev = alloc_dax(pmem, &pmem_dax_ops);
- if (IS_ERR(dax_dev)) {
- rc = PTR_ERR(dax_dev);
- goto out;
- }
- set_dax_nocache(dax_dev);
- set_dax_nomc(dax_dev);
- if (is_nvdimm_sync(nd_region))
- set_dax_synchronous(dax_dev);
- rc = dax_add_host(dax_dev, disk);
+ rc = setup_dax(pmem, disk, nd_region);
if (rc)
- goto out_cleanup_dax;
- dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
- pmem->dax_dev = dax_dev;
+ goto out;
rc = device_add_disk(dev, disk, pmem_attribute_groups);
if (rc)
- goto out_remove_host;
+ goto out_dax;
if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
return -ENOMEM;
@@ -607,9 +621,8 @@ static int pmem_attach_disk(struct device *dev,
dev_warn(dev, "'badblocks' notification disabled\n");
return 0;
-out_remove_host:
+out_dax:
dax_remove_host(pmem->disk);
-out_cleanup_dax:
kill_dax(pmem->dax_dev);
put_dax(pmem->dax_dev);
out:
diff --git a/fs/coredump.c b/fs/coredump.c
index a133103eb721..d57a8ea5a5cb 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -325,6 +325,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
err = cn_printf(cn, "%lu",
rlimit(RLIMIT_CORE));
break;
+ /* CPU the task ran on */
+ case 'C':
+ err = cn_printf(cn, "%d", cprm->cpu);
+ break;
default:
break;
}
@@ -534,6 +538,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
*/
.mm_flags = mm->flags,
.vma_meta = NULL,
+ .cpu = raw_smp_processor_id(),
};
audit_core_dumps(siginfo->si_signo);
diff --git a/fs/dax.c b/fs/dax.c
index 1c6867810cbd..b4953248add6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -27,782 +27,8 @@
#include <linux/rmap.h>
#include <asm/pgalloc.h>
-#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
-static inline unsigned int pe_order(enum page_entry_size pe_size)
-{
- if (pe_size == PE_SIZE_PTE)
- return PAGE_SHIFT - PAGE_SHIFT;
- if (pe_size == PE_SIZE_PMD)
- return PMD_SHIFT - PAGE_SHIFT;
- if (pe_size == PE_SIZE_PUD)
- return PUD_SHIFT - PAGE_SHIFT;
- return ~0;
-}
-
-/* We choose 4096 entries - same as per-zone page wait tables */
-#define DAX_WAIT_TABLE_BITS 12
-#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
-
-/* The 'colour' (ie low bits) within a PMD of a page offset. */
-#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
-#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
-
-/* The order of a PMD entry */
-#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
-
-static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
-
-static int __init init_dax_wait_table(void)
-{
- int i;
-
- for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
- init_waitqueue_head(wait_table + i);
- return 0;
-}
-fs_initcall(init_dax_wait_table);
-
-/*
- * DAX pagecache entries use XArray value entries so they can't be mistaken
- * for pages. We use one bit for locking, one bit for the entry size (PMD)
- * and two more to tell us if the entry is a zero page or an empty entry that
- * is just used for locking. In total four special bits.
- *
- * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
- * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
- * block allocation.
- */
-#define DAX_SHIFT (4)
-#define DAX_LOCKED (1UL << 0)
-#define DAX_PMD (1UL << 1)
-#define DAX_ZERO_PAGE (1UL << 2)
-#define DAX_EMPTY (1UL << 3)
-
-static unsigned long dax_to_pfn(void *entry)
-{
- return xa_to_value(entry) >> DAX_SHIFT;
-}
-
-static void *dax_make_entry(pfn_t pfn, unsigned long flags)
-{
- return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
-}
-
-static bool dax_is_locked(void *entry)
-{
- return xa_to_value(entry) & DAX_LOCKED;
-}
-
-static unsigned int dax_entry_order(void *entry)
-{
- if (xa_to_value(entry) & DAX_PMD)
- return PMD_ORDER;
- return 0;
-}
-
-static unsigned long dax_is_pmd_entry(void *entry)
-{
- return xa_to_value(entry) & DAX_PMD;
-}
-
-static bool dax_is_pte_entry(void *entry)
-{
- return !(xa_to_value(entry) & DAX_PMD);
-}
-
-static int dax_is_zero_entry(void *entry)
-{
- return xa_to_value(entry) & DAX_ZERO_PAGE;
-}
-
-static int dax_is_empty_entry(void *entry)
-{
- return xa_to_value(entry) & DAX_EMPTY;
-}
-
-/*
- * true if the entry that was found is of a smaller order than the entry
- * we were looking for
- */
-static bool dax_is_conflict(void *entry)
-{
- return entry == XA_RETRY_ENTRY;
-}
-
-/*
- * DAX page cache entry locking
- */
-struct exceptional_entry_key {
- struct xarray *xa;
- pgoff_t entry_start;
-};
-
-struct wait_exceptional_entry_queue {
- wait_queue_entry_t wait;
- struct exceptional_entry_key key;
-};
-
-/**
- * enum dax_wake_mode: waitqueue wakeup behaviour
- * @WAKE_ALL: wake all waiters in the waitqueue
- * @WAKE_NEXT: wake only the first waiter in the waitqueue
- */
-enum dax_wake_mode {
- WAKE_ALL,
- WAKE_NEXT,
-};
-
-static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
- void *entry, struct exceptional_entry_key *key)
-{
- unsigned long hash;
- unsigned long index = xas->xa_index;
-
- /*
- * If 'entry' is a PMD, align the 'index' that we use for the wait
- * queue to the start of that PMD. This ensures that all offsets in
- * the range covered by the PMD map to the same bit lock.
- */
- if (dax_is_pmd_entry(entry))
- index &= ~PG_PMD_COLOUR;
- key->xa = xas->xa;
- key->entry_start = index;
-
- hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
- return wait_table + hash;
-}
-
-static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
- unsigned int mode, int sync, void *keyp)
-{
- struct exceptional_entry_key *key = keyp;
- struct wait_exceptional_entry_queue *ewait =
- container_of(wait, struct wait_exceptional_entry_queue, wait);
-
- if (key->xa != ewait->key.xa ||
- key->entry_start != ewait->key.entry_start)
- return 0;
- return autoremove_wake_function(wait, mode, sync, NULL);
-}
-
-/*
- * @entry may no longer be the entry at the index in the mapping.
- * The important information it's conveying is whether the entry at
- * this index used to be a PMD entry.
- */
-static void dax_wake_entry(struct xa_state *xas, void *entry,
- enum dax_wake_mode mode)
-{
- struct exceptional_entry_key key;
- wait_queue_head_t *wq;
-
- wq = dax_entry_waitqueue(xas, entry, &key);
-
- /*
- * Checking for locked entry and prepare_to_wait_exclusive() happens
- * under the i_pages lock, ditto for entry handling in our callers.
- * So at this point all tasks that could have seen our entry locked
- * must be in the waitqueue and the following check will see them.
- */
- if (waitqueue_active(wq))
- __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
-}
-
-/*
- * Look up entry in page cache, wait for it to become unlocked if it
- * is a DAX entry and return it. The caller must subsequently call
- * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
- * if it did. The entry returned may have a larger order than @order.
- * If @order is larger than the order of the entry found in i_pages, this
- * function returns a dax_is_conflict entry.
- *
- * Must be called with the i_pages lock held.
- */
-static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
-{
- void *entry;
- struct wait_exceptional_entry_queue ewait;
- wait_queue_head_t *wq;
-
- init_wait(&ewait.wait);
- ewait.wait.func = wake_exceptional_entry_func;
-
- for (;;) {
- entry = xas_find_conflict(xas);
- if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
- return entry;
- if (dax_entry_order(entry) < order)
- return XA_RETRY_ENTRY;
- if (!dax_is_locked(entry))
- return entry;
-
- wq = dax_entry_waitqueue(xas, entry, &ewait.key);
- prepare_to_wait_exclusive(wq, &ewait.wait,
- TASK_UNINTERRUPTIBLE);
- xas_unlock_irq(xas);
- xas_reset(xas);
- schedule();
- finish_wait(wq, &ewait.wait);
- xas_lock_irq(xas);
- }
-}
-
-/*
- * The only thing keeping the address space around is the i_pages lock
- * (it's cycled in clear_inode() after removing the entries from i_pages)
- * After we call xas_unlock_irq(), we cannot touch xas->xa.
- */
-static void wait_entry_unlocked(struct xa_state *xas, void *entry)
-{
- struct wait_exceptional_entry_queue ewait;
- wait_queue_head_t *wq;
-
- init_wait(&ewait.wait);
- ewait.wait.func = wake_exceptional_entry_func;
-
- wq = dax_entry_waitqueue(xas, entry, &ewait.key);
- /*
- * Unlike get_unlocked_entry() there is no guarantee that this
- * path ever successfully retrieves an unlocked entry before an
- * inode dies. Perform a non-exclusive wait in case this path
- * never successfully performs its own wake up.
- */
- prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
- xas_unlock_irq(xas);
- schedule();
- finish_wait(wq, &ewait.wait);
-}
-
-static void put_unlocked_entry(struct xa_state *xas, void *entry,
- enum dax_wake_mode mode)
-{
- if (entry && !dax_is_conflict(entry))
- dax_wake_entry(xas, entry, mode);
-}
-
-/*
- * We used the xa_state to get the entry, but then we locked the entry and
- * dropped the xa_lock, so we know the xa_state is stale and must be reset
- * before use.
- */
-static void dax_unlock_entry(struct xa_state *xas, void *entry)
-{
- void *old;
-
- BUG_ON(dax_is_locked(entry));
- xas_reset(xas);
- xas_lock_irq(xas);
- old = xas_store(xas, entry);
- xas_unlock_irq(xas);
- BUG_ON(!dax_is_locked(old));
- dax_wake_entry(xas, entry, WAKE_NEXT);
-}
-
-/*
- * Return: The entry stored at this location before it was locked.
- */
-static void *dax_lock_entry(struct xa_state *xas, void *entry)
-{
- unsigned long v = xa_to_value(entry);
- return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
-}
-
-static unsigned long dax_entry_size(void *entry)
-{
- if (dax_is_zero_entry(entry))
- return 0;
- else if (dax_is_empty_entry(entry))
- return 0;
- else if (dax_is_pmd_entry(entry))
- return PMD_SIZE;
- else
- return PAGE_SIZE;
-}
-
-static unsigned long dax_end_pfn(void *entry)
-{
- return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
-}
-
-/*
- * Iterate through all mapped pfns represented by an entry, i.e. skip
- * 'empty' and 'zero' entries.
- */
-#define for_each_mapped_pfn(entry, pfn) \
- for (pfn = dax_to_pfn(entry); \
- pfn < dax_end_pfn(entry); pfn++)
-
-static inline bool dax_mapping_is_cow(struct address_space *mapping)
-{
- return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
-}
-
-/*
- * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
- */
-static inline void dax_mapping_set_cow(struct page *page)
-{
- if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
- /*
- * Reset the index if the page was already mapped
- * regularly before.
- */
- if (page->mapping)
- page->index = 1;
- page->mapping = (void *)PAGE_MAPPING_DAX_COW;
- }
- page->index++;
-}
-
-/*
- * When it is called in dax_insert_entry(), the cow flag will indicate that
- * whether this entry is shared by multiple files. If so, set the page->mapping
- * FS_DAX_MAPPING_COW, and use page->index as refcount.
- */
-static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address, bool cow)
-{
- unsigned long size = dax_entry_size(entry), pfn, index;
- int i = 0;
-
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return;
-
- index = linear_page_index(vma, address & ~(size - 1));
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
-
- if (cow) {
- dax_mapping_set_cow(page);
- } else {
- WARN_ON_ONCE(page->mapping);
- page->mapping = mapping;
- page->index = index + i++;
- }
- }
-}
-
-static void dax_disassociate_entry(void *entry, struct address_space *mapping,
- bool trunc)
-{
- unsigned long pfn;
-
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return;
-
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
-
- WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- if (dax_mapping_is_cow(page->mapping)) {
- /* keep the CoW flag if this page is still shared */
- if (page->index-- > 0)
- continue;
- } else
- WARN_ON_ONCE(page->mapping && page->mapping != mapping);
- page->mapping = NULL;
- page->index = 0;
- }
-}
-
-static struct page *dax_busy_page(void *entry)
-{
- unsigned long pfn;
-
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
-
- if (page_ref_count(page) > 1)
- return page;
- }
- return NULL;
-}
-
-/*
- * dax_lock_page - Lock the DAX entry corresponding to a page
- * @page: The page whose entry we want to lock
- *
- * Context: Process context.
- * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
- * not be locked.
- */
-dax_entry_t dax_lock_page(struct page *page)
-{
- XA_STATE(xas, NULL, 0);
- void *entry;
-
- /* Ensure page->mapping isn't freed while we look at it */
- rcu_read_lock();
- for (;;) {
- struct address_space *mapping = READ_ONCE(page->mapping);
-
- entry = NULL;
- if (!mapping || !dax_mapping(mapping))
- break;
-
- /*
- * In the device-dax case there's no need to lock, a
- * struct dev_pagemap pin is sufficient to keep the
- * inode alive, and we assume we have dev_pagemap pin
- * otherwise we would not have a valid pfn_to_page()
- * translation.
- */
- entry = (void *)~0UL;
- if (S_ISCHR(mapping->host->i_mode))
- break;
-
- xas.xa = &mapping->i_pages;
- xas_lock_irq(&xas);
- if (mapping != page->mapping) {
- xas_unlock_irq(&xas);
- continue;
- }
- xas_set(&xas, page->index);
- entry = xas_load(&xas);
- if (dax_is_locked(entry)) {
- rcu_read_unlock();
- wait_entry_unlocked(&xas, entry);
- rcu_read_lock();
- continue;
- }
- dax_lock_entry(&xas, entry);
- xas_unlock_irq(&xas);
- break;
- }
- rcu_read_unlock();
- return (dax_entry_t)entry;
-}
-
-void dax_unlock_page(struct page *page, dax_entry_t cookie)
-{
- struct address_space *mapping = page->mapping;
- XA_STATE(xas, &mapping->i_pages, page->index);
-
- if (S_ISCHR(mapping->host->i_mode))
- return;
-
- dax_unlock_entry(&xas, (void *)cookie);
-}
-
-/*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
- * @mapping: the file's mapping whose entry we want to lock
- * @index: the offset within this file
- * @page: output the dax page corresponding to this dax entry
- *
- * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
- * could not be locked.
- */
-dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
- struct page **page)
-{
- XA_STATE(xas, NULL, 0);
- void *entry;
-
- rcu_read_lock();
- for (;;) {
- entry = NULL;
- if (!dax_mapping(mapping))
- break;
-
- xas.xa = &mapping->i_pages;
- xas_lock_irq(&xas);
- xas_set(&xas, index);
- entry = xas_load(&xas);
- if (dax_is_locked(entry)) {
- rcu_read_unlock();
- wait_entry_unlocked(&xas, entry);
- rcu_read_lock();
- continue;
- }
- if (!entry ||
- dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
- /*
- * Because we are looking for entry from file's mapping
- * and index, so the entry may not be inserted for now,
- * or even a zero/empty entry. We don't think this is
- * an error case. So, return a special value and do
- * not output @page.
- */
- entry = (void *)~0UL;
- } else {
- *page = pfn_to_page(dax_to_pfn(entry));
- dax_lock_entry(&xas, entry);
- }
- xas_unlock_irq(&xas);
- break;
- }
- rcu_read_unlock();
- return (dax_entry_t)entry;
-}
-
-void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
- dax_entry_t cookie)
-{
- XA_STATE(xas, &mapping->i_pages, index);
-
- if (cookie == ~0UL)
- return;
-
- dax_unlock_entry(&xas, (void *)cookie);
-}
-
-/*
- * Find page cache entry at given index. If it is a DAX entry, return it
- * with the entry locked. If the page cache doesn't contain an entry at
- * that index, add a locked empty entry.
- *
- * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
- * either return that locked entry or will return VM_FAULT_FALLBACK.
- * This will happen if there are any PTE entries within the PMD range
- * that we are requesting.
- *
- * We always favor PTE entries over PMD entries. There isn't a flow where we
- * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
- * insertion will fail if it finds any PTE entries already in the tree, and a
- * PTE insertion will cause an existing PMD entry to be unmapped and
- * downgraded to PTE entries. This happens for both PMD zero pages as
- * well as PMD empty entries.
- *
- * The exception to this downgrade path is for PMD entries that have
- * real storage backing them. We will leave these real PMD entries in
- * the tree, and PTE writes will simply dirty the entire PMD entry.
- *
- * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
- * persistent memory the benefit is doubtful. We can add that later if we can
- * show it helps.
- *
- * On error, this function does not return an ERR_PTR. Instead it returns
- * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
- * overlap with xarray value entries.
- */
-static void *grab_mapping_entry(struct xa_state *xas,
- struct address_space *mapping, unsigned int order)
-{
- unsigned long index = xas->xa_index;
- bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
- void *entry;
-
-retry:
- pmd_downgrade = false;
- xas_lock_irq(xas);
- entry = get_unlocked_entry(xas, order);
-
- if (entry) {
- if (dax_is_conflict(entry))
- goto fallback;
- if (!xa_is_value(entry)) {
- xas_set_err(xas, -EIO);
- goto out_unlock;
- }
-
- if (order == 0) {
- if (dax_is_pmd_entry(entry) &&
- (dax_is_zero_entry(entry) ||
- dax_is_empty_entry(entry))) {
- pmd_downgrade = true;
- }
- }
- }
-
- if (pmd_downgrade) {
- /*
- * Make sure 'entry' remains valid while we drop
- * the i_pages lock.
- */
- dax_lock_entry(xas, entry);
-
- /*
- * Besides huge zero pages the only other thing that gets
- * downgraded are empty entries which don't need to be
- * unmapped.
- */
- if (dax_is_zero_entry(entry)) {
- xas_unlock_irq(xas);
- unmap_mapping_pages(mapping,
- xas->xa_index & ~PG_PMD_COLOUR,
- PG_PMD_NR, false);
- xas_reset(xas);
- xas_lock_irq(xas);
- }
-
- dax_disassociate_entry(entry, mapping, false);
- xas_store(xas, NULL); /* undo the PMD join */
- dax_wake_entry(xas, entry, WAKE_ALL);
- mapping->nrpages -= PG_PMD_NR;
- entry = NULL;
- xas_set(xas, index);
- }
-
- if (entry) {
- dax_lock_entry(xas, entry);
- } else {
- unsigned long flags = DAX_EMPTY;
-
- if (order > 0)
- flags |= DAX_PMD;
- entry = dax_make_entry(pfn_to_pfn_t(0), flags);
- dax_lock_entry(xas, entry);
- if (xas_error(xas))
- goto out_unlock;
- mapping->nrpages += 1UL << order;
- }
-
-out_unlock:
- xas_unlock_irq(xas);
- if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
- goto retry;
- if (xas->xa_node == XA_ERROR(-ENOMEM))
- return xa_mk_internal(VM_FAULT_OOM);
- if (xas_error(xas))
- return xa_mk_internal(VM_FAULT_SIGBUS);
- return entry;
-fallback:
- xas_unlock_irq(xas);
- return xa_mk_internal(VM_FAULT_FALLBACK);
-}
-
-/**
- * dax_layout_busy_page_range - find first pinned page in @mapping
- * @mapping: address space to scan for a page with ref count > 1
- * @start: Starting offset. Page containing 'start' is included.
- * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
- * pages from 'start' till the end of file are included.
- *
- * DAX requires ZONE_DEVICE mapped pages. These pages are never
- * 'onlined' to the page allocator so they are considered idle when
- * page->count == 1. A filesystem uses this interface to determine if
- * any page in the mapping is busy, i.e. for DMA, or other
- * get_user_pages() usages.
- *
- * It is expected that the filesystem is holding locks to block the
- * establishment of new mappings in this address_space. I.e. it expects
- * to be able to run unmap_mapping_range() and subsequently not race
- * mapping_mapped() becoming true.
- */
-struct page *dax_layout_busy_page_range(struct address_space *mapping,
- loff_t start, loff_t end)
-{
- void *entry;
- unsigned int scanned = 0;
- struct page *page = NULL;
- pgoff_t start_idx = start >> PAGE_SHIFT;
- pgoff_t end_idx;
- XA_STATE(xas, &mapping->i_pages, start_idx);
-
- /*
- * In the 'limited' case get_user_pages() for dax is disabled.
- */
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return NULL;
-
- if (!dax_mapping(mapping) || !mapping_mapped(mapping))
- return NULL;
-
- /* If end == LLONG_MAX, all pages from start to till end of file */
- if (end == LLONG_MAX)
- end_idx = ULONG_MAX;
- else
- end_idx = end >> PAGE_SHIFT;
- /*
- * If we race get_user_pages_fast() here either we'll see the
- * elevated page count in the iteration and wait, or
- * get_user_pages_fast() will see that the page it took a reference
- * against is no longer mapped in the page tables and bail to the
- * get_user_pages() slow path. The slow path is protected by
- * pte_lock() and pmd_lock(). New references are not taken without
- * holding those locks, and unmap_mapping_pages() will not zero the
- * pte or pmd without holding the respective lock, so we are
- * guaranteed to either see new references or prevent new
- * references from being established.
- */
- unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
-
- xas_lock_irq(&xas);
- xas_for_each(&xas, entry, end_idx) {
- if (WARN_ON_ONCE(!xa_is_value(entry)))
- continue;
- if (unlikely(dax_is_locked(entry)))
- entry = get_unlocked_entry(&xas, 0);
- if (entry)
- page = dax_busy_page(entry);
- put_unlocked_entry(&xas, entry, WAKE_NEXT);
- if (page)
- break;
- if (++scanned % XA_CHECK_SCHED)
- continue;
-
- xas_pause(&xas);
- xas_unlock_irq(&xas);
- cond_resched();
- xas_lock_irq(&xas);
- }
- xas_unlock_irq(&xas);
- return page;
-}
-EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
-
-struct page *dax_layout_busy_page(struct address_space *mapping)
-{
- return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
-}
-EXPORT_SYMBOL_GPL(dax_layout_busy_page);
-
-static int __dax_invalidate_entry(struct address_space *mapping,
- pgoff_t index, bool trunc)
-{
- XA_STATE(xas, &mapping->i_pages, index);
- int ret = 0;
- void *entry;
-
- xas_lock_irq(&xas);
- entry = get_unlocked_entry(&xas, 0);
- if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
- goto out;
- if (!trunc &&
- (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
- xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
- goto out;
- dax_disassociate_entry(entry, mapping, trunc);
- xas_store(&xas, NULL);
- mapping->nrpages -= 1UL << dax_entry_order(entry);
- ret = 1;
-out:
- put_unlocked_entry(&xas, entry, WAKE_ALL);
- xas_unlock_irq(&xas);
- return ret;
-}
-
-/*
- * Delete DAX entry at @index from @mapping. Wait for it
- * to be unlocked before deleting it.
- */
-int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
- int ret = __dax_invalidate_entry(mapping, index, true);
-
- /*
- * This gets called from truncate / punch_hole path. As such, the caller
- * must hold locks protecting against concurrent modifications of the
- * page cache (usually fs-private i_mmap_sem for writing). Since the
- * caller has seen a DAX entry for this index, we better find it
- * at that index as well...
- */
- WARN_ON_ONCE(!ret);
- return ret;
-}
-
-/*
- * Invalidate DAX entry if it is clean.
- */
-int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
- pgoff_t index)
-{
- return __dax_invalidate_entry(mapping, index, false);
-}
-
static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{
return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
@@ -830,180 +56,6 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
}
/*
- * MAP_SYNC on a dax mapping guarantees dirty metadata is
- * flushed on write-faults (non-cow), but not read-faults.
- */
-static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
- struct vm_area_struct *vma)
-{
- return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
- (iter->iomap.flags & IOMAP_F_DIRTY);
-}
-
-static bool dax_fault_is_cow(const struct iomap_iter *iter)
-{
- return (iter->flags & IOMAP_WRITE) &&
- (iter->iomap.flags & IOMAP_F_SHARED);
-}
-
-/*
- * By this point grab_mapping_entry() has ensured that we have a locked entry
- * of the appropriate size so we don't have to worry about downgrading PMDs to
- * PTEs. If we happen to be trying to insert a PTE and there is a PMD
- * already in the tree, we will skip the insertion and just dirty the PMD as
- * appropriate.
- */
-static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
- const struct iomap_iter *iter, void *entry, pfn_t pfn,
- unsigned long flags)
-{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- void *new_entry = dax_make_entry(pfn, flags);
- bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
- bool cow = dax_fault_is_cow(iter);
-
- if (dirty)
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
- if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
- unsigned long index = xas->xa_index;
- /* we are replacing a zero page with block mapping */
- if (dax_is_pmd_entry(entry))
- unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
- PG_PMD_NR, false);
- else /* pte entry */
- unmap_mapping_pages(mapping, index, 1, false);
- }
-
- xas_reset(xas);
- xas_lock_irq(xas);
- if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
- void *old;
-
- dax_disassociate_entry(entry, mapping, false);
- dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
- cow);
- /*
- * Only swap our new entry into the page cache if the current
- * entry is a zero page or an empty entry. If a normal PTE or
- * PMD entry is already in the cache, we leave it alone. This
- * means that if we are trying to insert a PTE and the
- * existing entry is a PMD, we will just leave the PMD in the
- * tree and dirty it if necessary.
- */
- old = dax_lock_entry(xas, new_entry);
- WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
- DAX_LOCKED));
- entry = new_entry;
- } else {
- xas_load(xas); /* Walk the xa_state */
- }
-
- if (dirty)
- xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
-
- if (cow)
- xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
-
- xas_unlock_irq(xas);
- return entry;
-}
-
-static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
- struct address_space *mapping, void *entry)
-{
- unsigned long pfn, index, count, end;
- long ret = 0;
- struct vm_area_struct *vma;
-
- /*
- * A page got tagged dirty in DAX mapping? Something is seriously
- * wrong.
- */
- if (WARN_ON(!xa_is_value(entry)))
- return -EIO;
-
- if (unlikely(dax_is_locked(entry))) {
- void *old_entry = entry;
-
- entry = get_unlocked_entry(xas, 0);
-
- /* Entry got punched out / reallocated? */
- if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
- goto put_unlocked;
- /*
- * Entry got reallocated elsewhere? No need to writeback.
- * We have to compare pfns as we must not bail out due to
- * difference in lockbit or entry type.
- */
- if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
- goto put_unlocked;
- if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
- dax_is_zero_entry(entry))) {
- ret = -EIO;
- goto put_unlocked;
- }
-
- /* Another fsync thread may have already done this entry */
- if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
- goto put_unlocked;
- }
-
- /* Lock the entry to serialize with page faults */
- dax_lock_entry(xas, entry);
-
- /*
- * We can clear the tag now but we have to be careful so that concurrent
- * dax_writeback_one() calls for the same index cannot finish before we
- * actually flush the caches. This is achieved as the calls will look
- * at the entry only under the i_pages lock and once they do that
- * they will see the entry locked and wait for it to unlock.
- */
- xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
- xas_unlock_irq(xas);
-
- /*
- * If dax_writeback_mapping_range() was given a wbc->range_start
- * in the middle of a PMD, the 'index' we use needs to be
- * aligned to the start of the PMD.
- * This allows us to flush for PMD_SIZE and not have to worry about
- * partial PMD writebacks.
- */
- pfn = dax_to_pfn(entry);
- count = 1UL << dax_entry_order(entry);
- index = xas->xa_index & ~(count - 1);
- end = index + count - 1;
-
- /* Walk all mappings of a given index of a file and writeprotect them */
- i_mmap_lock_read(mapping);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
- pfn_mkclean_range(pfn, count, index, vma);
- cond_resched();
- }
- i_mmap_unlock_read(mapping);
-
- dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
- /*
- * After we have flushed the cache, we can clear the dirty tag. There
- * cannot be new dirty data in the pfn after the flush has completed as
- * the pfn mappings are writeprotected and fault waits for mapping
- * entry lock.
- */
- xas_reset(xas);
- xas_lock_irq(xas);
- xas_store(xas, entry);
- xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
- dax_wake_entry(xas, entry, WAKE_NEXT);
-
- trace_dax_writeback_one(mapping->host, index, count);
- return ret;
-
- put_unlocked:
- put_unlocked_entry(xas, entry, WAKE_NEXT);
- return ret;
-}
-
-/*
* Flush the mapping to the persistent domain within the byte range of [start,
* end]. This is required by data integrity operations to ensure file data is
* on persistent storage prior to completion of the operation.
@@ -1053,10 +105,9 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
size_t size, void **kaddr, pfn_t *pfnp)
{
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
- int id, rc = 0;
long length;
+ int rc = 0;
- id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
DAX_ACCESS, kaddr, pfnp);
if (length < 0) {
@@ -1081,7 +132,6 @@ out_check_addr:
if (!*kaddr)
rc = -EFAULT;
out:
- dax_read_unlock(id);
return rc;
}
@@ -1140,6 +190,37 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
}
/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
+ struct vm_area_struct *vma)
+{
+ return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
+ (iter->iomap.flags & IOMAP_F_DIRTY);
+}
+
+static bool dax_fault_is_cow(const struct iomap_iter *iter)
+{
+ return (iter->flags & IOMAP_WRITE) &&
+ (iter->iomap.flags & IOMAP_F_SHARED);
+}
+
+static unsigned long dax_iter_flags(const struct iomap_iter *iter,
+ struct vm_fault *vmf)
+{
+ unsigned long flags = 0;
+
+ if (!dax_fault_is_synchronous(iter, vmf->vma))
+ flags |= DAX_DIRTY;
+
+ if (dax_fault_is_cow(iter))
+ flags |= DAX_COW;
+
+ return flags;
+}
+
+/*
* The user has performed a load from a hole in the file. Allocating a new
* page in the file would cause excessive storage usage for workloads with
* sparse files. Instead we insert a read-only mapping of the 4k zero page.
@@ -1154,9 +235,13 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
- *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
+ ret = dax_insert_entry(xas, vmf, entry, pfn,
+ DAX_ZERO_PAGE | dax_iter_flags(iter, vmf));
+ if (ret)
+ goto out;
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
+out:
trace_dax_load_hole(inode, vmf, ret);
return ret;
}
@@ -1173,6 +258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
struct page *zero_page;
spinlock_t *ptl;
pmd_t pmd_entry;
+ vm_fault_t ret;
pfn_t pfn;
zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
@@ -1181,8 +267,11 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
goto fallback;
pfn = page_to_pfn_t(zero_page);
- *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE);
+ ret = dax_insert_entry(xas, vmf, entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE |
+ dax_iter_flags(iter, vmf));
+ if (ret)
+ return ret;
if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm);
@@ -1537,7 +626,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
- int err = 0;
+ int err = 0, id;
+ vm_fault_t ret;
pfn_t pfn;
void *kaddr;
@@ -1557,11 +647,18 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
}
+ id = dax_read_lock();
err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
- if (err)
+ if (err) {
+ dax_read_unlock(id);
return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
+ }
- *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
+ ret = dax_insert_entry(xas, vmf, entry, pfn,
+ entry_flags | dax_iter_flags(iter, vmf));
+ dax_read_unlock(id);
+ if (ret)
+ return ret;
if (write &&
srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
@@ -1612,9 +709,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
iter.flags |= IOMAP_WRITE;
- entry = grab_mapping_entry(&xas, mapping, 0);
- if (xa_is_internal(entry)) {
- ret = xa_to_internal(entry);
+ entry = dax_grab_mapping_entry(&xas, mapping, 0);
+ if (is_dax_err(entry)) {
+ ret = dax_err_to_vmfault(entry);
goto out;
}
@@ -1729,14 +826,14 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
goto fallback;
/*
- * grab_mapping_entry() will make sure we get an empty PMD entry,
+ * dax_grab_mapping_entry() will make sure we get an empty PMD entry,
* a zero PMD entry or a DAX PMD. If it can't (because a PTE
* entry is already in the array, for instance), it will return
* VM_FAULT_FALLBACK.
*/
- entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
- if (xa_is_internal(entry)) {
- ret = xa_to_internal(entry);
+ entry = dax_grab_mapping_entry(&xas, mapping, PMD_ORDER);
+ if (is_dax_err(entry)) {
+ ret = dax_err_to_vmfault(entry);
goto fallback;
}
@@ -1808,50 +905,6 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
-/*
- * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
- * @vmf: The description of the fault
- * @pfn: PFN to insert
- * @order: Order of entry to insert.
- *
- * This function inserts a writeable PTE or PMD entry into the page tables
- * for an mmaped DAX file. It also marks the page cache entry as dirty.
- */
-static vm_fault_t
-dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
-{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
- void *entry;
- vm_fault_t ret;
-
- xas_lock_irq(&xas);
- entry = get_unlocked_entry(&xas, order);
- /* Did we race with someone splitting entry or so? */
- if (!entry || dax_is_conflict(entry) ||
- (order == 0 && !dax_is_pte_entry(entry))) {
- put_unlocked_entry(&xas, entry, WAKE_NEXT);
- xas_unlock_irq(&xas);
- trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
- VM_FAULT_NOPAGE);
- return VM_FAULT_NOPAGE;
- }
- xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
- dax_lock_entry(&xas, entry);
- xas_unlock_irq(&xas);
- if (order == 0)
- ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
-#ifdef CONFIG_FS_DAX_PMD
- else if (order == PMD_ORDER)
- ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
-#endif
- else
- ret = VM_FAULT_FALLBACK;
- dax_unlock_entry(&xas, entry);
- trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
- return ret;
-}
-
/**
* dax_finish_sync_fault - finish synchronous page fault
* @vmf: The description of the fault
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index ddb3fc258df9..b54f470e0d03 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf,
}
EXPORT_SYMBOL_GPL(debugfs_attr_read);
-ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos, bool is_signed)
{
struct dentry *dentry = F_DENTRY(file);
ssize_t ret;
@@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
ret = debugfs_file_get(dentry);
if (unlikely(ret))
return ret;
- ret = simple_attr_write(file, buf, len, ppos);
+ if (is_signed)
+ ret = simple_attr_write_signed(file, buf, len, ppos);
+ else
+ ret = simple_attr_write(file, buf, len, ppos);
debugfs_file_put(dentry);
return ret;
}
+
+ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return debugfs_attr_write_xsigned(file, buf, len, ppos, false);
+}
EXPORT_SYMBOL_GPL(debugfs_attr_write);
+ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return debugfs_attr_write_xsigned(file, buf, len, ppos, true);
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_write_signed);
+
static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode,
struct dentry *parent, void *value,
const struct file_operations *fops,
@@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val)
*val = atomic_read((atomic_t *)data);
return 0;
}
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get,
debugfs_atomic_t_set, "%lld\n");
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
"%lld\n");
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
"%lld\n");
/**
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a8e12ce6673d..fca47470c85a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3964,14 +3964,13 @@ int ext4_break_layouts(struct inode *inode)
return -EINVAL;
do {
- page = dax_layout_busy_page(inode->i_mapping);
+ page = dax_zap_mappings(inode->i_mapping);
if (!page)
return 0;
- error = ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1,
- TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(inode));
+ error = ___wait_var_event(page, dax_page_idle(page),
+ TASK_INTERRUPTIBLE, 0, 0,
+ ext4_wait_dax_page(inode));
} while (error == 0);
return error;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 106fb06e24e8..34b78f380968 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2095,12 +2095,12 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
switch (token) {
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
- if (!*param->string)
+ if (!param->string || !*param->string)
return unnote_qf_name(fc, USRQUOTA);
else
return note_qf_name(fc, USRQUOTA, param);
case Opt_grpjquota:
- if (!*param->string)
+ if (!param->string || !*param->string)
return unnote_qf_name(fc, GRPQUOTA);
else
return note_qf_name(fc, GRPQUOTA, param);
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 24ce12f0db32..df04e5fc6d66 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -96,7 +96,9 @@ int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param)
if (strcmp(param->key, "source") != 0)
return -ENOPARAM;
- if (param->type != fs_value_is_string)
+ /* source value may be NULL */
+ if (param->type != fs_value_is_string &&
+ param->type != fs_value_is_empty)
return invalf(fc, "Non-string source");
if (fc->source)
@@ -175,10 +177,15 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key,
};
if (value) {
- param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
- if (!param.string)
- return -ENOMEM;
- param.type = fs_value_is_string;
+ if (!v_size) {
+ param.string = NULL;
+ param.type = fs_value_is_empty;
+ } else {
+ param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
+ if (!param.string)
+ return -ENOMEM;
+ param.type = fs_value_is_string;
+ }
}
ret = vfs_parse_fs_param(fc, &param);
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index ed40ce5742fd..2046f41ab00b 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -197,6 +197,8 @@ int fs_param_is_bool(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
int b;
+ if (param->type == fs_value_is_empty)
+ return 0;
if (param->type != fs_value_is_string)
return fs_param_bad_value(log, param);
if (!*param->string && (p->flags & fs_param_can_be_empty))
@@ -213,6 +215,8 @@ int fs_param_is_u32(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
int base = (unsigned long)p->data;
+ if (param->type == fs_value_is_empty)
+ return 0;
if (param->type != fs_value_is_string)
return fs_param_bad_value(log, param);
if (!*param->string && (p->flags & fs_param_can_be_empty))
@@ -226,6 +230,8 @@ EXPORT_SYMBOL(fs_param_is_u32);
int fs_param_is_s32(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
+ if (param->type == fs_value_is_empty)
+ return 0;
if (param->type != fs_value_is_string)
return fs_param_bad_value(log, param);
if (!*param->string && (p->flags & fs_param_can_be_empty))
@@ -239,6 +245,8 @@ EXPORT_SYMBOL(fs_param_is_s32);
int fs_param_is_u64(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
+ if (param->type == fs_value_is_empty)
+ return 0;
if (param->type != fs_value_is_string)
return fs_param_bad_value(log, param);
if (!*param->string && (p->flags & fs_param_can_be_empty))
@@ -253,6 +261,8 @@ int fs_param_is_enum(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
const struct constant_table *c;
+ if (param->type == fs_value_is_empty)
+ return 0;
if (param->type != fs_value_is_string)
return fs_param_bad_value(log, param);
if (!*param->string && (p->flags & fs_param_can_be_empty))
@@ -268,6 +278,8 @@ EXPORT_SYMBOL(fs_param_is_enum);
int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
+ if (param->type == fs_value_is_empty)
+ return 0;
if (param->type != fs_value_is_string ||
(!*param->string && !(p->flags & fs_param_can_be_empty)))
return fs_param_bad_value(log, param);
@@ -278,6 +290,8 @@ EXPORT_SYMBOL(fs_param_is_string);
int fs_param_is_blob(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
+ if (param->type == fs_value_is_empty)
+ return 0;
if (param->type != fs_value_is_blob)
return fs_param_bad_value(log, param);
return 0;
@@ -287,6 +301,8 @@ EXPORT_SYMBOL(fs_param_is_blob);
int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
+ if (param->type == fs_value_is_empty)
+ return 0;
switch (param->type) {
case fs_value_is_string:
if ((!*param->string && !(p->flags & fs_param_can_be_empty)) ||
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index e23e802a8013..8cdc9402e8f7 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -443,7 +443,7 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
/*
* Can't do inline reclaim in fault path. We call
- * dax_layout_busy_page() before we free a range. And
+ * dax_zap_mappings() before we free a range. And
* fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
* In fault path we enter with mapping->invalidate_lock held and can't
* drop it. Also in fault path we hold mapping->invalidate_lock shared
@@ -671,14 +671,13 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
{
struct page *page;
- page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+ page = dax_zap_mappings_range(inode->i_mapping, start, end);
if (!page)
return 0;
*retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, fuse_wait_dax_page(inode));
+ return ___wait_var_event(page, dax_page_idle(page), TASK_INTERRUPTIBLE,
+ 0, 0, fuse_wait_dax_page(inode));
}
/* dmap_end == 0 leads to unmapping of whole file */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dd54f67e47fd..3ee84604e36d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -328,6 +328,12 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
+
/*
* We have the page, copy it to user space buffer.
*/
@@ -364,11 +370,11 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void hugetlb_delete_from_page_cache(struct page *page)
+static void hugetlb_delete_from_page_cache(struct folio *folio)
{
- ClearPageDirty(page);
- ClearPageUptodate(page);
- delete_from_page_cache(page);
+ folio_clear_dirty(folio);
+ folio_clear_uptodate(folio);
+ filemap_remove_folio(folio);
}
/*
@@ -574,8 +580,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
* map could fail. Correspondingly, the subpool and global
* reserve usage count can need to be adjusted.
*/
- VM_BUG_ON(HPageRestoreReserve(&folio->page));
- hugetlb_delete_from_page_cache(&folio->page);
+ VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
+ hugetlb_delete_from_page_cache(folio);
ret = true;
if (!truncate_op) {
if (unlikely(hugetlb_unreserve_pages(inode, index,
@@ -1091,10 +1097,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- if (hugetlb_page_subpool(&src->page)) {
- hugetlb_set_page_subpool(&dst->page,
- hugetlb_page_subpool(&src->page));
- hugetlb_set_page_subpool(&src->page, NULL);
+ if (hugetlb_folio_subpool(src)) {
+ hugetlb_set_folio_subpool(dst,
+ hugetlb_folio_subpool(src));
+ hugetlb_set_folio_subpool(src, NULL);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1111,13 +1117,6 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
-
- hugetlb_delete_from_page_cache(page);
- if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
-
return 0;
}
@@ -1378,7 +1377,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->max_size_opt = memparse(param->string, &rest);
ctx->max_val_type = SIZE_STD;
@@ -1388,7 +1387,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_nr_inodes:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->nr_inodes = memparse(param->string, &rest);
return 0;
@@ -1404,7 +1403,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_min_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->min_size_opt = memparse(param->string, &rest);
ctx->min_val_type = SIZE_STD;
diff --git a/fs/libfs.c b/fs/libfs.c
index 5ae81466a422..17ecc47696e1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -995,8 +995,8 @@ out:
EXPORT_SYMBOL_GPL(simple_attr_read);
/* interpret the buffer as a number to call the set function with */
-ssize_t simple_attr_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos, bool is_signed)
{
struct simple_attr *attr;
unsigned long long val;
@@ -1017,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
goto out;
attr->set_buf[size] = '\0';
- ret = kstrtoull(attr->set_buf, 0, &val);
+ if (is_signed)
+ ret = kstrtoll(attr->set_buf, 0, &val);
+ else
+ ret = kstrtoull(attr->set_buf, 0, &val);
if (ret)
goto out;
ret = attr->set(attr->data, val);
@@ -1027,8 +1030,21 @@ out:
mutex_unlock(&attr->mutex);
return ret;
}
+
+ssize_t simple_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return simple_attr_write_xsigned(file, buf, len, ppos, false);
+}
EXPORT_SYMBOL_GPL(simple_attr_write);
+ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return simple_attr_write_xsigned(file, buf, len, ppos, true);
+}
+EXPORT_SYMBOL_GPL(simple_attr_write_signed);
+
/**
* generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
* @sb: filesystem to do the file handle conversion on
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b13d344d40b6..60b97c92e2b2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -335,7 +335,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg)
/* negotiate timeout must be less than write timeout. */
schedule_delayed_work(&reg->hr_nego_timeout_work,
msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
- memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
+ bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES);
}
static void o2hb_disarm_timeout(struct o2hb_region *reg)
@@ -375,7 +375,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
if (reg->hr_last_hb_status)
return;
- o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES);
/* lowest node as master node to make negotiate decision. */
master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES);
@@ -386,8 +386,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
config_item_name(&reg->hr_item), reg->hr_bdev);
set_bit(master_node, reg->hr_nego_node_bitmap);
}
- if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
- sizeof(reg->hr_nego_node_bitmap))) {
+ if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap,
+ O2NM_MAX_NODES)) {
/* check negotiate bitmap every second to do timeout
* approve decision.
*/
@@ -856,8 +856,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
* live nodes heartbeat on it. In other words, the region has been
* added to all nodes.
*/
- if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
- sizeof(o2hb_live_node_bitmap)))
+ if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+ O2NM_MAX_NODES))
goto unlock;
printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n",
@@ -1087,7 +1087,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* If a node is not configured but is in the livemap, we still need
* to read the slot so as to be able to remove it from the livemap.
*/
- o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES);
i = -1;
while ((i = find_next_bit(live_node_bitmap,
O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
@@ -1437,11 +1437,11 @@ void o2hb_init(void)
for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
INIT_LIST_HEAD(&o2hb_live_slots[i]);
- memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
- memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
- memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
- memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
- memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
+ bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES);
+ bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS);
o2hb_dependent_users = 0;
@@ -1450,23 +1450,21 @@ void o2hb_init(void)
/* if we're already in a callback then we're already serialized by the sem */
static void o2hb_fill_node_map_from_callback(unsigned long *map,
- unsigned bytes)
+ unsigned int bits)
{
- BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
-
- memcpy(map, &o2hb_live_node_bitmap, bytes);
+ bitmap_copy(map, o2hb_live_node_bitmap, bits);
}
/*
* get a map of all nodes that are heartbeating in any regions
*/
-void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
+void o2hb_fill_node_map(unsigned long *map, unsigned int bits)
{
/* callers want to serialize this map and callbacks so that they
* can trust that they don't miss nodes coming to the party */
down_read(&o2hb_callback_sem);
spin_lock(&o2hb_live_lock);
- o2hb_fill_node_map_from_callback(map, bytes);
+ o2hb_fill_node_map_from_callback(map, bits);
spin_unlock(&o2hb_live_lock);
up_read(&o2hb_callback_sem);
}
@@ -2460,7 +2458,7 @@ int o2hb_check_node_heartbeating_no_sem(u8 node_num)
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
spin_lock(&o2hb_live_lock);
- o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES);
spin_unlock(&o2hb_live_lock);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
@@ -2477,7 +2475,7 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 1d4100abf6f8..8ef8c1b9eeb7 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -59,7 +59,7 @@ int o2hb_register_callback(const char *region_uuid,
void o2hb_unregister_callback(const char *region_uuid,
struct o2hb_callback_func *hc);
void o2hb_fill_node_map(unsigned long *map,
- unsigned bytes);
+ unsigned int bits);
void o2hb_exit(void);
void o2hb_init(void);
int o2hb_check_node_heartbeating_no_sem(u8 node_num);
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 7524994e3199..35c05c18de59 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -438,7 +438,7 @@ static int o2net_fill_bitmap(char *buf, int len)
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
int i = -1, out = 0;
- o2net_fill_node_map(map, sizeof(map));
+ o2net_fill_node_map(map, O2NM_MAX_NODES);
while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 27fee68f860a..2f61d39e4e50 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -54,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
return -EINVAL;
read_lock(&cluster->cl_nodes_lock);
- memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
+ bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES);
read_unlock(&cluster->cl_nodes_lock);
return 0;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index f660c0dbdb63..6f5a3fb97c7f 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -990,14 +990,12 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
}
/* Get a map of all nodes to which this node is currently connected to */
-void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+void o2net_fill_node_map(unsigned long *map, unsigned int bits)
{
struct o2net_sock_container *sc;
int node, ret;
- BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
-
- memset(map, 0, bytes);
+ bitmap_zero(map, bits);
for (node = 0; node < O2NM_MAX_NODES; ++node) {
if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
continue;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index fd2022712167..20f790a47484 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1094,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err)
static inline void dlm_node_iter_init(unsigned long *map,
struct dlm_node_iter *iter)
{
- memcpy(iter->node_map, map, sizeof(iter->node_map));
+ bitmap_copy(iter->node_map, map, O2NM_MAX_NODES);
iter->curnode = -1;
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c4eccd499db8..5c04dde99981 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1576,8 +1576,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm,
spin_lock(&dlm->spinlock);
/* For now, we restart the process if the node maps have
* changed at all */
- ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
- sizeof(dlm->live_nodes_map));
+ ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map,
+ O2NM_MAX_NODES);
spin_unlock(&dlm->spinlock);
if (ret)
@@ -1604,13 +1604,11 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
/* group sem locking should work for us here -- we're already
* registered for heartbeat events so filling this should be
* atomic wrt getting those handlers called. */
- o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
+ o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES);
spin_lock(&dlm->spinlock);
- memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
-
+ bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES);
__dlm_set_joining_node(dlm, dlm->node_num);
-
spin_unlock(&dlm->spinlock);
node = -1;
@@ -1643,8 +1641,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
* yes_resp_map. Copy that into our domain map and send a join
* assert message to clean up everyone elses state. */
spin_lock(&dlm->spinlock);
- memcpy(dlm->domain_map, ctxt->yes_resp_map,
- sizeof(ctxt->yes_resp_map));
+ bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES);
set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
@@ -2009,9 +2006,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
dlm->recovery_map, &(dlm->recovery_map[0]));
- memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
- memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
- memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
+ bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES);
+ bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES);
+ bitmap_zero(dlm->domain_map, O2NM_MAX_NODES);
dlm->dlm_thread_task = NULL;
dlm->dlm_reco_thread_task = NULL;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 227da5b1b6ab..d610da8e2f24 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -258,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
mle->type = type;
INIT_HLIST_NODE(&mle->master_hash_node);
INIT_LIST_HEAD(&mle->hb_events);
- memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+ bitmap_zero(mle->maybe_map, O2NM_MAX_NODES);
spin_lock_init(&mle->spinlock);
init_waitqueue_head(&mle->wq);
atomic_set(&mle->woken, 0);
kref_init(&mle->mle_refs);
- memset(mle->response_map, 0, sizeof(mle->response_map));
+ bitmap_zero(mle->response_map, O2NM_MAX_NODES);
mle->master = O2NM_MAX_NODES;
mle->new_master = O2NM_MAX_NODES;
mle->inuse = 0;
@@ -290,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
atomic_inc(&dlm->mle_cur_count[mle->type]);
/* copy off the node_map and register hb callbacks on our copy */
- memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
- memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
+ bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES);
+ bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES);
clear_bit(dlm->node_num, mle->vote_map);
clear_bit(dlm->node_num, mle->node_map);
@@ -572,7 +572,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
spin_unlock(&dlm->track_lock);
memset(res->lvb, 0, DLM_LVB_LEN);
- memset(res->refmap, 0, sizeof(res->refmap));
+ bitmap_zero(res->refmap, O2NM_MAX_NODES);
}
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
@@ -1036,10 +1036,10 @@ recheck:
spin_lock(&mle->spinlock);
m = mle->master;
- map_changed = (memcmp(mle->vote_map, mle->node_map,
- sizeof(mle->vote_map)) != 0);
- voting_done = (memcmp(mle->vote_map, mle->response_map,
- sizeof(mle->vote_map)) == 0);
+ map_changed = !bitmap_equal(mle->vote_map, mle->node_map,
+ O2NM_MAX_NODES);
+ voting_done = bitmap_equal(mle->vote_map, mle->response_map,
+ O2NM_MAX_NODES);
/* restart if we hit any errors */
if (map_changed) {
@@ -1277,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
/* now blank out everything, as if we had never
* contacted anyone */
- memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
- memset(mle->response_map, 0, sizeof(mle->response_map));
+ bitmap_zero(mle->maybe_map, O2NM_MAX_NODES);
+ bitmap_zero(mle->response_map, O2NM_MAX_NODES);
/* reset the vote_map to the current node_map */
- memcpy(mle->vote_map, mle->node_map,
- sizeof(mle->node_map));
+ bitmap_copy(mle->vote_map, mle->node_map,
+ O2NM_MAX_NODES);
/* put myself into the maybe map */
if (mle->type != DLM_MLE_BLOCK)
set_bit(dlm->node_num, mle->maybe_map);
@@ -2094,7 +2094,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
flags = item->u.am.flags;
spin_lock(&dlm->spinlock);
- memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
+ bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES);
spin_unlock(&dlm->spinlock);
clear_bit(dlm->node_num, nodemap);
@@ -3447,7 +3447,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = 0;
}
- memset(iter.node_map, 0, sizeof(iter.node_map));
+ bitmap_zero(iter.node_map, O2NM_MAX_NODES);
set_bit(old_master, iter.node_map);
mlog(0, "doing assert master of %.*s back to %u\n",
res->lockname.len, res->lockname.name, old_master);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 52ad342fec3e..50da8af988c1 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -733,7 +733,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_reco_node_data *ndata;
spin_lock(&dlm->spinlock);
- memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
+ bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES);
/* nodes can only be removed (by dying) after dropping
* this lock, and death will be trapped later, so this should do */
spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 88f75f7f02d7..c973c03f6fd8 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -273,17 +273,17 @@ static int o2cb_cluster_check(void)
*/
#define O2CB_MAP_STABILIZE_COUNT 60
for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
- o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ o2hb_fill_node_map(hbmap, O2NM_MAX_NODES);
if (!test_bit(node_num, hbmap)) {
printk(KERN_ERR "o2cb: %s heartbeat has not been "
"started.\n", (o2hb_global_heartbeat_active() ?
"Global" : "Local"));
return -EINVAL;
}
- o2net_fill_node_map(netmap, sizeof(netmap));
+ o2net_fill_node_map(netmap, O2NM_MAX_NODES);
/* Force set the current node to allow easy compare */
set_bit(node_num, netmap);
- if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES))
return 0;
if (i < O2CB_MAP_STABILIZE_COUNT - 1)
msleep(1000);
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index fa762c5fbcb2..91fe1597af7b 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
static int cmdline_proc_show(struct seq_file *m, void *v)
{
@@ -13,7 +14,10 @@ static int cmdline_proc_show(struct seq_file *m, void *v)
static int __init proc_cmdline_init(void)
{
- proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ pde->size = saved_command_line_len + 1;
return 0;
}
fs_initcall(proc_cmdline_init);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 913bef0d2a36..fc46d6fe080c 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -7,6 +7,7 @@
#include <linux/namei.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
+#include <linux/bitmap.h>
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
@@ -279,6 +280,30 @@ out:
return 0;
}
+static int proc_readfd_count(struct inode *inode, loff_t *count)
+{
+ struct task_struct *p = get_proc_task(inode);
+ struct fdtable *fdt;
+
+ if (!p)
+ return -ENOENT;
+
+ task_lock(p);
+ if (p->files) {
+ rcu_read_lock();
+
+ fdt = files_fdtable(p->files);
+ *count = bitmap_weight(fdt->open_fds, fdt->max_fds);
+
+ rcu_read_unlock();
+ }
+ task_unlock(p);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
static int proc_readfd(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx, proc_fd_instantiate);
@@ -319,9 +344,29 @@ int proc_fd_permission(struct user_namespace *mnt_userns,
return rv;
}
+static int proc_fd_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ int rv = 0;
+
+ generic_fillattr(&init_user_ns, inode, stat);
+
+ /* If it's a directory, put the number of open fds there */
+ if (S_ISDIR(inode->i_mode)) {
+ rv = proc_readfd_count(inode, &stat->size);
+ if (rv < 0)
+ return rv;
+ }
+
+ return rv;
+}
+
const struct inode_operations proc_fd_inode_operations = {
.lookup = proc_lookupfd,
.permission = proc_fd_permission,
+ .getattr = proc_fd_getattr,
.setattr = proc_setattr,
};
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index dff921f7ca33..71157ee35c1a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -18,7 +18,6 @@
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
-#include <linux/notifier.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
@@ -541,25 +540,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
fallthrough;
case KCORE_VMEMMAP:
case KCORE_TEXT:
- if (kern_addr_valid(start)) {
- /*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
- */
- if (copy_from_kernel_nofault(buf, (void *)start,
- tsz)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else {
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
- }
+ /*
+ * Using bounce buffer to bypass the
+ * hardened user copy kernel text checks.
+ */
+ if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
}
} else {
- if (clear_user(buffer, tsz)) {
+ if (copy_to_user(buffer, buf, tsz)) {
ret = -EFAULT;
goto out;
}
@@ -638,10 +629,6 @@ static int __meminit kcore_callback(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block kcore_callback_nb __meminitdata = {
- .notifier_call = kcore_callback,
- .priority = 0,
-};
static struct kcore_list kcore_vmalloc;
@@ -694,7 +681,7 @@ static int __init proc_kcore_init(void)
add_modules_range();
/* Store direct-map area from physical memory map */
kcore_update_ram();
- register_hotmemory_notifier(&kcore_callback_nb);
+ hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI);
return 0;
}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index f2aa86c421f2..74747571d58e 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1567,6 +1567,7 @@ static int __init vmcore_init(void)
return rc;
rc = parse_crash_elf_headers();
if (rc) {
+ elfcorehdr_free(elfcorehdr_addr);
pr_warn("Kdump: vmcore not initialized\n");
return rc;
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c6c80265c0b2..ca0afcdd98c0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -822,14 +822,13 @@ xfs_break_dax_layouts(
ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
- page = dax_layout_busy_page(inode->i_mapping);
+ page = dax_zap_mappings(inode->i_mapping);
if (!page)
return 0;
*retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, xfs_wait_dax_page(inode));
+ return ___wait_var_event(page, dax_page_idle(page), TASK_INTERRUPTIBLE,
+ 0, 0, xfs_wait_dax_page(inode));
}
int
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c000b74dd203..1f2eea4c394e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3480,8 +3480,8 @@ again:
* need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
* for this nested lock case.
*/
- page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
- if (page && page_ref_count(page) != 1) {
+ page = dax_zap_mappings(VFS_I(ip2)->i_mapping);
+ if (page) {
xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
goto again;
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index f55a37efdb97..7af9e34ec261 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -82,26 +82,21 @@
#define __noscs __attribute__((__no_sanitize__("shadow-call-stack")))
#endif
-#if __has_attribute(__no_sanitize_address__)
-#define __no_sanitize_address __attribute__((no_sanitize_address))
-#else
-#define __no_sanitize_address
-#endif
+#define __no_sanitize_address __attribute__((__no_sanitize_address__))
-#if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__)
-#define __no_sanitize_thread __attribute__((no_sanitize_thread))
+#if defined(__SANITIZE_THREAD__)
+#define __no_sanitize_thread __attribute__((__no_sanitize_thread__))
#else
#define __no_sanitize_thread
#endif
-#if __has_attribute(__no_sanitize_undefined__)
-#define __no_sanitize_undefined __attribute__((no_sanitize_undefined))
-#else
-#define __no_sanitize_undefined
-#endif
+#define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__))
+/*
+ * Only supported since gcc >= 12
+ */
#if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__)
-#define __no_sanitize_coverage __attribute__((no_sanitize_coverage))
+#define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__))
#else
#define __no_sanitize_coverage
#endif
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 08a1d3e7e46d..191dcf5af6cb 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -22,6 +22,7 @@ struct coredump_params {
struct file *file;
unsigned long limit;
unsigned long mm_flags;
+ int cpu;
loff_t written;
loff_t pos;
loff_t to_skip;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index ba985333e26b..f4fc37933fc2 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -157,31 +157,46 @@ static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
int dax_writeback_mapping_range(struct address_space *mapping,
struct dax_device *dax_dev, struct writeback_control *wbc);
-struct page *dax_layout_busy_page(struct address_space *mapping);
-struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
+#else
+static inline int dax_writeback_mapping_range(struct address_space *mapping,
+ struct dax_device *dax_dev, struct writeback_control *wbc)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif
+
+int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+ const struct iomap_ops *ops);
+int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ const struct iomap_ops *ops);
+
+#if IS_ENABLED(CONFIG_DAX)
+int dax_read_lock(void);
+void dax_read_unlock(int id);
dax_entry_t dax_lock_page(struct page *page);
void dax_unlock_page(struct page *page, dax_entry_t cookie);
+void run_dax(struct dax_device *dax_dev);
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
unsigned long index, struct page **page);
void dax_unlock_mapping_entry(struct address_space *mapping,
unsigned long index, dax_entry_t cookie);
+struct page *dax_zap_mappings(struct address_space *mapping);
+struct page *dax_zap_mappings_range(struct address_space *mapping, loff_t start,
+ loff_t end);
#else
-static inline struct page *dax_layout_busy_page(struct address_space *mapping)
+static inline struct page *dax_zap_mappings(struct address_space *mapping)
{
return NULL;
}
-static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages)
+static inline struct page *dax_zap_mappings_range(struct address_space *mapping,
+ pgoff_t start,
+ pgoff_t nr_pages)
{
return NULL;
}
-static inline int dax_writeback_mapping_range(struct address_space *mapping,
- struct dax_device *dax_dev, struct writeback_control *wbc)
-{
- return -EOPNOTSUPP;
-}
-
static inline dax_entry_t dax_lock_page(struct page *page)
{
if (IS_DAX(page->mapping->host))
@@ -193,6 +208,15 @@ static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
{
}
+static inline int dax_read_lock(void)
+{
+ return 0;
+}
+
+static inline void dax_read_unlock(int id)
+{
+}
+
static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
unsigned long index, struct page **page)
{
@@ -205,24 +229,20 @@ static inline void dax_unlock_mapping_entry(struct address_space *mapping,
}
#endif
-int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
- const struct iomap_ops *ops);
-int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops);
-
-#if IS_ENABLED(CONFIG_DAX)
-int dax_read_lock(void);
-void dax_read_unlock(int id);
-#else
-static inline int dax_read_lock(void)
+/*
+ * Document all the code locations that want to know when a dax page is
+ * unreferenced.
+ */
+static inline bool dax_page_idle(struct page *page)
{
- return 0;
+ return page_ref_count(page) == 0;
}
-static inline void dax_read_unlock(int id)
+static inline bool dax_folio_idle(struct folio *folio)
{
+ return dax_page_idle(folio_page(folio, 0));
}
-#endif /* CONFIG_DAX */
+
bool dax_alive(struct dax_device *dax_dev);
void *dax_get_private(struct dax_device *dax_dev);
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
@@ -243,9 +263,30 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
pfn_t *pfnp, int *errp, const struct iomap_ops *ops);
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
enum page_entry_size pe_size, pfn_t pfn);
+
+static inline bool is_dax_err(void *entry)
+{
+ return xa_is_internal(entry);
+}
+
+static inline vm_fault_t dax_err_to_vmfault(void *entry)
+{
+ return (vm_fault_t __force)(xa_to_internal(entry));
+}
+
+static inline void *vmfault_to_dax_err(vm_fault_t error)
+{
+ return xa_mk_internal((unsigned long __force)error);
+}
+
+void *dax_grab_mapping_entry(struct xa_state *xas,
+ struct address_space *mapping, unsigned int order);
+void dax_unlock_entry(struct xa_state *xas, void *entry);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
+void dax_break_layouts(struct address_space *mapping, pgoff_t index,
+ pgoff_t end);
int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
struct inode *dest, loff_t destoff,
loff_t len, bool *is_same,
@@ -259,6 +300,67 @@ static inline bool dax_mapping(struct address_space *mapping)
return mapping->host && IS_DAX(mapping->host);
}
+/*
+ * DAX pagecache entries use XArray value entries so they can't be
+ * mistaken for pages. We use one bit for locking, two bits for the
+ * entry size (PMD, PUD), two more to tell us if the entry is a zero page
+ * or an empty entry that is just used for locking, and one for DAX_ZAP.
+ * In total 6 special bits, which limits the max pfn that can be stored as:
+ * (1UL << 57 - PAGE_SHIFT). 63 - DAX_SHIFT - 1 (for xa_mk_value()).
+ *
+ * If the P{M,U}D bits are not set the entry has size PAGE_SIZE, and if
+ * the ZERO_PAGE and EMPTY bits aren't set the entry is a normal DAX
+ * entry with a filesystem block allocation.
+ */
+#define DAX_SHIFT (6)
+#define DAX_MASK ((1UL << DAX_SHIFT) - 1)
+#define DAX_LOCKED (1UL << 0)
+#define DAX_PMD (1UL << 1)
+#define DAX_PUD (1UL << 2)
+#define DAX_ZERO_PAGE (1UL << 3)
+#define DAX_EMPTY (1UL << 4)
+#define DAX_ZAP (1UL << 5)
+
+/*
+ * These flags are not conveyed in Xarray value entries, they are just
+ * modifiers to dax_insert_entry().
+ */
+#define DAX_DIRTY (1UL << (DAX_SHIFT + 0))
+#define DAX_COW (1UL << (DAX_SHIFT + 1))
+
+vm_fault_t dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
+ void **pentry, pfn_t pfn, unsigned long flags);
+vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn,
+ unsigned int order);
+int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
+ struct address_space *mapping, void *entry);
+
+#ifdef CONFIG_MMU
+/* The 'colour' (ie low bits) within a PMD of a page offset. */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
+
+/* The order of a PMD entry */
+#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
+
+/* The 'colour' (ie low bits) within a PUD of a page offset. */
+#define PG_PUD_COLOUR ((PUD_SIZE >> PAGE_SHIFT) - 1)
+#define PG_PUD_NR (PUD_SIZE >> PAGE_SHIFT)
+
+/* The order of a PUD entry */
+#define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT)
+
+static inline unsigned int pe_order(enum page_entry_size pe_size)
+{
+ if (pe_size == PE_SIZE_PTE)
+ return PAGE_SHIFT - PAGE_SHIFT;
+ if (pe_size == PE_SIZE_PMD)
+ return PMD_SHIFT - PAGE_SHIFT;
+ if (pe_size == PE_SIZE_PUD)
+ return PUD_SHIFT - PAGE_SHIFT;
+ return ~0;
+}
+
#ifdef CONFIG_DEV_DAX_HMEM_DEVICES
void hmem_register_device(int target_nid, struct resource *r);
#else
@@ -266,5 +368,6 @@ static inline void hmem_register_device(int target_nid, struct resource *r)
{
}
#endif
+#endif /* CONFIG_MMU */
#endif
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index f60674692d36..ea2d919fd9c7 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -45,7 +45,7 @@ struct debugfs_u32_array {
extern struct dentry *arch_debugfs_dir;
-#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \
+#define DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \
static int __fops ## _open(struct inode *inode, struct file *file) \
{ \
__simple_attr_check_format(__fmt, 0ull); \
@@ -56,10 +56,16 @@ static const struct file_operations __fops = { \
.open = __fops ## _open, \
.release = simple_attr_release, \
.read = debugfs_attr_read, \
- .write = debugfs_attr_write, \
+ .write = (__is_signed) ? debugfs_attr_write_signed : debugfs_attr_write, \
.llseek = no_llseek, \
}
+#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \
+ DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false)
+
+#define DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \
+ DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true)
+
typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *);
#if defined(CONFIG_DEBUG_FS)
@@ -102,6 +108,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos);
ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos);
+ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos);
struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
struct dentry *new_dir, const char *new_name);
@@ -254,6 +262,13 @@ static inline ssize_t debugfs_attr_write(struct file *file,
return -ENODEV;
}
+static inline ssize_t debugfs_attr_write_signed(struct file *file,
+ const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return -ENODEV;
+}
+
static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
struct dentry *new_dir, char *new_name)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cffc402c2451..fa5ba1b1cbcd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3453,7 +3453,7 @@ void simple_transaction_set(struct file *file, size_t n);
* All attributes contain a text representation of a numeric value
* that are accessed with the get() and set() functions.
*/
-#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \
+#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \
static int __fops ## _open(struct inode *inode, struct file *file) \
{ \
__simple_attr_check_format(__fmt, 0ull); \
@@ -3464,10 +3464,16 @@ static const struct file_operations __fops = { \
.open = __fops ## _open, \
.release = simple_attr_release, \
.read = simple_attr_read, \
- .write = simple_attr_write, \
+ .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \
.llseek = generic_file_llseek, \
}
+#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \
+ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false)
+
+#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \
+ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true)
+
static inline __printf(1, 2)
void __simple_attr_check_format(const char *fmt, ...)
{
@@ -3482,6 +3488,8 @@ ssize_t simple_attr_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos);
ssize_t simple_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos);
+ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos);
struct ctl_table;
int __init list_bdev_fs_names(char *buf, size_t size);
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 6fbf49cc10e4..79e6a377062b 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -50,7 +50,8 @@ enum fs_context_phase {
*/
enum fs_value_type {
fs_value_is_undefined,
- fs_value_is_flag, /* Value not given a value */
+ fs_value_is_flag, /* Does not take a value */
+ fs_value_is_empty, /* Value is not given */
fs_value_is_string, /* Value is a string */
fs_value_is_blob, /* Value is a binary blob */
fs_value_is_filename, /* Value is a filename* + dirfd */
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index d88c46ca82e1..5088637fe5c2 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t;
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
#define ___GFP_ZERO 0x100u
-#define ___GFP_ATOMIC 0x200u
+/* 0x200u unused */
#define ___GFP_DIRECT_RECLAIM 0x400u
#define ___GFP_KSWAPD_RECLAIM 0x800u
#define ___GFP_WRITE 0x1000u
@@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t;
*
* %__GFP_HIGH indicates that the caller is high-priority and that granting
* the request is necessary before the system can make forward progress.
- * For example, creating an IO context to clean pages.
- *
- * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
- * high priority. Users are typically interrupt handlers. This may be
- * used in conjunction with %__GFP_HIGH
+ * For example, creating an IO context to clean pages, and requests
+ * from atomic context.
*
* %__GFP_MEMALLOC allows access to all memory. This should only be used when
* the caller guarantees the allocation will allow more memory to be freed
@@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t;
* %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
* This takes precedence over the %__GFP_MEMALLOC flag if both are set.
*/
-#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC)
#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH)
#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
@@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t;
* version does not attempt reclaim/compaction at all and is by default used
* in page fault path, while the non-light is used by khugepaged.
*/
-#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index e9912da5441b..a32c64681f03 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -319,6 +319,30 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
#endif
+#ifdef copy_mc_to_kernel
+static inline int copy_mc_user_highpage(struct page *to, struct page *from,
+ unsigned long vaddr, struct vm_area_struct *vma)
+{
+ unsigned long ret;
+ char *vfrom, *vto;
+
+ vfrom = kmap_local_page(from);
+ vto = kmap_local_page(to);
+ ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
+ kunmap_local(vto);
+ kunmap_local(vfrom);
+
+ return ret;
+}
+#else
+static inline int copy_mc_user_highpage(struct page *to, struct page *from,
+ unsigned long vaddr, struct vm_area_struct *vma)
+{
+ copy_user_highpage(to, from, vaddr, vma);
+ return 0;
+}
+#endif
+
#ifndef __HAVE_ARCH_COPY_HIGHPAGE
static inline void copy_highpage(struct page *to, struct page *from)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a1341fdcf666..5d861905df46 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -16,12 +16,22 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
struct vm_area_struct *vma);
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
+vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
+ pgprot_t pgprot, bool write);
#else
static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
}
+
+static inline vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf,
+ pfn_t pfn, pgprot_t pgprot,
+ bool write)
+{
+ return VM_FAULT_SIGBUS;
+}
#endif
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
@@ -58,8 +68,6 @@ static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn,
{
return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write);
}
-vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
- pgprot_t pgprot, bool write);
/**
* vmf_insert_pfn_pud - insert a pud size pfn
@@ -258,10 +266,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
return folio_order(folio) >= HPAGE_PMD_ORDER;
}
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags, struct dev_pagemap **pgmap);
+ pud_t *pud, int flags);
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
@@ -420,14 +426,8 @@ static inline void mm_put_huge_zero_page(struct mm_struct *mm)
return;
}
-static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
- return NULL;
-}
-
static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
- unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
+ unsigned long addr, pud_t *pud, int flags)
{
return NULL;
}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fb4f27adca4f..badcb277603d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -149,6 +149,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long len);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
struct vm_area_struct *, struct vm_area_struct *);
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags);
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
struct page **, struct vm_area_struct **,
unsigned long *, unsigned long *, long, unsigned int,
@@ -183,10 +185,11 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
int isolate_hugetlb(struct page *page, struct list_head *list);
-int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison);
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared);
void putback_active_hugepage(struct page *page);
-void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
+void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
@@ -211,17 +214,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
- int write);
-struct page *follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd,
- int flags, int pdshift);
-struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address,
- int flags);
-struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags);
-struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
- pgd_t *pgd, int flags);
void hugetlb_vma_lock_read(struct vm_area_struct *vma);
void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
@@ -274,6 +266,12 @@ static inline void adjust_range_if_pmd_sharing_possible(
{
}
+static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
+}
+
static inline long follow_hugetlb_page(struct mm_struct *mm,
struct vm_area_struct *vma, struct page **pages,
struct vm_area_struct **vmas, unsigned long *position,
@@ -284,12 +282,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm,
return 0;
}
-static inline struct page *follow_huge_addr(struct mm_struct *mm,
- unsigned long address, int write)
-{
- return ERR_PTR(-EINVAL);
-}
-
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
struct mm_struct *src,
struct vm_area_struct *dst_vma,
@@ -322,31 +314,6 @@ static inline void hugetlb_show_meminfo_node(int nid)
{
}
-static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd, int flags,
- int pdshift)
-{
- return NULL;
-}
-
-static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma,
- unsigned long address, int flags)
-{
- return NULL;
-}
-
-static inline struct page *follow_huge_pud(struct mm_struct *mm,
- unsigned long address, pud_t *pud, int flags)
-{
- return NULL;
-}
-
-static inline struct page *follow_huge_pgd(struct mm_struct *mm,
- unsigned long address, pgd_t *pgd, int flags)
-{
- return NULL;
-}
-
static inline int prepare_hugepage_range(struct file *file,
unsigned long addr, unsigned long len)
{
@@ -427,12 +394,13 @@ static inline int isolate_hugetlb(struct page *page, struct list_head *list)
return -EBUSY;
}
-static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison)
{
return 0;
}
-static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
return 0;
}
@@ -441,8 +409,8 @@ static inline void putback_active_hugepage(struct page *page)
{
}
-static inline void move_hugetlb_state(struct page *oldpage,
- struct page *newpage, int reason)
+static inline void move_hugetlb_state(struct folio *old_folio,
+ struct folio *new_folio, int reason)
{
}
@@ -630,26 +598,50 @@ enum hugetlb_page_flags {
*/
#ifdef CONFIG_HUGETLB_PAGE
#define TESTHPAGEFLAG(uname, flname) \
+static __always_inline \
+bool folio_test_hugetlb_##flname(struct folio *folio) \
+ { void *private = &folio->private; \
+ return test_bit(HPG_##flname, private); \
+ } \
static inline int HPage##uname(struct page *page) \
{ return test_bit(HPG_##flname, &(page->private)); }
#define SETHPAGEFLAG(uname, flname) \
+static __always_inline \
+void folio_set_hugetlb_##flname(struct folio *folio) \
+ { void *private = &folio->private; \
+ set_bit(HPG_##flname, private); \
+ } \
static inline void SetHPage##uname(struct page *page) \
{ set_bit(HPG_##flname, &(page->private)); }
#define CLEARHPAGEFLAG(uname, flname) \
+static __always_inline \
+void folio_clear_hugetlb_##flname(struct folio *folio) \
+ { void *private = &folio->private; \
+ clear_bit(HPG_##flname, private); \
+ } \
static inline void ClearHPage##uname(struct page *page) \
{ clear_bit(HPG_##flname, &(page->private)); }
#else
#define TESTHPAGEFLAG(uname, flname) \
+static inline bool \
+folio_test_hugetlb_##flname(struct folio *folio) \
+ { return 0; } \
static inline int HPage##uname(struct page *page) \
{ return 0; }
#define SETHPAGEFLAG(uname, flname) \
+static inline void \
+folio_set_hugetlb_##flname(struct folio *folio) \
+ { } \
static inline void SetHPage##uname(struct page *page) \
{ }
#define CLEARHPAGEFLAG(uname, flname) \
+static inline void \
+folio_clear_hugetlb_##flname(struct folio *folio) \
+ { } \
static inline void ClearHPage##uname(struct page *page) \
{ }
#endif
@@ -735,18 +727,29 @@ extern unsigned int default_hstate_idx;
#define default_hstate (hstates[default_hstate_idx])
+static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
+{
+ return (void *)folio_get_private_1(folio);
+}
+
/*
* hugetlb page subpool pointer located in hpage[1].private
*/
static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
{
- return (void *)page_private(hpage + SUBPAGE_INDEX_SUBPOOL);
+ return hugetlb_folio_subpool(page_folio(hpage));
+}
+
+static inline void hugetlb_set_folio_subpool(struct folio *folio,
+ struct hugepage_subpool *subpool)
+{
+ folio_set_private_1(folio, (unsigned long)subpool);
}
static inline void hugetlb_set_page_subpool(struct page *hpage,
struct hugepage_subpool *subpool)
{
- set_page_private(hpage + SUBPAGE_INDEX_SUBPOOL, (unsigned long)subpool);
+ hugetlb_set_folio_subpool(page_folio(hpage), subpool);
}
static inline struct hstate *hstate_file(struct file *f)
@@ -830,10 +833,15 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
}
#endif
+static inline struct hstate *folio_hstate(struct folio *folio)
+{
+ VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+ return size_to_hstate(folio_size(folio));
+}
+
static inline struct hstate *page_hstate(struct page *page)
{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
- return size_to_hstate(page_size(page));
+ return folio_hstate(page_folio(page));
}
static inline unsigned hstate_index_to_shift(unsigned index)
@@ -1042,6 +1050,11 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
return NULL;
}
+static inline struct hstate *folio_hstate(struct folio *folio)
+{
+ return NULL;
+}
+
static inline struct hstate *page_hstate(struct page *page)
{
return NULL;
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 630cd255d0cf..c70f92fe493e 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -67,54 +67,61 @@ struct hugetlb_cgroup {
};
static inline struct hugetlb_cgroup *
-__hugetlb_cgroup_from_page(struct page *page, bool rsvd)
+__hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd)
{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
+ struct page *tail;
- if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
+ VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+ if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
return NULL;
- if (rsvd)
- return (void *)page_private(page + SUBPAGE_INDEX_CGROUP_RSVD);
- else
- return (void *)page_private(page + SUBPAGE_INDEX_CGROUP);
+
+ if (rsvd) {
+ tail = folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD);
+ return (void *)page_private(tail);
+ }
+
+ else {
+ tail = folio_page(folio, SUBPAGE_INDEX_CGROUP);
+ return (void *)page_private(tail);
+ }
}
-static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
+static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio)
{
- return __hugetlb_cgroup_from_page(page, false);
+ return __hugetlb_cgroup_from_folio(folio, false);
}
static inline struct hugetlb_cgroup *
-hugetlb_cgroup_from_page_rsvd(struct page *page)
+hugetlb_cgroup_from_folio_rsvd(struct folio *folio)
{
- return __hugetlb_cgroup_from_page(page, true);
+ return __hugetlb_cgroup_from_folio(folio, true);
}
-static inline void __set_hugetlb_cgroup(struct page *page,
+static inline void __set_hugetlb_cgroup(struct folio *folio,
struct hugetlb_cgroup *h_cg, bool rsvd)
{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
- if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
+ if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
return;
if (rsvd)
- set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD,
+ set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD),
(unsigned long)h_cg);
else
- set_page_private(page + SUBPAGE_INDEX_CGROUP,
+ set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP),
(unsigned long)h_cg);
}
-static inline void set_hugetlb_cgroup(struct page *page,
+static inline void set_hugetlb_cgroup(struct folio *folio,
struct hugetlb_cgroup *h_cg)
{
- __set_hugetlb_cgroup(page, h_cg, false);
+ __set_hugetlb_cgroup(folio, h_cg, false);
}
-static inline void set_hugetlb_cgroup_rsvd(struct page *page,
+static inline void set_hugetlb_cgroup_rsvd(struct folio *folio,
struct hugetlb_cgroup *h_cg)
{
- __set_hugetlb_cgroup(page, h_cg, true);
+ __set_hugetlb_cgroup(folio, h_cg, true);
}
static inline bool hugetlb_cgroup_disabled(void)
@@ -151,10 +158,10 @@ extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
struct page *page);
-extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page);
-extern void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages,
- struct page *page);
+extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
+ struct folio *folio);
+extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
+ struct folio *folio);
extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg);
@@ -170,8 +177,8 @@ extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
bool region_del);
extern void hugetlb_cgroup_file_init(void) __init;
-extern void hugetlb_cgroup_migrate(struct page *oldhpage,
- struct page *newhpage);
+extern void hugetlb_cgroup_migrate(struct folio *old_folio,
+ struct folio *new_folio);
#else
static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
@@ -181,29 +188,23 @@ static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
{
}
-static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
+static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio)
{
return NULL;
}
static inline struct hugetlb_cgroup *
-hugetlb_cgroup_from_page_resv(struct page *page)
+hugetlb_cgroup_from_folio_rsvd(struct folio *folio)
{
return NULL;
}
-static inline struct hugetlb_cgroup *
-hugetlb_cgroup_from_page_rsvd(struct page *page)
-{
- return NULL;
-}
-
-static inline void set_hugetlb_cgroup(struct page *page,
+static inline void set_hugetlb_cgroup(struct folio *folio,
struct hugetlb_cgroup *h_cg)
{
}
-static inline void set_hugetlb_cgroup_rsvd(struct page *page,
+static inline void set_hugetlb_cgroup_rsvd(struct folio *folio,
struct hugetlb_cgroup *h_cg)
{
}
@@ -253,14 +254,14 @@ hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
{
}
-static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page)
+static inline void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
+ struct folio *folio)
{
}
-static inline void hugetlb_cgroup_uncharge_page_rsvd(int idx,
+static inline void hugetlb_cgroup_uncharge_folio_rsvd(int idx,
unsigned long nr_pages,
- struct page *page)
+ struct folio *folio)
{
}
static inline void hugetlb_cgroup_uncharge_cgroup(int idx,
@@ -285,8 +286,8 @@ static inline void hugetlb_cgroup_file_init(void)
{
}
-static inline void hugetlb_cgroup_migrate(struct page *oldhpage,
- struct page *newhpage)
+static inline void hugetlb_cgroup_migrate(struct folio *old_folio,
+ struct folio *new_folio)
{
}
diff --git a/include/linux/init.h b/include/linux/init.h
index 077d7f93b402..2e96756fe1ff 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -143,6 +143,7 @@ struct file_system_type;
extern int do_one_initcall(initcall_t fn);
extern char __initdata boot_command_line[];
extern char *saved_command_line;
+extern unsigned int saved_command_line_len;
extern unsigned int reset_devices;
/* used by init/main.c */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 41a686996aaa..5dd4343c1bbe 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -17,6 +17,7 @@
#include <linux/crash_core.h>
#include <asm/io.h>
+#include <linux/range.h>
#include <uapi/linux/kexec.h>
#include <linux/verification.h>
@@ -240,14 +241,10 @@ static inline int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN 4096
-struct crash_mem_range {
- u64 start, end;
-};
-
struct crash_mem {
unsigned int max_nr_ranges;
unsigned int nr_ranges;
- struct crash_mem_range ranges[];
+ struct range ranges[];
};
extern int crash_exclude_mem_range(struct crash_mem *mem,
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 965009aa01d7..fc9647b1b4f9 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -18,7 +18,6 @@
* the same memory tier.
*/
#define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))
-#define MEMTIER_HOTPLUG_PRIO 100
struct memory_tier;
struct memory_dev_type {
diff --git a/include/linux/memory.h b/include/linux/memory.h
index aa619464a1df..31343566c221 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -19,7 +19,6 @@
#include <linux/node.h>
#include <linux/compiler.h>
#include <linux/mutex.h>
-#include <linux/notifier.h>
#define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS)
@@ -85,6 +84,9 @@ struct memory_block {
unsigned long nr_vmemmap_pages;
struct memory_group *group; /* group (if any) for this block */
struct list_head group_next; /* next block inside memory group */
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+ atomic_long_t nr_hwpoison;
+#endif
};
int arch_get_memory_phys_device(unsigned long start_pfn);
@@ -113,8 +115,13 @@ struct mem_section;
* Priorities for the hotplug memory callback routines (stored in decreasing
* order in the callback chain)
*/
-#define SLAB_CALLBACK_PRI 1
-#define IPC_CALLBACK_PRI 10
+#define DEFAULT_CALLBACK_PRI 0
+#define SLAB_CALLBACK_PRI 1
+#define HMAT_CALLBACK_PRI 2
+#define MM_COMPUTE_BATCH_PRI 10
+#define CPUSET_CALLBACK_PRI 10
+#define MEMTIER_HOTPLUG_PRI 100
+#define KSM_CALLBACK_PRI 100
#ifndef CONFIG_MEMORY_HOTPLUG
static inline void memory_dev_init(void)
@@ -136,9 +143,6 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
{
return 0;
}
-/* These aren't inline functions due to a GCC bug. */
-#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; })
-#define unregister_hotmemory_notifier(nb) ({ (void)(nb); })
#else /* CONFIG_MEMORY_HOTPLUG */
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
@@ -166,8 +170,6 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
{ .notifier_call = fn, .priority = pri };\
register_memory_notifier(&fn##_mem_nb); \
})
-#define register_hotmemory_notifier(nb) register_memory_notifier(nb)
-#define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb)
#ifdef CONFIG_NUMA
void memory_block_add_nid(struct memory_block *mem, int nid,
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 1314d9c5f05b..d3a4345339f9 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -139,6 +139,28 @@ struct dev_pagemap {
};
};
+/*
+ * Do not use this in new code, this is a transitional helper on the
+ * path to convert all ZONE_DEVICE users to operate in terms of pgmap
+ * offsets rather than pfn and pfn_to_page() to put ZONE_DEVICE pages
+ * into use.
+ */
+static inline pgoff_t pfn_to_pgmap_offset(struct dev_pagemap *pgmap, unsigned long pfn)
+{
+ u64 phys = PFN_PHYS(pfn), sum = 0;
+ int i;
+
+ for (i = 0; i < pgmap->nr_range; i++) {
+ struct range *range = &pgmap->ranges[i];
+
+ if (phys >= range->start && phys <= range->end)
+ return PHYS_PFN(phys - range->start + sum);
+ sum += range_len(range);
+ }
+
+ return -1;
+}
+
static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap)
{
return pgmap->ops && pgmap->ops->memory_failure;
@@ -187,13 +209,14 @@ static inline bool folio_is_device_coherent(const struct folio *folio)
}
#ifdef CONFIG_ZONE_DEVICE
-void zone_device_page_init(struct page *page);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
- struct dev_pagemap *pgmap);
+ struct dev_pagemap *pgmap);
+struct folio *pgmap_request_folio(struct dev_pagemap *pgmap,
+ pgoff_t pgmap_offset, int order);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
@@ -217,12 +240,24 @@ static inline void devm_memunmap_pages(struct device *dev,
{
}
+static inline struct dev_pagemap *
+get_dev_pagemap_many(unsigned long pfn, struct dev_pagemap *pgmap, int refs)
+{
+ return NULL;
+}
+
static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
struct dev_pagemap *pgmap)
{
return NULL;
}
+static inline struct folio *pgmap_request_folio(struct dev_pagemap *pgmap,
+ pgoff_t pgmap_offset, int order)
+{
+ return NULL;
+}
+
static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
{
return false;
diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 5433c08fcc68..396df1121bff 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -37,6 +37,28 @@
__cmp(x, y, op), \
__cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op))
+#define __clamp(val, lo, hi) \
+ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))
+
+#define __clamp_once(val, lo, hi, unique_val, unique_lo, unique_hi) ({ \
+ typeof(val) unique_val = (val); \
+ typeof(lo) unique_lo = (lo); \
+ typeof(hi) unique_hi = (hi); \
+ __clamp(unique_val, unique_lo, unique_hi); })
+
+#define __clamp_input_check(lo, hi) \
+ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \
+ __is_constexpr((lo) > (hi)), (lo) > (hi), false)))
+
+#define __careful_clamp(val, lo, hi) ({ \
+ __clamp_input_check(lo, hi) + \
+ __builtin_choose_expr(__typecheck(val, lo) && __typecheck(val, hi) && \
+ __typecheck(hi, lo) && __is_constexpr(val) && \
+ __is_constexpr(lo) && __is_constexpr(hi), \
+ __clamp(val, lo, hi), \
+ __clamp_once(val, lo, hi, __UNIQUE_ID(__val), \
+ __UNIQUE_ID(__lo), __UNIQUE_ID(__hi))); })
+
/**
* min - return minimum of two values of the same or compatible types
* @x: first value
@@ -86,7 +108,7 @@
* This macro does strict typechecking of @lo/@hi to make sure they are of the
* same type as @val. See the unnecessary pointer comparisons.
*/
-#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
+#define clamp(val, lo, hi) __careful_clamp(val, lo, hi)
/*
* ..and if you can't take the strict
@@ -121,7 +143,7 @@
* This macro does no typechecking and uses temporary variables of type
* @type to make all the comparisons.
*/
-#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
+#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi))
/**
* clamp_val - return a value clamped to a given range using val's type
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8bbcccbc5565..978c17df053e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -74,6 +74,7 @@ static inline void totalram_pages_add(long count)
extern void * high_memory;
extern int page_cluster;
+extern const int page_cluster_max;
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
@@ -549,7 +550,7 @@ struct vm_operations_struct {
/*
* Called by mprotect() to make driver-specific permission
* checks before mprotect() is finalised. The VMA must not
- * be modified. Returns 0 if eprotect() can proceed.
+ * be modified. Returns 0 if mprotect() can proceed.
*/
int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
unsigned long end, unsigned long newflags);
@@ -1082,30 +1083,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
* back into memory.
*/
-#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
-DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-
-bool __put_devmap_managed_page_refs(struct page *page, int refs);
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
-{
- if (!static_branch_unlikely(&devmap_managed_key))
- return false;
- if (!is_zone_device_page(page))
- return false;
- return __put_devmap_managed_page_refs(page, refs);
-}
-#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
-{
- return false;
-}
-#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-
-static inline bool put_devmap_managed_page(struct page *page)
-{
- return put_devmap_managed_page_refs(page, 1);
-}
-
/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
((unsigned int) folio_ref_count(folio) + 127u <= 127u)
@@ -1202,12 +1179,6 @@ static inline void put_page(struct page *page)
{
struct folio *folio = page_folio(page);
- /*
- * For some devmap managed pages we need to catch refcount transition
- * from 2 to 1:
- */
- if (put_devmap_managed_page(&folio->page))
- return;
folio_put(folio);
}
@@ -2030,40 +2001,30 @@ static inline bool get_user_page_fast_only(unsigned long addr,
*/
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
- long val = atomic_long_read(&mm->rss_stat.count[member]);
-
-#ifdef SPLIT_RSS_COUNTING
- /*
- * counter is updated in asynchronous manner and may go to minus.
- * But it's never be expected number for users.
- */
- if (val < 0)
- val = 0;
-#endif
- return (unsigned long)val;
+ return percpu_counter_read_positive(&mm->rss_stat[member]);
}
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
+void mm_trace_rss_stat(struct mm_struct *mm, int member);
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
- long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+ percpu_counter_add(&mm->rss_stat[member], value);
- mm_trace_rss_stat(mm, member, count);
+ mm_trace_rss_stat(mm, member);
}
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
- long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+ percpu_counter_inc(&mm->rss_stat[member]);
- mm_trace_rss_stat(mm, member, count);
+ mm_trace_rss_stat(mm, member);
}
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
- long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+ percpu_counter_dec(&mm->rss_stat[member]);
- mm_trace_rss_stat(mm, member, count);
+ mm_trace_rss_stat(mm, member);
}
/* Optimized variant when page is already known not to be PageAnon */
@@ -2950,7 +2911,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
* and return without waiting upon it */
#define FOLL_NOFAULT 0x80 /* do not fault in pages */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
-#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
#define FOLL_ANON 0x8000 /* don't do file mappings */
@@ -3268,7 +3228,6 @@ enum mf_flags {
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
-extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern int sysctl_memory_failure_early_kill;
@@ -3277,12 +3236,42 @@ extern void shake_page(struct page *p);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
-extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+extern void memory_failure_queue(unsigned long pfn, int flags);
+extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared);
+void num_poisoned_pages_inc(unsigned long pfn);
+void num_poisoned_pages_sub(unsigned long pfn, long i);
#else
-static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+static inline void memory_failure_queue(unsigned long pfn, int flags)
+{
+}
+
+static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
return 0;
}
+
+static inline void num_poisoned_pages_inc(unsigned long pfn)
+{
+}
+
+static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+{
+}
+#endif
+
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
+extern void memblk_nr_poison_inc(unsigned long pfn);
+extern void memblk_nr_poison_sub(unsigned long pfn, long i);
+#else
+static inline void memblk_nr_poison_inc(unsigned long pfn)
+{
+}
+
+static inline void memblk_nr_poison_sub(unsigned long pfn, long i)
+{
+}
#endif
#ifndef arch_memory_failure
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 500e536796ca..834022721bc6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -18,6 +18,7 @@
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
+#include <linux/percpu_counter.h>
#include <asm/mmu.h>
@@ -144,6 +145,7 @@ struct page {
atomic_t compound_pincount;
#ifdef CONFIG_64BIT
unsigned int compound_nr; /* 1 << compound_order */
+ unsigned long _private_1;
#endif
};
struct { /* Second tail page of compound page */
@@ -264,6 +266,7 @@ struct page {
* @_total_mapcount: Do not use directly, call folio_entire_mapcount().
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().
* @_folio_nr_pages: Do not use directly, call folio_nr_pages().
+ * @_private_1: Do not use directly, call folio_get_private_1().
*
* A folio is a physically, virtually and logically contiguous set
* of bytes. It is a power-of-two in size, and it is aligned to that
@@ -311,6 +314,7 @@ struct folio {
#ifdef CONFIG_64BIT
unsigned int _folio_nr_pages;
#endif
+ unsigned long _private_1;
};
#define FOLIO_MATCH(pg, fl) \
@@ -338,6 +342,7 @@ FOLIO_MATCH(compound_mapcount, _total_mapcount);
FOLIO_MATCH(compound_pincount, _pincount);
#ifdef CONFIG_64BIT
FOLIO_MATCH(compound_nr, _folio_nr_pages);
+FOLIO_MATCH(_private_1, _private_1);
#endif
#undef FOLIO_MATCH
@@ -383,6 +388,16 @@ static inline void *folio_get_private(struct folio *folio)
return folio->private;
}
+static inline void folio_set_private_1(struct folio *folio, unsigned long private)
+{
+ folio->_private_1 = private;
+}
+
+static inline unsigned long folio_get_private_1(struct folio *folio)
+{
+ return folio->_private_1;
+}
+
struct page_frag_cache {
void * va;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -612,11 +627,7 @@ struct mm_struct {
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
- /*
- * Special counters, in some configurations protected by the
- * page_table_lock, in other configurations by being atomic.
- */
- struct mm_rss_stat rss_stat;
+ struct percpu_counter rss_stat[NR_MM_COUNTERS];
struct linux_binfmt *binfmt;
@@ -847,7 +858,6 @@ typedef __bitwise unsigned int vm_fault_t;
* @VM_FAULT_OOM: Out Of Memory
* @VM_FAULT_SIGBUS: Bad access
* @VM_FAULT_MAJOR: Page read from storage
- * @VM_FAULT_WRITE: Special case for get_user_pages
* @VM_FAULT_HWPOISON: Hit poisoned small page
* @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded
* in upper bits
@@ -868,7 +878,6 @@ enum vm_fault_reason {
VM_FAULT_OOM = (__force vm_fault_t)0x000001,
VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002,
VM_FAULT_MAJOR = (__force vm_fault_t)0x000004,
- VM_FAULT_WRITE = (__force vm_fault_t)0x000008,
VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010,
VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040,
@@ -891,19 +900,18 @@ enum vm_fault_reason {
VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)
#define VM_FAULT_RESULT_TRACE \
- { VM_FAULT_OOM, "OOM" }, \
- { VM_FAULT_SIGBUS, "SIGBUS" }, \
- { VM_FAULT_MAJOR, "MAJOR" }, \
- { VM_FAULT_WRITE, "WRITE" }, \
- { VM_FAULT_HWPOISON, "HWPOISON" }, \
- { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \
- { VM_FAULT_SIGSEGV, "SIGSEGV" }, \
- { VM_FAULT_NOPAGE, "NOPAGE" }, \
- { VM_FAULT_LOCKED, "LOCKED" }, \
- { VM_FAULT_RETRY, "RETRY" }, \
- { VM_FAULT_FALLBACK, "FALLBACK" }, \
- { VM_FAULT_DONE_COW, "DONE_COW" }, \
- { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
+ { (__force unsigned int) VM_FAULT_OOM, "OOM" }, \
+ { (__force unsigned int) VM_FAULT_SIGBUS, "SIGBUS" }, \
+ { (__force unsigned int) VM_FAULT_MAJOR, "MAJOR" }, \
+ { (__force unsigned int) VM_FAULT_HWPOISON, "HWPOISON" }, \
+ { (__force unsigned int) VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \
+ { (__force unsigned int) VM_FAULT_SIGSEGV, "SIGSEGV" }, \
+ { (__force unsigned int) VM_FAULT_NOPAGE, "NOPAGE" }, \
+ { (__force unsigned int) VM_FAULT_LOCKED, "LOCKED" }, \
+ { (__force unsigned int) VM_FAULT_RETRY, "RETRY" }, \
+ { (__force unsigned int) VM_FAULT_FALLBACK, "FALLBACK" }, \
+ { (__force unsigned int) VM_FAULT_DONE_COW, "DONE_COW" }, \
+ { (__force unsigned int) VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
struct vm_special_mapping {
const char *name; /* The name, e.g. "[vdso]". */
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 0bb4b6da9993..5414b5c6a103 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -36,19 +36,6 @@ enum {
NR_MM_COUNTERS
};
-#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
-#define SPLIT_RSS_COUNTING
-/* per-thread cached information, */
-struct task_rss_stat {
- int events; /* for synchronization threshold */
- int count[NR_MM_COUNTERS];
-};
-#endif /* USE_SPLIT_PTE_PTLOCKS */
-
-struct mm_rss_stat {
- atomic_long_t count[NR_MM_COUNTERS];
-};
-
struct page_frag {
struct page *page;
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bbccb4044222..b33ab86d5dca 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -504,9 +504,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
#define FGP_NOFS 0x00000010
#define FGP_NOWAIT 0x00000020
#define FGP_FOR_MMAP 0x00000040
-#define FGP_HEAD 0x00000080
-#define FGP_ENTRY 0x00000100
-#define FGP_STABLE 0x00000200
+#define FGP_ENTRY 0x00000080
+#define FGP_STABLE 0x00000100
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
int fgp_flags, gfp_t gfp);
@@ -1102,7 +1101,6 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
pgoff_t index, gfp_t gfp);
void filemap_remove_folio(struct folio *folio);
-void delete_from_page_cache(struct page *page);
void __filemap_remove_folio(struct folio *folio, void *shadow);
void replace_page_cache_page(struct page *old, struct page *new);
void delete_from_page_cache_batch(struct address_space *mapping,
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index f3fafb731ffd..959f52e5867d 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -27,6 +27,8 @@ struct mm_walk;
* "do page table walk over the current vma", returning
* a negative value means "abort current page table walk
* right now" and returning 1 means "skip the current vma"
+ * Note that this callback is not called when the caller
+ * passes in a single VMA as for walk_page_vma().
* @pre_vma: if set, called before starting walk on a non-null vma.
* @post_vma: if set, called after a walk on a non-null vma, provided
* that @pre_vma and the vma walk succeeded.
@@ -99,6 +101,9 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
pgd_t *pgd,
void *private);
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private);
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private);
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 8ed5fba6d156..bde6c4c1f405 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -13,7 +13,6 @@
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>
-#include <linux/gfp.h>
/* percpu_counter batch for local add or sub */
#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX
diff --git a/include/linux/regset.h b/include/linux/regset.h
index a00765f0e8cf..9061266dd8de 100644
--- a/include/linux/regset.h
+++ b/include/linux/regset.h
@@ -275,15 +275,15 @@ static inline int user_regset_copyin(unsigned int *pos, unsigned int *count,
return 0;
}
-static inline int user_regset_copyin_ignore(unsigned int *pos,
- unsigned int *count,
- const void **kbuf,
- const void __user **ubuf,
- const int start_pos,
- const int end_pos)
+static inline void user_regset_copyin_ignore(unsigned int *pos,
+ unsigned int *count,
+ const void **kbuf,
+ const void __user **ubuf,
+ const int start_pos,
+ const int end_pos)
{
if (*count == 0)
- return 0;
+ return;
BUG_ON(*pos < start_pos);
if (end_pos < 0 || *pos < end_pos) {
unsigned int copy = (end_pos < 0 ? *count
@@ -295,7 +295,6 @@ static inline int user_regset_copyin_ignore(unsigned int *pos,
*pos += copy;
*count -= copy;
}
- return 0;
}
extern int regset_get(struct task_struct *target,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffb6eb55cd13..079d299fa465 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -870,9 +870,6 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
-#ifdef SPLIT_RSS_COUNTING
- struct task_rss_stat rss_stat;
-#endif
int exit_state;
int exit_code;
int exit_signal;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a18cf4b7c724..369d7799205d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -384,8 +384,9 @@ extern unsigned long totalreserve_pages;
/* linux/mm/swap.c */
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages);
-void lru_note_cost_folio(struct folio *);
+void lru_note_cost(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated);
+void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void lru_cache_add(struct page *);
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 86b95ccb81bb..3ba9bf56899d 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -581,8 +581,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
#ifdef CONFIG_MEMORY_FAILURE
-extern atomic_long_t num_poisoned_pages __read_mostly;
-
/*
* Support for hardware poisoned pages
*/
@@ -597,17 +595,7 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
return swp_type(entry) == SWP_HWPOISON;
}
-static inline void num_poisoned_pages_inc(void)
-{
- atomic_long_inc(&num_poisoned_pages);
-}
-
-static inline void num_poisoned_pages_sub(long i)
-{
- atomic_long_sub(i, &num_poisoned_pages);
-}
-
-#else /* CONFIG_MEMORY_FAILURE */
+#else
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
@@ -618,15 +606,7 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
{
return 0;
}
-
-static inline void num_poisoned_pages_inc(void)
-{
-}
-
-static inline void num_poisoned_pages_sub(long i)
-{
-}
-#endif /* CONFIG_MEMORY_FAILURE */
+#endif
static inline int non_swap_entry(swp_entry_t entry)
{
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 97b09fcf7e52..adc50cf7b969 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -9,7 +9,7 @@
DECLARE_EVENT_CLASS(dax_pmd_fault_class,
TP_PROTO(struct inode *inode, struct vm_fault *vmf,
- pgoff_t max_pgoff, int result),
+ pgoff_t max_pgoff, vm_fault_t result),
TP_ARGS(inode, vmf, max_pgoff, result),
TP_STRUCT__entry(
__field(unsigned long, ino)
@@ -21,7 +21,7 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
__field(pgoff_t, max_pgoff)
__field(dev_t, dev)
__field(unsigned int, flags)
- __field(int, result)
+ __field(unsigned int, result)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
@@ -33,7 +33,7 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
__entry->flags = vmf->flags;
__entry->pgoff = vmf->pgoff;
__entry->max_pgoff = max_pgoff;
- __entry->result = result;
+ __entry->result = (__force unsigned int) result;
),
TP_printk("dev %d:%d ino %#lx %s %s address %#lx vm_start "
"%#lx vm_end %#lx pgoff %#lx max_pgoff %#lx %s",
@@ -54,7 +54,7 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
#define DEFINE_PMD_FAULT_EVENT(name) \
DEFINE_EVENT(dax_pmd_fault_class, name, \
TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
- pgoff_t max_pgoff, int result), \
+ pgoff_t max_pgoff, vm_fault_t result), \
TP_ARGS(inode, vmf, max_pgoff, result))
DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
@@ -151,7 +151,7 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
DECLARE_EVENT_CLASS(dax_pte_fault_class,
- TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result),
+ TP_PROTO(struct inode *inode, struct vm_fault *vmf, vm_fault_t result),
TP_ARGS(inode, vmf, result),
TP_STRUCT__entry(
__field(unsigned long, ino)
@@ -160,7 +160,7 @@ DECLARE_EVENT_CLASS(dax_pte_fault_class,
__field(pgoff_t, pgoff)
__field(dev_t, dev)
__field(unsigned int, flags)
- __field(int, result)
+ __field(unsigned int, result)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
@@ -169,7 +169,7 @@ DECLARE_EVENT_CLASS(dax_pte_fault_class,
__entry->address = vmf->address;
__entry->flags = vmf->flags;
__entry->pgoff = vmf->pgoff;
- __entry->result = result;
+ __entry->result = (__force unsigned int) result;
),
TP_printk("dev %d:%d ino %#lx %s %s address %#lx pgoff %#lx %s",
MAJOR(__entry->dev),
@@ -185,7 +185,7 @@ DECLARE_EVENT_CLASS(dax_pte_fault_class,
#define DEFINE_PTE_FAULT_EVENT(name) \
DEFINE_EVENT(dax_pte_fault_class, name, \
- TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), \
+ TP_PROTO(struct inode *inode, struct vm_fault *vmf, vm_fault_t result), \
TP_ARGS(inode, vmf, result))
DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 243073cfc29d..58688768ef0f 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -346,10 +346,9 @@ TRACE_MM_PAGES
TRACE_EVENT(rss_stat,
TP_PROTO(struct mm_struct *mm,
- int member,
- long count),
+ int member),
- TP_ARGS(mm, member, count),
+ TP_ARGS(mm, member),
TP_STRUCT__entry(
__field(unsigned int, mm_id)
@@ -362,7 +361,8 @@ TRACE_EVENT(rss_stat,
__entry->mm_id = mm_ptr_to_hash(mm);
__entry->curr = !!(current->mm == mm);
__entry->member = member;
- __entry->size = (count << PAGE_SHIFT);
+ __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
+ << PAGE_SHIFT);
),
TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index e87cb2b80ed3..11524cda4a95 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -31,7 +31,6 @@
gfpflag_string(__GFP_HIGHMEM), \
gfpflag_string(GFP_DMA32), \
gfpflag_string(__GFP_HIGH), \
- gfpflag_string(__GFP_ATOMIC), \
gfpflag_string(__GFP_IO), \
gfpflag_string(__GFP_FS), \
gfpflag_string(__GFP_NOWARN), \
diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h
new file mode 100644
index 000000000000..ad4e02191f35
--- /dev/null
+++ b/include/trace/events/vmalloc.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vmalloc
+
+#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VMALLOC_H
+
+#include <linux/tracepoint.h>
+
+/**
+ * alloc_vmap_area - called when a new vmap allocation occurs
+ * @addr: an allocated address
+ * @size: a requested size
+ * @align: a requested alignment
+ * @vstart: a requested start range
+ * @vend: a requested end range
+ * @failed: an allocation failed or not
+ *
+ * This event is used for debugging purposes. It can give a developer
+ * extra information about how often it occurs and which parameters
+ * are passed, for further validation.
+ */
+TRACE_EVENT(alloc_vmap_area,
+
+ TP_PROTO(unsigned long addr, unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend, int failed),
+
+ TP_ARGS(addr, size, align, vstart, vend, failed),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, addr)
+ __field(unsigned long, size)
+ __field(unsigned long, align)
+ __field(unsigned long, vstart)
+ __field(unsigned long, vend)
+ __field(int, failed)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->size = size;
+ __entry->align = align;
+ __entry->vstart = vstart;
+ __entry->vend = vend;
+ __entry->failed = failed;
+ ),
+
+ TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d",
+ __entry->addr, __entry->size, __entry->align,
+ __entry->vstart, __entry->vend, __entry->failed)
+);
+
+/**
+ * purge_vmap_area_lazy - called when vmap areas were lazily freed
+ * @start: purging start address
+ * @end: purging end address
+ * @npurged: number of purged vmap areas
+ *
+ * This event is used for a debug purpose. It gives some
+ * indication about start:end range and how many objects
+ * are released.
+ */
+TRACE_EVENT(purge_vmap_area_lazy,
+
+ TP_PROTO(unsigned long start, unsigned long end,
+ unsigned int npurged),
+
+ TP_ARGS(start, end, npurged),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, start)
+ __field(unsigned long, end)
+ __field(unsigned int, npurged)
+ ),
+
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->npurged = npurged;
+ ),
+
+ TP_printk("start=0x%lx end=0x%lx num_purged=%u",
+ __entry->start, __entry->end, __entry->npurged)
+);
+
+/**
+ * free_vmap_area_noflush - called when a vmap area is freed
+ * @va_start: a start address of VA
+ * @nr_lazy: number of current lazy pages
+ * @nr_lazy_max: number of maximum lazy pages
+ *
+ * This event is used for a debug purpose. It gives some
+ * indication about a VA that is released, number of current
+ * outstanding areas and a maximum allowed threshold before
+ * dropping all of them.
+ */
+TRACE_EVENT(free_vmap_area_noflush,
+
+ TP_PROTO(unsigned long va_start, unsigned long nr_lazy,
+ unsigned long nr_lazy_max),
+
+ TP_ARGS(va_start, nr_lazy, nr_lazy_max),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, va_start)
+ __field(unsigned long, nr_lazy)
+ __field(unsigned long, nr_lazy_max)
+ ),
+
+ TP_fast_assign(
+ __entry->va_start = va_start;
+ __entry->nr_lazy = nr_lazy;
+ __entry->nr_lazy_max = nr_lazy_max;
+ ),
+
+ TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu",
+ __entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max)
+);
+
+#endif /* _TRACE_VMALLOC_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/init/main.c b/init/main.c
index aa21add5f7c5..d213371cc067 100644
--- a/init/main.c
+++ b/init/main.c
@@ -145,7 +145,8 @@ void (*__initdata late_time_init)(void);
/* Untouched command line saved by arch-specific code. */
char __initdata boot_command_line[COMMAND_LINE_SIZE];
/* Untouched saved command line (eg. for /proc) */
-char *saved_command_line;
+char *saved_command_line __ro_after_init;
+unsigned int saved_command_line_len __ro_after_init;
/* Command line for parameter parsing */
static char *static_command_line;
/* Untouched extra command line */
@@ -667,6 +668,8 @@ static void __init setup_command_line(char *command_line)
strcpy(saved_command_line + len, extra_init_args);
}
}
+
+ saved_command_line_len = strlen(saved_command_line);
}
/*
@@ -1379,7 +1382,7 @@ static void __init do_initcall_level(int level, char *command_line)
static void __init do_initcalls(void)
{
int level;
- size_t len = strlen(saved_command_line) + 1;
+ size_t len = saved_command_line_len + 1;
char *command_line;
command_line = kzalloc(len, GFP_KERNEL);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b474289c15b8..3ea2e836e93e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3630,11 +3630,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
-};
-
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -3652,7 +3647,7 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+ hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
diff --git a/kernel/fork.c b/kernel/fork.c
index 89b8b6c08592..69b4bb42da33 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -756,7 +756,7 @@ static void check_mm(struct mm_struct *mm)
"Please make sure 'struct resident_page_types[]' is updated as well");
for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
+ long x = percpu_counter_sum(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
@@ -782,6 +782,8 @@ static void check_mm(struct mm_struct *mm)
*/
void __mmdrop(struct mm_struct *mm)
{
+ int i;
+
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
WARN_ON_ONCE(mm == current->active_mm);
@@ -791,6 +793,9 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ percpu_counter_destroy(&mm->rss_stat[i]);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1110,6 +1115,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
+ int i;
+
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
@@ -1151,10 +1158,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
+ goto fail_pcpu;
+
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
+fail_pcpu:
+ while (i > 0)
+ percpu_counter_destroy(&mm->rss_stat[--i]);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index ca2743f9c634..969e8f52f7da 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -561,23 +561,17 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
static int kimage_set_destination(struct kimage *image,
unsigned long destination)
{
- int result;
-
destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
- return result;
+ return kimage_add_entry(image, destination | IND_DESTINATION);
}
static int kimage_add_page(struct kimage *image, unsigned long page)
{
- int result;
-
page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
- return result;
+ return kimage_add_entry(image, page | IND_SOURCE);
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 45637511e0de..dd5983010b7b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1141,7 +1141,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
{
int i, j;
unsigned long long start, end, p_start, p_end;
- struct crash_mem_range temp_range = {0, 0};
+ struct range temp_range = {0, 0};
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
diff --git a/kernel/panic.c b/kernel/panic.c
index da323209f583..75fe389e8814 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -25,6 +25,7 @@
#include <linux/kexec.h>
#include <linux/panic_notifier.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
@@ -744,8 +745,8 @@ static int __init panic_on_taint_setup(char *s)
if (s && !strcmp(s, "nousertaint"))
panic_on_taint_nousertaint = true;
- pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n",
- panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis");
+ pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n",
+ panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint));
return 0;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 188c305aeb8b..71a4350ac601 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2125,6 +2125,7 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&page_cluster_max,
},
{
.procname = "dirtytime_expire_seconds",
diff --git a/lib/Kconfig b/lib/Kconfig
index 9bbf8a4b2108..3d8382bbc331 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -528,14 +528,31 @@ config CPUMASK_OFFSTACK
them on the stack. This is a bit more expensive, but avoids
stack overflow.
+choice
+ prompt "Number of CPUs detection method"
+ default UNFORCE_NR_CPUS
+ depends on SMP && EXPERT
+ help
+ Select between boot-time and compile-time detection of the number
+ of CPUs. If it's possible to provide the exact number of CPUs at
+ compile time, kernel code may be optimized better.
+ For a general-purpose kernel, choose the "boot time" option.
+
+config UNFORCE_NR_CPUS
+ bool "Set number of CPUs at boot time"
+ help
+ Choose it if you build a general-purpose kernel and want to rely
+ on the kernel to detect the actual number of CPUs.
+
config FORCE_NR_CPUS
- bool "NR_CPUS is set to an actual number of CPUs"
- depends on SMP
- help
- Say Yes if you have NR_CPUS set to an actual number of possible
- CPUs in your system, not to a default value. This forces the core
- code to rely on compile-time value and optimize kernel routines
- better.
+ bool "Set number of CPUs at compile time"
+ help
+ Choose it if NR_CPUS corresponds to an actual number of
+ possible CPUs in your system. This forces the core code
+ to rely on compile-time value and optimize kernel routines
+ better.
+
+endchoice
config CPU_RMAP
bool
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8f26020bdeb4..3ff5fb841f3c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1965,7 +1965,6 @@ config FAIL_SUNRPC
config FAULT_INJECTION_STACKTRACE_FILTER
bool "stacktrace filter for fault-injection capabilities"
depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
- depends on !X86_64
select STACKTRACE
depends on FRAME_POINTER || MIPS || PPC || S390 || MICROBLAZE || ARM || ARC || X86
help
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index ca09b1cf8ee9..ba5b27962c34 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -181,7 +181,7 @@ config KASAN_VMALLOC
config KASAN_KUNIT_TEST
tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS
- depends on KASAN && KUNIT
+ depends on KASAN && KUNIT && TRACEPOINTS
default KUNIT_ALL_TESTS
help
A KUnit-based KASAN test suite. Triggers different kinds of
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 337d797a7141..6f8e5dd1dcd0 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -437,6 +437,7 @@ static int object_cpu_offline(unsigned int cpu)
struct debug_percpu_free *percpu_pool;
struct hlist_node *tmp;
struct debug_obj *obj;
+ unsigned long flags;
/* Remote access is safe as the CPU is dead already */
percpu_pool = per_cpu_ptr(&percpu_obj_pool, cpu);
@@ -444,6 +445,12 @@ static int object_cpu_offline(unsigned int cpu)
hlist_del(&obj->node);
kmem_cache_free(obj_cache, obj);
}
+
+ raw_spin_lock_irqsave(&pool_lock, flags);
+ obj_pool_used -= percpu_pool->obj_free;
+ debug_objects_freed += percpu_pool->obj_free;
+ raw_spin_unlock_irqrestore(&pool_lock, flags);
+
percpu_pool->obj_free = 0;
return 0;
@@ -1318,6 +1325,8 @@ static int __init debug_objects_replace_static_objects(void)
hlist_add_head(&obj->node, &objects);
}
+ debug_objects_allocated += i;
+
/*
* debug_objects_mem_init() is now called early that only one CPU is up
* and interrupts have been disabled, so it is safe to replace the
@@ -1386,6 +1395,7 @@ void __init debug_objects_mem_init(void)
debug_objects_enabled = 0;
kmem_cache_destroy(obj_cache);
pr_warn("out of memory.\n");
+ return;
} else
debug_objects_selftest();
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index 9f53408c545d..4b8fafce415c 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -74,7 +74,7 @@ static bool fail_stacktrace(struct fault_attr *attr)
int n, nr_entries;
bool found = (attr->require_start == 0 && attr->require_end == ULONG_MAX);
- if (depth == 0)
+ if (depth == 0 || (found && !attr->reject_start && !attr->reject_end))
return found;
nr_entries = stack_trace_save(entries, depth, 1);
@@ -105,10 +105,16 @@ static inline bool fail_stacktrace(struct fault_attr *attr)
bool should_fail(struct fault_attr *attr, ssize_t size)
{
+ bool stack_checked = false;
+
if (in_task()) {
unsigned int fail_nth = READ_ONCE(current->fail_nth);
if (fail_nth) {
+ if (!fail_stacktrace(attr))
+ return false;
+
+ stack_checked = true;
fail_nth--;
WRITE_ONCE(current->fail_nth, fail_nth);
if (!fail_nth)
@@ -128,6 +134,9 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
if (atomic_read(&attr->times) == 0)
return false;
+ if (!stack_checked && !fail_stacktrace(attr))
+ return false;
+
if (atomic_read(&attr->space) > size) {
atomic_sub(size, &attr->space);
return false;
@@ -142,9 +151,6 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
if (attr->probability <= get_random_u32_below(100))
return false;
- if (!fail_stacktrace(attr))
- return false;
-
fail:
fail_dump(attr);
@@ -223,10 +229,10 @@ struct dentry *fault_create_debugfs_attr(const char *name,
#ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER
debugfs_create_stacktrace_depth("stacktrace-depth", mode, dir,
&attr->stacktrace_depth);
- debugfs_create_ul("require-start", mode, dir, &attr->require_start);
- debugfs_create_ul("require-end", mode, dir, &attr->require_end);
- debugfs_create_ul("reject-start", mode, dir, &attr->reject_start);
- debugfs_create_ul("reject-end", mode, dir, &attr->reject_end);
+ debugfs_create_xul("require-start", mode, dir, &attr->require_start);
+ debugfs_create_xul("require-end", mode, dir, &attr->require_end);
+ debugfs_create_xul("reject-start", mode, dir, &attr->reject_start);
+ debugfs_create_xul("reject-end", mode, dir, &attr->reject_end);
#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */
attr->dname = dget(dir);
diff --git a/lib/llist.c b/lib/llist.c
index 7d78b736e8af..6e668fa5a2c6 100644
--- a/lib/llist.c
+++ b/lib/llist.c
@@ -26,10 +26,10 @@
bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
struct llist_head *head)
{
- struct llist_node *first;
+ struct llist_node *first = READ_ONCE(head->first);
do {
- new_last->next = first = READ_ONCE(head->first);
+ new_last->next = first;
} while (!try_cmpxchg(&head->first, &first, new_first));
return !first;
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index e1743803c851..b9c5498298c2 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -6056,7 +6056,7 @@ void *mas_find_rev(struct ma_state *mas, unsigned long min)
if (mas->index < min)
return NULL;
- /* Retries on dead nodes handled by mas_next_entry */
+ /* Retries on dead nodes handled by mas_prev_entry */
return mas_prev_entry(mas, min);
}
EXPORT_SYMBOL_GPL(mas_find);
diff --git a/lib/notifier-error-inject.c b/lib/notifier-error-inject.c
index 21016b32d313..2b24ea6c9497 100644
--- a/lib/notifier-error-inject.c
+++ b/lib/notifier-error-inject.c
@@ -15,7 +15,7 @@ static int debugfs_errno_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_errno, debugfs_errno_get, debugfs_errno_set,
+DEFINE_SIMPLE_ATTRIBUTE_SIGNED(fops_errno, debugfs_errno_get, debugfs_errno_set,
"%lld\n");
static struct dentry *debugfs_create_errno(const char *name, umode_t mode,
diff --git a/lib/oid_registry.c b/lib/oid_registry.c
index e592d48b1974..fe6705cfd780 100644
--- a/lib/oid_registry.c
+++ b/lib/oid_registry.c
@@ -146,7 +146,6 @@ int sprint_oid(const void *data, size_t datasize, char *buffer, size_t bufsize)
bufsize -= count;
while (v < end) {
- num = 0;
n = *v++;
if (!(n & 0x80)) {
num = n;
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 67e6f83fe0f8..1f7e00ae62d5 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -605,8 +605,11 @@ err_devmem:
static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
+ struct dev_pagemap *pgmap;
struct page *dpage = NULL;
struct page *rpage = NULL;
+ struct folio *folio;
+ unsigned long pfn;
/*
* For ZONE_DEVICE private type, this is a fake device so we allocate
@@ -632,7 +635,12 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
goto error;
}
- zone_device_page_init(dpage);
+ /* FIXME: Rework allocator to be pgmap offset based */
+ pgmap = dpage->pgmap;
+ pfn = page_to_pfn(dpage);
+ folio = pgmap_request_folio(pgmap, pfn_to_pgmap_offset(pgmap, pfn), 0);
+ WARN_ON_ONCE(dpage != &folio->page);
+ lock_page(dpage);
dpage->zone_device_data = rpage;
return dpage;
diff --git a/lib/test_printf.c b/lib/test_printf.c
index da5efc8b8543..0a0b0c0bb2b4 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -686,17 +686,17 @@ flags(void)
gfp = GFP_ATOMIC|__GFP_DMA;
test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp);
- gfp = __GFP_ATOMIC;
- test("__GFP_ATOMIC", "%pGg", &gfp);
+ gfp = __GFP_HIGH;
+ test("__GFP_HIGH", "%pGg", &gfp);
/* Any flags not translated by the table should remain numeric */
gfp = ~__GFP_BITS_MASK;
snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp);
test(cmp_buffer, "%pGg", &gfp);
- snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx",
+ snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx",
(unsigned long) gfp);
- gfp |= __GFP_ATOMIC;
+ gfp |= __GFP_HIGH;
test(cmp_buffer, "%pGg", &gfp);
kfree(cmp_buffer);
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index 3e6b8ad73858..50d6b2ab3956 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -3,7 +3,7 @@
obj-y := core.o
obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o
obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
-obj-$(CONFIG_DAMON_SYSFS) += sysfs.o
+obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs.o
obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
-obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o
-obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o
+obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o
+obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 36d098d06c55..80d5937fe337 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -694,6 +694,111 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score;
}
+/*
+ * damos_skip_charged_region() - Check if the given region or starting part of
+ * it is already charged for the DAMOS quota.
+ * @t: The target of the region.
+ * @rp: The pointer to the region.
+ * @s: The scheme to be applied.
+ *
+ * If a quota of a scheme has been exceeded in a quota charge window, the
+ * scheme's action would be applied to only a part of the regions fulfilling
+ * the target access pattern. To avoid applying the scheme action only to
+ * already applied regions, DAMON skips applying the scheme action to the
+ * regions that were charged in the previous charge window.
+ *
+ * This function checks if a given region should be skipped or not for that
+ * reason. If only the starting part of the region was previously charged,
+ * this function splits the region into two so that the second one covers the
+ * area that was not charged in the previous charge window, saves the second
+ * region in *rp, and returns false, so that the caller can apply the DAMON
+ * action to the second one.
+ *
+ * Return: true if the region should be entirely skipped, false otherwise.
+ */
+static bool damos_skip_charged_region(struct damon_target *t,
+ struct damon_region **rp, struct damos *s)
+{
+ struct damon_region *r = *rp;
+ struct damos_quota *quota = &s->quota;
+ unsigned long sz_to_skip;
+
+ /* Skip previously charged regions */
+ if (quota->charge_target_from) {
+ if (t != quota->charge_target_from)
+ return true;
+ if (r == damon_last_region(t)) {
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ return true;
+ }
+ if (quota->charge_addr_from &&
+ r->ar.end <= quota->charge_addr_from)
+ return true;
+
+ if (quota->charge_addr_from && r->ar.start <
+ quota->charge_addr_from) {
+ sz_to_skip = ALIGN_DOWN(quota->charge_addr_from -
+ r->ar.start, DAMON_MIN_REGION);
+ if (!sz_to_skip) {
+ if (damon_sz_region(r) <= DAMON_MIN_REGION)
+ return true;
+ sz_to_skip = DAMON_MIN_REGION;
+ }
+ damon_split_region_at(t, r, sz_to_skip);
+ r = damon_next_region(r);
+ *rp = r;
+ }
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ }
+ return false;
+}
+
+static void damos_update_stat(struct damos *s,
+ unsigned long sz_tried, unsigned long sz_applied)
+{
+ s->stat.nr_tried++;
+ s->stat.sz_tried += sz_tried;
+ if (sz_applied)
+ s->stat.nr_applied++;
+ s->stat.sz_applied += sz_applied;
+}
+
+static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
+ struct damon_region *r, struct damos *s)
+{
+ struct damos_quota *quota = &s->quota;
+ unsigned long sz = damon_sz_region(r);
+ struct timespec64 begin, end;
+ unsigned long sz_applied = 0;
+
+ if (c->ops.apply_scheme) {
+ if (quota->esz && quota->charged_sz + sz > quota->esz) {
+ sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
+ DAMON_MIN_REGION);
+ if (!sz)
+ goto update_stat;
+ damon_split_region_at(t, r, sz);
+ }
+ ktime_get_coarse_ts64(&begin);
+ sz_applied = c->ops.apply_scheme(c, t, r, s);
+ ktime_get_coarse_ts64(&end);
+ quota->total_charged_ns += timespec64_to_ns(&end) -
+ timespec64_to_ns(&begin);
+ quota->charged_sz += sz;
+ if (quota->esz && quota->charged_sz >= quota->esz) {
+ quota->charge_target_from = t;
+ quota->charge_addr_from = r->ar.end + 1;
+ }
+ }
+ if (s->action != DAMOS_STAT)
+ r->age = 0;
+
+update_stat:
+ damos_update_stat(s, sz, sz_applied);
+}
+
static void damon_do_apply_schemes(struct damon_ctx *c,
struct damon_target *t,
struct damon_region *r)
@@ -702,9 +807,6 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
- unsigned long sz = damon_sz_region(r);
- struct timespec64 begin, end;
- unsigned long sz_applied = 0;
if (!s->wmarks.activated)
continue;
@@ -713,70 +815,13 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
if (quota->esz && quota->charged_sz >= quota->esz)
continue;
- /* Skip previously charged regions */
- if (quota->charge_target_from) {
- if (t != quota->charge_target_from)
- continue;
- if (r == damon_last_region(t)) {
- quota->charge_target_from = NULL;
- quota->charge_addr_from = 0;
- continue;
- }
- if (quota->charge_addr_from &&
- r->ar.end <= quota->charge_addr_from)
- continue;
-
- if (quota->charge_addr_from && r->ar.start <
- quota->charge_addr_from) {
- sz = ALIGN_DOWN(quota->charge_addr_from -
- r->ar.start, DAMON_MIN_REGION);
- if (!sz) {
- if (damon_sz_region(r) <=
- DAMON_MIN_REGION)
- continue;
- sz = DAMON_MIN_REGION;
- }
- damon_split_region_at(t, r, sz);
- r = damon_next_region(r);
- sz = damon_sz_region(r);
- }
- quota->charge_target_from = NULL;
- quota->charge_addr_from = 0;
- }
+ if (damos_skip_charged_region(t, &r, s))
+ continue;
if (!damos_valid_target(c, t, r, s))
continue;
- /* Apply the scheme */
- if (c->ops.apply_scheme) {
- if (quota->esz &&
- quota->charged_sz + sz > quota->esz) {
- sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
- DAMON_MIN_REGION);
- if (!sz)
- goto update_stat;
- damon_split_region_at(t, r, sz);
- }
- ktime_get_coarse_ts64(&begin);
- sz_applied = c->ops.apply_scheme(c, t, r, s);
- ktime_get_coarse_ts64(&end);
- quota->total_charged_ns += timespec64_to_ns(&end) -
- timespec64_to_ns(&begin);
- quota->charged_sz += sz;
- if (quota->esz && quota->charged_sz >= quota->esz) {
- quota->charge_target_from = t;
- quota->charge_addr_from = r->ar.end + 1;
- }
- }
- if (s->action != DAMOS_STAT)
- r->age = 0;
-
-update_stat:
- s->stat.nr_tried++;
- s->stat.sz_tried += sz;
- if (sz_applied)
- s->stat.nr_applied++;
- s->stat.sz_applied += sz_applied;
+ damos_apply_scheme(c, t, r, s);
}
}
@@ -803,59 +848,64 @@ static void damos_set_effective_quota(struct damos_quota *quota)
quota->esz = esz;
}
-static void kdamond_apply_schemes(struct damon_ctx *c)
+static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
{
+ struct damos_quota *quota = &s->quota;
struct damon_target *t;
- struct damon_region *r, *next_r;
- struct damos *s;
+ struct damon_region *r;
+ unsigned long cumulated_sz;
+ unsigned int score, max_score = 0;
- damon_for_each_scheme(s, c) {
- struct damos_quota *quota = &s->quota;
- unsigned long cumulated_sz;
- unsigned int score, max_score = 0;
+ if (!quota->ms && !quota->sz)
+ return;
- if (!s->wmarks.activated)
- continue;
+ /* New charge window starts */
+ if (time_after_eq(jiffies, quota->charged_from +
+ msecs_to_jiffies(quota->reset_interval))) {
+ if (quota->esz && quota->charged_sz >= quota->esz)
+ s->stat.qt_exceeds++;
+ quota->total_charged_sz += quota->charged_sz;
+ quota->charged_from = jiffies;
+ quota->charged_sz = 0;
+ damos_set_effective_quota(quota);
+ }
- if (!quota->ms && !quota->sz)
- continue;
+ if (!c->ops.get_scheme_score)
+ return;
- /* New charge window starts */
- if (time_after_eq(jiffies, quota->charged_from +
- msecs_to_jiffies(
- quota->reset_interval))) {
- if (quota->esz && quota->charged_sz >= quota->esz)
- s->stat.qt_exceeds++;
- quota->total_charged_sz += quota->charged_sz;
- quota->charged_from = jiffies;
- quota->charged_sz = 0;
- damos_set_effective_quota(quota);
+ /* Fill up the score histogram */
+ memset(quota->histogram, 0, sizeof(quota->histogram));
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ if (!__damos_valid_target(r, s))
+ continue;
+ score = c->ops.get_scheme_score(c, t, r, s);
+ quota->histogram[score] += damon_sz_region(r);
+ if (score > max_score)
+ max_score = score;
}
+ }
- if (!c->ops.get_scheme_score)
- continue;
+ /* Set the min score limit */
+ for (cumulated_sz = 0, score = max_score; ; score--) {
+ cumulated_sz += quota->histogram[score];
+ if (cumulated_sz >= quota->esz || !score)
+ break;
+ }
+ quota->min_score = score;
+}
- /* Fill up the score histogram */
- memset(quota->histogram, 0, sizeof(quota->histogram));
- damon_for_each_target(t, c) {
- damon_for_each_region(r, t) {
- if (!__damos_valid_target(r, s))
- continue;
- score = c->ops.get_scheme_score(
- c, t, r, s);
- quota->histogram[score] += damon_sz_region(r);
- if (score > max_score)
- max_score = score;
- }
- }
+static void kdamond_apply_schemes(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r, *next_r;
+ struct damos *s;
- /* Set the min score limit */
- for (cumulated_sz = 0, score = max_score; ; score--) {
- cumulated_sz += quota->histogram[score];
- if (cumulated_sz >= quota->esz || !score)
- break;
- }
- quota->min_score = score;
+ damon_for_each_scheme(s, c) {
+ if (!s->wmarks.activated)
+ continue;
+
+ damos_adjust_quota(c, s);
}
damon_for_each_target(t, c) {
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index efbc2bda8b9c..2a532e3983df 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -8,10 +8,7 @@
#define pr_fmt(fmt) "damon-lru-sort: " fmt
#include <linux/damon.h>
-#include <linux/ioport.h>
#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/workqueue.h>
#include "modules-common.h"
@@ -237,38 +234,31 @@ static int damon_lru_sort_turn(bool on)
return 0;
}
-static struct delayed_work damon_lru_sort_timer;
-static void damon_lru_sort_timer_fn(struct work_struct *work)
-{
- static bool last_enabled;
- bool now_enabled;
-
- now_enabled = enabled;
- if (last_enabled != now_enabled) {
- if (!damon_lru_sort_turn(now_enabled))
- last_enabled = now_enabled;
- else
- enabled = last_enabled;
- }
-}
-static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn);
-
-static bool damon_lru_sort_initialized;
-
static int damon_lru_sort_enabled_store(const char *val,
const struct kernel_param *kp)
{
- int rc = param_set_bool(val, kp);
+ bool is_enabled = enabled;
+ bool enable;
+ int err;
- if (rc < 0)
- return rc;
+ err = strtobool(val, &enable);
+ if (err)
+ return err;
- if (!damon_lru_sort_initialized)
- return rc;
+ if (is_enabled == enable)
+ return 0;
- schedule_delayed_work(&damon_lru_sort_timer, 0);
+ /* Called before init function. The function will handle this. */
+ if (!ctx)
+ goto set_param_out;
- return 0;
+ err = damon_lru_sort_turn(enable);
+ if (err)
+ return err;
+
+set_param_out:
+ enabled = enable;
+ return err;
}
static const struct kernel_param_ops enabled_param_ops = {
@@ -314,29 +304,19 @@ static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c)
static int __init damon_lru_sort_init(void)
{
- ctx = damon_new_ctx();
- if (!ctx)
- return -ENOMEM;
+ int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
- if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
- damon_destroy_ctx(ctx);
- return -EINVAL;
- }
+ if (err)
+ return err;
ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;
- target = damon_new_target();
- if (!target) {
- damon_destroy_ctx(ctx);
- return -ENOMEM;
- }
- damon_add_target(ctx, target);
-
- schedule_delayed_work(&damon_lru_sort_timer, 0);
+ /* 'enabled' was set before this function, probably via command line */
+ if (enabled)
+ err = damon_lru_sort_turn(true);
- damon_lru_sort_initialized = true;
- return 0;
+ return err;
}
module_init(damon_lru_sort_init);
diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c
new file mode 100644
index 000000000000..b2381a8466ec
--- /dev/null
+++ b/mm/damon/modules-common.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Primitives for DAMON Modules
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#include <linux/damon.h>
+
+#include "modules-common.h"
+
+/*
+ * Allocate, set, and return a DAMON context for the physical address space.
+ * @ctxp: Pointer to save the pointer to the newly created context
+ * @targetp: Pointer to save the pointer to the newly created target
+ */
+int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp,
+ struct damon_target **targetp)
+{
+ struct damon_ctx *ctx;
+ struct damon_target *target;
+
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return -ENOMEM;
+
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
+ damon_destroy_ctx(ctx);
+ return -EINVAL;
+ }
+
+ target = damon_new_target();
+ if (!target) {
+ damon_destroy_ctx(ctx);
+ return -ENOMEM;
+ }
+ damon_add_target(ctx, target);
+
+ *ctxp = ctx;
+ *targetp = target;
+ return 0;
+}
diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h
index 5a4921851d32..f49cdb417005 100644
--- a/mm/damon/modules-common.h
+++ b/mm/damon/modules-common.h
@@ -44,3 +44,6 @@
0400); \
module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \
0400);
+
+int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp,
+ struct damon_target **targetp);
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 162c9b1ca00f..e57604bec06d 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -8,10 +8,7 @@
#define pr_fmt(fmt) "damon-reclaim: " fmt
#include <linux/damon.h>
-#include <linux/ioport.h>
#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/workqueue.h>
#include "modules-common.h"
@@ -183,38 +180,31 @@ static int damon_reclaim_turn(bool on)
return 0;
}
-static struct delayed_work damon_reclaim_timer;
-static void damon_reclaim_timer_fn(struct work_struct *work)
-{
- static bool last_enabled;
- bool now_enabled;
-
- now_enabled = enabled;
- if (last_enabled != now_enabled) {
- if (!damon_reclaim_turn(now_enabled))
- last_enabled = now_enabled;
- else
- enabled = last_enabled;
- }
-}
-static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
-
-static bool damon_reclaim_initialized;
-
static int damon_reclaim_enabled_store(const char *val,
const struct kernel_param *kp)
{
- int rc = param_set_bool(val, kp);
+ bool is_enabled = enabled;
+ bool enable;
+ int err;
- if (rc < 0)
- return rc;
+ err = strtobool(val, &enable);
+ if (err)
+ return err;
- /* system_wq might not initialized yet */
- if (!damon_reclaim_initialized)
- return rc;
+ if (is_enabled == enable)
+ return 0;
- schedule_delayed_work(&damon_reclaim_timer, 0);
- return 0;
+ /* Called before init function. The function will handle this. */
+ if (!ctx)
+ goto set_param_out;
+
+ err = damon_reclaim_turn(enable);
+ if (err)
+ return err;
+
+set_param_out:
+ enabled = enable;
+ return err;
}
static const struct kernel_param_ops enabled_param_ops = {
@@ -256,29 +246,19 @@ static int damon_reclaim_after_wmarks_check(struct damon_ctx *c)
static int __init damon_reclaim_init(void)
{
- ctx = damon_new_ctx();
- if (!ctx)
- return -ENOMEM;
+ int err = damon_modules_new_paddr_ctx_target(&ctx, &target);
- if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
- damon_destroy_ctx(ctx);
- return -EINVAL;
- }
+ if (err)
+ return err;
ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check;
ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
- target = damon_new_target();
- if (!target) {
- damon_destroy_ctx(ctx);
- return -ENOMEM;
- }
- damon_add_target(ctx, target);
-
- schedule_delayed_work(&damon_reclaim_timer, 0);
+ /* 'enabled' was set before this function, probably via command line */
+ if (enabled)
+ err = damon_reclaim_turn(true);
- damon_reclaim_initialized = true;
- return 0;
+ return err;
}
module_init(damon_reclaim_init);
diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c
new file mode 100644
index 000000000000..52bebf242f74
--- /dev/null
+++ b/mm/damon/sysfs-common.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Primitives for DAMON Sysfs Interface
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/slab.h>
+
+#include "sysfs-common.h"
+
+DEFINE_MUTEX(damon_sysfs_lock);
+
+/*
+ * unsigned long range directory
+ */
+
+struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(
+ unsigned long min,
+ unsigned long max)
+{
+ struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range),
+ GFP_KERNEL);
+
+ if (!range)
+ return NULL;
+ range->kobj = (struct kobject){};
+ range->min = min;
+ range->max = max;
+
+ return range;
+}
+
+static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+
+ return sysfs_emit(buf, "%lu\n", range->min);
+}
+
+static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+ unsigned long min;
+ int err;
+
+ err = kstrtoul(buf, 0, &min);
+ if (err)
+ return err;
+
+ range->min = min;
+ return count;
+}
+
+static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+
+ return sysfs_emit(buf, "%lu\n", range->max);
+}
+
+static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+ unsigned long max;
+ int err;
+
+ err = kstrtoul(buf, 0, &max);
+ if (err)
+ return err;
+
+ range->max = max;
+ return count;
+}
+
+void damon_sysfs_ul_range_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_ul_range_min_attr =
+ __ATTR_RW_MODE(min, 0600);
+
+static struct kobj_attribute damon_sysfs_ul_range_max_attr =
+ __ATTR_RW_MODE(max, 0600);
+
+static struct attribute *damon_sysfs_ul_range_attrs[] = {
+ &damon_sysfs_ul_range_min_attr.attr,
+ &damon_sysfs_ul_range_max_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_ul_range);
+
+struct kobj_type damon_sysfs_ul_range_ktype = {
+ .release = damon_sysfs_ul_range_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_ul_range_groups,
+};
+
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
new file mode 100644
index 000000000000..56e6a99e353b
--- /dev/null
+++ b/mm/damon/sysfs-common.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common Primitives for DAMON Sysfs Interface
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/damon.h>
+#include <linux/kobject.h>
+
+extern struct mutex damon_sysfs_lock;
+
+struct damon_sysfs_ul_range {
+ struct kobject kobj;
+ unsigned long min;
+ unsigned long max;
+};
+
+struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(
+ unsigned long min,
+ unsigned long max);
+void damon_sysfs_ul_range_release(struct kobject *kobj);
+
+extern struct kobj_type damon_sysfs_ul_range_ktype;
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 9f1219a67e3f..082c55e68e0e 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -5,113 +5,11 @@
* Copyright (c) 2022 SeongJae Park <sj@kernel.org>
*/
-#include <linux/damon.h>
-#include <linux/kobject.h>
#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/slab.h>
-static DEFINE_MUTEX(damon_sysfs_lock);
-
-/*
- * unsigned long range directory
- */
-
-struct damon_sysfs_ul_range {
- struct kobject kobj;
- unsigned long min;
- unsigned long max;
-};
-
-static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(
- unsigned long min,
- unsigned long max)
-{
- struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range),
- GFP_KERNEL);
-
- if (!range)
- return NULL;
- range->kobj = (struct kobject){};
- range->min = min;
- range->max = max;
-
- return range;
-}
-
-static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_ul_range *range = container_of(kobj,
- struct damon_sysfs_ul_range, kobj);
-
- return sysfs_emit(buf, "%lu\n", range->min);
-}
-
-static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct damon_sysfs_ul_range *range = container_of(kobj,
- struct damon_sysfs_ul_range, kobj);
- unsigned long min;
- int err;
-
- err = kstrtoul(buf, 0, &min);
- if (err)
- return err;
-
- range->min = min;
- return count;
-}
-
-static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_ul_range *range = container_of(kobj,
- struct damon_sysfs_ul_range, kobj);
-
- return sysfs_emit(buf, "%lu\n", range->max);
-}
-
-static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct damon_sysfs_ul_range *range = container_of(kobj,
- struct damon_sysfs_ul_range, kobj);
- unsigned long max;
- int err;
-
- err = kstrtoul(buf, 0, &max);
- if (err)
- return err;
-
- range->max = max;
- return count;
-}
-
-static void damon_sysfs_ul_range_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_ul_range_min_attr =
- __ATTR_RW_MODE(min, 0600);
-
-static struct kobj_attribute damon_sysfs_ul_range_max_attr =
- __ATTR_RW_MODE(max, 0600);
-
-static struct attribute *damon_sysfs_ul_range_attrs[] = {
- &damon_sysfs_ul_range_min_attr.attr,
- &damon_sysfs_ul_range_max_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_ul_range);
-
-static struct kobj_type damon_sysfs_ul_range_ktype = {
- .release = damon_sysfs_ul_range_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_ul_range_groups,
-};
+#include "sysfs-common.h"
/*
* schemes/stats directory
@@ -1062,23 +960,12 @@ static struct kobj_type damon_sysfs_schemes_ktype = {
struct damon_sysfs_region {
struct kobject kobj;
- unsigned long start;
- unsigned long end;
+ struct damon_addr_range ar;
};
-static struct damon_sysfs_region *damon_sysfs_region_alloc(
- unsigned long start,
- unsigned long end)
+static struct damon_sysfs_region *damon_sysfs_region_alloc(void)
{
- struct damon_sysfs_region *region = kmalloc(sizeof(*region),
- GFP_KERNEL);
-
- if (!region)
- return NULL;
- region->kobj = (struct kobject){};
- region->start = start;
- region->end = end;
- return region;
+ return kzalloc(sizeof(struct damon_sysfs_region), GFP_KERNEL);
}
static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1087,7 +974,7 @@ static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
struct damon_sysfs_region *region = container_of(kobj,
struct damon_sysfs_region, kobj);
- return sysfs_emit(buf, "%lu\n", region->start);
+ return sysfs_emit(buf, "%lu\n", region->ar.start);
}
static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1095,7 +982,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_region *region = container_of(kobj,
struct damon_sysfs_region, kobj);
- int err = kstrtoul(buf, 0, &region->start);
+ int err = kstrtoul(buf, 0, &region->ar.start);
return err ? err : count;
}
@@ -1106,7 +993,7 @@ static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr,
struct damon_sysfs_region *region = container_of(kobj,
struct damon_sysfs_region, kobj);
- return sysfs_emit(buf, "%lu\n", region->end);
+ return sysfs_emit(buf, "%lu\n", region->ar.end);
}
static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1114,7 +1001,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_region *region = container_of(kobj,
struct damon_sysfs_region, kobj);
- int err = kstrtoul(buf, 0, &region->end);
+ int err = kstrtoul(buf, 0, &region->ar.end);
return err ? err : count;
}
@@ -1187,7 +1074,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions,
regions->regions_arr = regions_arr;
for (i = 0; i < nr_regions; i++) {
- region = damon_sysfs_region_alloc(0, 0);
+ region = damon_sysfs_region_alloc();
if (!region) {
damon_sysfs_regions_rm_dirs(regions);
return -ENOMEM;
@@ -2147,11 +2034,11 @@ static int damon_sysfs_set_regions(struct damon_target *t,
struct damon_sysfs_region *sys_region =
sysfs_regions->regions_arr[i];
- if (sys_region->start > sys_region->end)
+ if (sys_region->ar.start > sys_region->ar.end)
goto out;
- ranges[i].start = sys_region->start;
- ranges[i].end = sys_region->end;
+ ranges[i].start = sys_region->ar.start;
+ ranges[i].end = sys_region->ar.end;
if (i == 0)
continue;
if (ranges[i - 1].end > ranges[i].start)
@@ -2317,6 +2204,25 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
mutex_unlock(&ctx->kdamond_lock);
}
+static void damon_sysfs_schemes_update_stats(
+ struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx)
+{
+ struct damos *scheme;
+ int schemes_idx = 0;
+
+ damon_for_each_scheme(scheme, ctx) {
+ struct damon_sysfs_stats *sysfs_stats;
+
+ sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
+ sysfs_stats->nr_tried = scheme->stat.nr_tried;
+ sysfs_stats->sz_tried = scheme->stat.sz_tried;
+ sysfs_stats->nr_applied = scheme->stat.nr_applied;
+ sysfs_stats->sz_applied = scheme->stat.sz_applied;
+ sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
+ }
+}
+
/*
* damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
* @kdamond: The kobject wrapper that associated to the kdamond thread.
@@ -2329,23 +2235,11 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond)
{
struct damon_ctx *ctx = kdamond->damon_ctx;
- struct damon_sysfs_schemes *sysfs_schemes;
- struct damos *scheme;
- int schemes_idx = 0;
if (!ctx)
return -EINVAL;
- sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes;
- damon_for_each_scheme(scheme, ctx) {
- struct damon_sysfs_stats *sysfs_stats;
-
- sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
- sysfs_stats->nr_tried = scheme->stat.nr_tried;
- sysfs_stats->sz_tried = scheme->stat.sz_tried;
- sysfs_stats->nr_applied = scheme->stat.nr_applied;
- sysfs_stats->sz_applied = scheme->stat.sz_applied;
- sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
- }
+ damon_sysfs_schemes_update_stats(
+ kdamond->contexts->contexts_arr[0]->schemes, ctx);
return 0;
}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index dc7df1254f0a..2b61fde8c38c 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -38,11 +38,7 @@
* Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics
* expectations that are being validated here. All future changes in here
* or the documentation need to be in sync.
- */
-
-#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC)
-
-/*
+ *
* On s390 platform, the lower 4 bits are used to identify given page table
* entry type. But these bits might affect the ability to clear entries with
* pxx_clear() because of how dynamic page table folding works on s390. So
@@ -1125,7 +1121,7 @@ static int __init init_args(struct pgtable_debug_args *args)
*/
memset(args, 0, sizeof(*args));
args->vaddr = get_random_vaddr();
- args->page_prot = vm_get_page_prot(VMFLAGS);
+ args->page_prot = vm_get_page_prot(VM_ACCESS_FLAGS);
args->page_prot_none = vm_get_page_prot(VM_NONE);
args->is_contiguous_page = false;
args->pud_pfn = ULONG_MAX;
diff --git a/mm/filemap.c b/mm/filemap.c
index 08341616ae7a..65eee6ec1066 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2048,10 +2048,10 @@ reset:
*
* Return: The number of entries which were found.
*/
-unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
- XA_STATE(xas, &mapping->i_pages, start);
+ XA_STATE(xas, &mapping->i_pages, *start);
struct folio *folio;
rcu_read_lock();
@@ -2062,6 +2062,15 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
}
rcu_read_unlock();
+ if (folio_batch_count(fbatch)) {
+ unsigned long nr = 1;
+ int idx = folio_batch_count(fbatch) - 1;
+
+ folio = fbatch->folios[idx];
+ if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ nr = folio_nr_pages(folio);
+ *start = indices[idx] + nr;
+ }
return folio_batch_count(fbatch);
}
@@ -2085,16 +2094,16 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
*
* Return: The number of entries which were found.
*/
-unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
- XA_STATE(xas, &mapping->i_pages, start);
+ XA_STATE(xas, &mapping->i_pages, *start);
struct folio *folio;
rcu_read_lock();
while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
if (!xa_is_value(folio)) {
- if (folio->index < start)
+ if (folio->index < *start)
goto put;
if (folio->index + folio_nr_pages(folio) - 1 > end)
goto put;
@@ -2117,6 +2126,15 @@ put:
}
rcu_read_unlock();
+ if (folio_batch_count(fbatch)) {
+ unsigned long nr = 1;
+ int idx = folio_batch_count(fbatch) - 1;
+
+ folio = fbatch->folios[idx];
+ if (!xa_is_value(folio) && !folio_test_hugetlb(folio))
+ nr = folio_nr_pages(folio);
+ *start = indices[idx] + nr;
+ }
return folio_batch_count(fbatch);
}
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index e1e23b4947d7..bac2a366aada 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -108,7 +108,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
struct folio *folio;
folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);
- if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio))
+ if (!folio || xa_is_value(folio))
return &folio->page;
return folio_file_page(folio, index);
}
@@ -124,11 +124,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
-void delete_from_page_cache(struct page *page)
-{
- return filemap_remove_folio(page_folio(page));
-}
-
int try_to_release_page(struct page *page, gfp_t gfp)
{
return filemap_release_folio(page_folio(page), gfp);
diff --git a/mm/gup.c b/mm/gup.c
index fe195d47de74..5182abaaecde 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -25,7 +25,6 @@
#include "internal.h"
struct follow_page_context {
- struct dev_pagemap *pgmap;
unsigned int page_mask;
};
@@ -87,8 +86,7 @@ retry:
* belongs to this folio.
*/
if (unlikely(page_folio(page) != folio)) {
- if (!put_devmap_managed_page_refs(&folio->page, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
goto retry;
}
@@ -184,8 +182,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
refs *= GUP_PIN_COUNTING_BIAS;
}
- if (!put_devmap_managed_page_refs(&folio->page, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
}
/**
@@ -524,8 +521,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page,
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, unsigned int flags,
- struct dev_pagemap **pgmap)
+ unsigned long address, pmd_t *pmd, unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
struct page *page;
@@ -537,42 +533,13 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
-
- /*
- * Considering PTE level hugetlb, like continuous-PTE hugetlb on
- * ARM64 architecture.
- */
- if (is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd_pte(vma, address, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
-
-retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
pte = *ptep;
- if (!pte_present(pte)) {
- swp_entry_t entry;
- /*
- * KSM's break_ksm() relies upon recognizing a ksm page
- * even while it is being migrated, so for that case we
- * need migration_entry_wait().
- */
- if (likely(!(flags & FOLL_MIGRATION)))
- goto no_page;
- if (pte_none(pte))
- goto no_page;
- entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry))
- goto no_page;
- pte_unmap_unlock(ptep, ptl);
- migration_entry_wait(mm, pmd, address);
- goto retry;
- }
+ if (!pte_present(pte))
+ goto no_page;
if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
goto no_page;
@@ -588,17 +555,13 @@ retry:
goto out;
}
- if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
+ if (!page && pte_devmap(pte)) {
/*
- * Only return device mapping pages in the FOLL_GET or FOLL_PIN
- * case since they are only valid while holding the pgmap
- * reference.
+ * ZONE_DEVICE pages are not yet treated as vm_normal_page()
+ * instances, with respect to mapcount and compound-page
+ * metadata
*/
- *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
- if (*pgmap)
- page = pte_page(pte);
- else
- goto no_page;
+ page = pte_page(pte);
} else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
@@ -680,71 +643,22 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
pmdval = READ_ONCE(*pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd_pte(vma, address, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pmd_val(pmdval)), flags,
- PMD_SHIFT);
- if (page)
- return page;
+ if (!pmd_present(pmdval))
return no_page_table(vma, flags);
- }
-retry:
- if (!pmd_present(pmdval)) {
- /*
- * Should never reach here, if thp migration is not supported;
- * Otherwise, it must be a thp migration entry.
- */
- VM_BUG_ON(!thp_migration_supported() ||
- !is_pmd_migration_entry(pmdval));
-
- if (likely(!(flags & FOLL_MIGRATION)))
- return no_page_table(vma, flags);
-
- pmd_migration_entry_wait(mm, pmd);
- pmdval = READ_ONCE(*pmd);
- /*
- * MADV_DONTNEED may convert the pmd to null because
- * mmap_lock is held in read mode
- */
- if (pmd_none(pmdval))
- return no_page_table(vma, flags);
- goto retry;
- }
- if (pmd_devmap(pmdval)) {
- ptl = pmd_lock(mm, pmd);
- page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
- spin_unlock(ptl);
- if (page)
- return page;
- }
- if (likely(!pmd_trans_huge(pmdval)))
- return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ if (likely(!(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))))
+ return follow_page_pte(vma, address, pmd, flags);
if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
return no_page_table(vma, flags);
-retry_locked:
ptl = pmd_lock(mm, pmd);
- if (unlikely(pmd_none(*pmd))) {
- spin_unlock(ptl);
- return no_page_table(vma, flags);
- }
if (unlikely(!pmd_present(*pmd))) {
spin_unlock(ptl);
- if (likely(!(flags & FOLL_MIGRATION)))
- return no_page_table(vma, flags);
- pmd_migration_entry_wait(mm, pmd);
- goto retry_locked;
+ return no_page_table(vma, flags);
}
- if (unlikely(!pmd_trans_huge(*pmd))) {
+ if (unlikely(!(pmd_trans_huge(*pmd) || pmd_devmap(pmdval)))) {
spin_unlock(ptl);
- return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ return follow_page_pte(vma, address, pmd, flags);
}
if (flags & FOLL_SPLIT_PMD) {
int ret;
@@ -762,7 +676,7 @@ retry_locked:
}
return ret ? ERR_PTR(ret) :
- follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ follow_page_pte(vma, address, pmd, flags);
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
@@ -783,23 +697,9 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
pud = pud_offset(p4dp, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
- if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pud(mm, address, pud, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pud_val(*pud)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pud_val(*pud)), flags,
- PUD_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
if (pud_devmap(*pud)) {
ptl = pud_lock(mm, pud);
- page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
+ page = follow_devmap_pud(vma, address, pud, flags);
spin_unlock(ptl);
if (page)
return page;
@@ -816,7 +716,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
struct follow_page_context *ctx)
{
p4d_t *p4d;
- struct page *page;
p4d = p4d_offset(pgdp, address);
if (p4d_none(*p4d))
@@ -825,14 +724,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
if (unlikely(p4d_bad(*p4d)))
return no_page_table(vma, flags);
- if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(p4d_val(*p4d)), flags,
- P4D_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
return follow_pud_mask(vma, address, p4d, flags, ctx);
}
@@ -846,9 +737,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
- * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
- * the device's dev_pagemap metadata to avoid repeating expensive lookups.
- *
* When getting an anonymous page and the caller has to trigger unsharing
* of a shared anonymous page first, -EMLINK is returned. The caller should
* trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
@@ -870,10 +758,18 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
ctx->page_mask = 0;
- /* make this handle hugepd */
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
+ /*
+ * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
+ * special hugetlb page table walking code. This eliminates the
+ * need to check for hugetlb entries in the general walking code.
+ *
+ * hugetlb_follow_page_mask is only for follow_page() handling here.
+ * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ page = hugetlb_follow_page_mask(vma, address, flags);
+ if (!page)
+ page = no_page_table(vma, flags);
return page;
}
@@ -882,28 +778,13 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return no_page_table(vma, flags);
- if (pgd_huge(*pgd)) {
- page = follow_huge_pgd(mm, address, pgd, flags);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
- if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
- page = follow_huge_pd(vma, address,
- __hugepd(pgd_val(*pgd)), flags,
- PGDIR_SHIFT);
- if (page)
- return page;
- return no_page_table(vma, flags);
- }
-
return follow_p4d_mask(vma, address, pgd, flags, ctx);
}
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int foll_flags)
{
- struct follow_page_context ctx = { NULL };
+ struct follow_page_context ctx = { 0 };
struct page *page;
if (vma_is_secretmem(vma))
@@ -913,8 +794,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
return NULL;
page = follow_page_mask(vma, address, foll_flags, &ctx);
- if (ctx.pgmap)
- put_dev_pagemap(ctx.pgmap);
return page;
}
@@ -1163,7 +1042,7 @@ static long __get_user_pages(struct mm_struct *mm,
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
- struct follow_page_context ctx = { NULL };
+ struct follow_page_context ctx = { 0 };
if (!nr_pages)
return 0;
@@ -1278,8 +1157,6 @@ next_page:
nr_pages -= page_increm;
} while (nr_pages);
out:
- if (ctx.pgmap)
- put_dev_pagemap(ctx.pgmap);
return i ? i : ret;
}
@@ -2422,9 +2299,8 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
- struct dev_pagemap *pgmap = NULL;
- int nr_start = *nr, ret = 0;
pte_t *ptep, *ptem;
+ int ret = 0;
ptem = ptep = pte_offset_map(&pmd, addr);
do {
@@ -2441,12 +2317,6 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
if (pte_devmap(pte)) {
if (unlikely(flags & FOLL_LONGTERM))
goto pte_unmap;
-
- pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
- goto pte_unmap;
- }
} else if (pte_special(pte))
goto pte_unmap;
@@ -2494,8 +2364,6 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
ret = 1;
pte_unmap:
- if (pgmap)
- put_dev_pagemap(pgmap);
pte_unmap(ptem);
return ret;
}
@@ -2523,28 +2391,17 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
- int nr_start = *nr;
- struct dev_pagemap *pgmap = NULL;
-
do {
struct page *page = pfn_to_page(pfn);
- pgmap = get_dev_pagemap(pfn, pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
- break;
- }
SetPageReferenced(page);
pages[*nr] = page;
- if (unlikely(!try_grab_page(page, flags))) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
+ if (unlikely(!try_grab_page(page, flags)))
break;
- }
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
- put_dev_pagemap(pgmap);
return addr == end;
}
@@ -2553,16 +2410,14 @@ static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
struct page **pages, int *nr)
{
unsigned long fault_pfn;
- int nr_start = *nr;
fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
return 0;
- if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
+ if (unlikely(pmd_val(orig) != pmd_val(*pmdp)))
return 0;
- }
+
return 1;
}
@@ -2571,16 +2426,13 @@ static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
struct page **pages, int *nr)
{
unsigned long fault_pfn;
- int nr_start = *nr;
fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
return 0;
- if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
+ if (unlikely(pud_val(orig) != pud_val(*pudp)))
return 0;
- }
return 1;
}
#else
diff --git a/mm/gup_test.c b/mm/gup_test.c
index 12b0a91767d3..0d76d9b4bb5a 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -203,6 +203,135 @@ free_pages:
return ret;
}
+static DEFINE_MUTEX(pin_longterm_test_mutex);
+static struct page **pin_longterm_test_pages;
+static unsigned long pin_longterm_test_nr_pages;
+
+static inline void pin_longterm_test_stop(void)
+{
+ if (pin_longterm_test_pages) {
+ if (pin_longterm_test_nr_pages)
+ unpin_user_pages(pin_longterm_test_pages,
+ pin_longterm_test_nr_pages);
+ kfree(pin_longterm_test_pages);
+ pin_longterm_test_pages = NULL;
+ pin_longterm_test_nr_pages = 0;
+ }
+}
+
+static inline int pin_longterm_test_start(unsigned long arg)
+{
+ long nr_pages, cur_pages, addr, remaining_pages;
+ int gup_flags = FOLL_LONGTERM;
+ struct pin_longterm_test args;
+ struct page **pages;
+ int ret = 0;
+ bool fast;
+
+ if (pin_longterm_test_pages)
+ return -EINVAL;
+
+ if (copy_from_user(&args, (void __user *)arg, sizeof(args)))
+ return -EFAULT;
+
+ if (args.flags &
+ ~(PIN_LONGTERM_TEST_FLAG_USE_WRITE|PIN_LONGTERM_TEST_FLAG_USE_FAST))
+ return -EINVAL;
+ if (!IS_ALIGNED(args.addr | args.size, PAGE_SIZE))
+ return -EINVAL;
+ if (args.size > LONG_MAX)
+ return -EINVAL;
+ nr_pages = args.size / PAGE_SIZE;
+ if (!nr_pages)
+ return -EINVAL;
+
+ pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ if (args.flags & PIN_LONGTERM_TEST_FLAG_USE_WRITE)
+ gup_flags |= FOLL_WRITE;
+ fast = !!(args.flags & PIN_LONGTERM_TEST_FLAG_USE_FAST);
+
+ if (!fast && mmap_read_lock_killable(current->mm)) {
+ kfree(pages);
+ return -EINTR;
+ }
+
+ pin_longterm_test_pages = pages;
+ pin_longterm_test_nr_pages = 0;
+
+ while (nr_pages - pin_longterm_test_nr_pages) {
+ remaining_pages = nr_pages - pin_longterm_test_nr_pages;
+ addr = args.addr + pin_longterm_test_nr_pages * PAGE_SIZE;
+
+ if (fast)
+ cur_pages = pin_user_pages_fast(addr, remaining_pages,
+ gup_flags, pages);
+ else
+ cur_pages = pin_user_pages(addr, remaining_pages,
+ gup_flags, pages, NULL);
+ if (cur_pages < 0) {
+ pin_longterm_test_stop();
+ ret = cur_pages;
+ break;
+ }
+ pin_longterm_test_nr_pages += cur_pages;
+ pages += cur_pages;
+ }
+
+ if (!fast)
+ mmap_read_unlock(current->mm);
+ return ret;
+}
+
+static inline int pin_longterm_test_read(unsigned long arg)
+{
+ __u64 user_addr;
+ unsigned long i;
+
+ if (!pin_longterm_test_pages)
+ return -EINVAL;
+
+ if (copy_from_user(&user_addr, (void __user *)arg, sizeof(user_addr)))
+ return -EFAULT;
+
+ for (i = 0; i < pin_longterm_test_nr_pages; i++) {
+ void *addr = page_to_virt(pin_longterm_test_pages[i]);
+
+ if (copy_to_user((void __user *)(unsigned long)user_addr, addr,
+ PAGE_SIZE))
+ return -EFAULT;
+ user_addr += PAGE_SIZE;
+ }
+ return 0;
+}
+
+static long pin_longterm_test_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ int ret = -EINVAL;
+
+ if (mutex_lock_killable(&pin_longterm_test_mutex))
+ return -EINTR;
+
+ switch (cmd) {
+ case PIN_LONGTERM_TEST_START:
+ ret = pin_longterm_test_start(arg);
+ break;
+ case PIN_LONGTERM_TEST_STOP:
+ pin_longterm_test_stop();
+ ret = 0;
+ break;
+ case PIN_LONGTERM_TEST_READ:
+ ret = pin_longterm_test_read(arg);
+ break;
+ }
+
+ mutex_unlock(&pin_longterm_test_mutex);
+ return ret;
+}
+
static long gup_test_ioctl(struct file *filep, unsigned int cmd,
unsigned long arg)
{
@@ -217,6 +346,10 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd,
case PIN_BASIC_TEST:
case DUMP_USER_PAGES_TEST:
break;
+ case PIN_LONGTERM_TEST_START:
+ case PIN_LONGTERM_TEST_STOP:
+ case PIN_LONGTERM_TEST_READ:
+ return pin_longterm_test_ioctl(filep, cmd, arg);
default:
return -EINVAL;
}
@@ -234,9 +367,17 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd,
return 0;
}
+static int gup_test_release(struct inode *inode, struct file *file)
+{
+ pin_longterm_test_stop();
+
+ return 0;
+}
+
static const struct file_operations gup_test_fops = {
.open = nonseekable_open,
.unlocked_ioctl = gup_test_ioctl,
+ .release = gup_test_release,
};
static int __init gup_test_init(void)
diff --git a/mm/gup_test.h b/mm/gup_test.h
index 887ac1d5f5bc..5b37b54e8bea 100644
--- a/mm/gup_test.h
+++ b/mm/gup_test.h
@@ -10,6 +10,9 @@
#define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test)
#define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test)
#define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test)
+#define PIN_LONGTERM_TEST_START _IOW('g', 7, struct pin_longterm_test)
+#define PIN_LONGTERM_TEST_STOP _IO('g', 8)
+#define PIN_LONGTERM_TEST_READ _IOW('g', 9, __u64)
#define GUP_TEST_MAX_PAGES_TO_DUMP 8
@@ -30,4 +33,13 @@ struct gup_test {
__u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP];
};
+#define PIN_LONGTERM_TEST_FLAG_USE_WRITE 1
+#define PIN_LONGTERM_TEST_FLAG_USE_FAST 2
+
+struct pin_longterm_test {
+ __u64 addr;
+ __u64 size;
+ __u32 flags;
+};
+
#endif /* __GUP_TEST_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 561a42567477..1ed245f6d1a7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1029,49 +1029,6 @@ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
update_mmu_cache_pmd(vma, addr, pmd);
}
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
- unsigned long pfn = pmd_pfn(*pmd);
- struct mm_struct *mm = vma->vm_mm;
- struct page *page;
-
- assert_spin_locked(pmd_lockptr(mm, pmd));
-
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
- return NULL;
-
- if (flags & FOLL_WRITE && !pmd_write(*pmd))
- return NULL;
-
- if (pmd_present(*pmd) && pmd_devmap(*pmd))
- /* pass */;
- else
- return NULL;
-
- if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
-
- /*
- * device mapped pages can only be returned if the
- * caller will manage the page reference count.
- */
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return ERR_PTR(-EEXIST);
-
- pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- *pgmap = get_dev_pagemap(pfn, *pgmap);
- if (!*pgmap)
- return ERR_PTR(-EFAULT);
- page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
- page = ERR_PTR(-ENOMEM);
-
- return page;
-}
-
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
@@ -1188,7 +1145,7 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
}
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags, struct dev_pagemap **pgmap)
+ pud_t *pud, int flags)
{
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
@@ -1222,9 +1179,6 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
return ERR_PTR(-EEXIST);
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
- *pgmap = get_dev_pagemap(pfn, *pgmap);
- if (!*pgmap)
- return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
if (!try_grab_page(page, flags))
page = ERR_PTR(-ENOMEM);
@@ -1379,7 +1333,7 @@ reuse:
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
- return VM_FAULT_WRITE;
+ return 0;
}
unlock_fallback:
@@ -2712,7 +2666,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* split PMDs
*/
if (!can_split_folio(folio, &extra_pins)) {
- ret = -EBUSY;
+ ret = -EAGAIN;
goto out_unlock;
}
@@ -2762,7 +2716,7 @@ fail:
xas_unlock(&xas);
local_irq_enable();
remap_page(folio, folio_nr_pages(folio));
- ret = -EBUSY;
+ ret = -EAGAIN;
}
out_unlock:
@@ -3066,28 +3020,28 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
mapping = candidate->f_mapping;
for (index = off_start; index < off_end; index += nr_pages) {
- struct page *fpage = pagecache_get_page(mapping, index,
- FGP_ENTRY | FGP_HEAD, 0);
+ struct folio *folio = __filemap_get_folio(mapping, index,
+ FGP_ENTRY, 0);
nr_pages = 1;
- if (xa_is_value(fpage) || !fpage)
+ if (xa_is_value(folio) || !folio)
continue;
- if (!is_transparent_hugepage(fpage))
+ if (!folio_test_large(folio))
goto next;
total++;
- nr_pages = thp_nr_pages(fpage);
+ nr_pages = folio_nr_pages(folio);
- if (!trylock_page(fpage))
+ if (!folio_trylock(folio))
goto next;
- if (!split_huge_page(fpage))
+ if (!split_folio(folio))
split++;
- unlock_page(fpage);
+ folio_unlock(folio);
next:
- put_page(fpage);
+ folio_put(folio);
cond_resched();
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 46637872ac40..4a8c8456555e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1446,9 +1446,10 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page,
bool demote)
{
int nid = page_to_nid(page);
+ struct folio *folio = page_folio(page);
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+ VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
+ VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
lockdep_assert_held(&hugetlb_lock);
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
@@ -1703,21 +1704,22 @@ void free_huge_page(struct page *page)
* Can't pass hstate in here because it is called from the
* compound page destructor.
*/
- struct hstate *h = page_hstate(page);
- int nid = page_to_nid(page);
- struct hugepage_subpool *spool = hugetlb_page_subpool(page);
+ struct folio *folio = page_folio(page);
+ struct hstate *h = folio_hstate(folio);
+ int nid = folio_nid(folio);
+ struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
bool restore_reserve;
unsigned long flags;
- VM_BUG_ON_PAGE(page_count(page), page);
- VM_BUG_ON_PAGE(page_mapcount(page), page);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+ VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
- hugetlb_set_page_subpool(page, NULL);
- if (PageAnon(page))
- __ClearPageAnonExclusive(page);
- page->mapping = NULL;
- restore_reserve = HPageRestoreReserve(page);
- ClearHPageRestoreReserve(page);
+ hugetlb_set_folio_subpool(folio, NULL);
+ if (folio_test_anon(folio))
+ __ClearPageAnonExclusive(&folio->page);
+ folio->mapping = NULL;
+ restore_reserve = folio_test_hugetlb_restore_reserve(folio);
+ folio_clear_hugetlb_restore_reserve(folio);
/*
* If HPageRestoreReserve was set on page, page allocation consumed a
@@ -1739,15 +1741,15 @@ void free_huge_page(struct page *page)
}
spin_lock_irqsave(&hugetlb_lock, flags);
- ClearHPageMigratable(page);
- hugetlb_cgroup_uncharge_page(hstate_index(h),
- pages_per_huge_page(h), page);
- hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
- pages_per_huge_page(h), page);
+ folio_clear_hugetlb_migratable(folio);
+ hugetlb_cgroup_uncharge_folio(hstate_index(h),
+ pages_per_huge_page(h), folio);
+ hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
+ pages_per_huge_page(h), folio);
if (restore_reserve)
h->resv_huge_pages++;
- if (HPageTemporary(page)) {
+ if (folio_test_hugetlb_temporary(folio)) {
remove_hugetlb_page(h, page, false);
spin_unlock_irqrestore(&hugetlb_lock, flags);
update_and_free_page(h, page, true);
@@ -1773,19 +1775,21 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
h->nr_huge_pages_node[nid]++;
}
-static void __prep_new_huge_page(struct hstate *h, struct page *page)
+static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- hugetlb_vmemmap_optimize(h, page);
- INIT_LIST_HEAD(&page->lru);
- set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- hugetlb_set_page_subpool(page, NULL);
- set_hugetlb_cgroup(page, NULL);
- set_hugetlb_cgroup_rsvd(page, NULL);
+ hugetlb_vmemmap_optimize(h, &folio->page);
+ INIT_LIST_HEAD(&folio->lru);
+ folio->_folio_dtor = HUGETLB_PAGE_DTOR;
+ hugetlb_set_folio_subpool(folio, NULL);
+ set_hugetlb_cgroup(folio, NULL);
+ set_hugetlb_cgroup_rsvd(folio, NULL);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
- __prep_new_huge_page(h, page);
+ struct folio *folio = page_folio(page);
+
+ __prep_new_hugetlb_folio(h, folio);
spin_lock_irq(&hugetlb_lock);
__prep_account_new_huge_page(h, nid);
spin_unlock_irq(&hugetlb_lock);
@@ -2745,8 +2749,10 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
struct list_head *list)
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
- int nid = page_to_nid(old_page);
+ struct folio *old_folio = page_folio(old_page);
+ int nid = folio_nid(old_folio);
struct page *new_page;
+ struct folio *new_folio;
int ret = 0;
/*
@@ -2759,16 +2765,17 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
if (!new_page)
return -ENOMEM;
- __prep_new_huge_page(h, new_page);
+ new_folio = page_folio(new_page);
+ __prep_new_hugetlb_folio(h, new_folio);
retry:
spin_lock_irq(&hugetlb_lock);
- if (!PageHuge(old_page)) {
+ if (!folio_test_hugetlb(old_folio)) {
/*
* Freed from under us. Drop new_page too.
*/
goto free_new;
- } else if (page_count(old_page)) {
+ } else if (folio_ref_count(old_folio)) {
/*
* Someone has grabbed the page, try to isolate it here.
* Fail with -EBUSY if not possible.
@@ -2777,7 +2784,7 @@ retry:
ret = isolate_hugetlb(old_page, list);
spin_lock_irq(&hugetlb_lock);
goto free_new;
- } else if (!HPageFreed(old_page)) {
+ } else if (!folio_test_hugetlb(old_folio)) {
/*
* Page's refcount is 0 but it has not been enqueued in the
* freelist yet. Race window is small, so we can succeed here if
@@ -2815,7 +2822,7 @@ retry:
free_new:
spin_unlock_irq(&hugetlb_lock);
/* Page has a zero ref count, but needs a ref to be freed */
- set_page_refcounted(new_page);
+ folio_ref_unfreeze(new_folio, 1);
update_and_free_page(h, new_page, false);
return ret;
@@ -2824,7 +2831,7 @@ free_new:
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
{
struct hstate *h;
- struct page *head;
+ struct folio *folio = page_folio(page);
int ret = -EBUSY;
/*
@@ -2833,9 +2840,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
* Return success when racing as if we dissolved the page ourselves.
*/
spin_lock_irq(&hugetlb_lock);
- if (PageHuge(page)) {
- head = compound_head(page);
- h = page_hstate(head);
+ if (folio_test_hugetlb(folio)) {
+ h = folio_hstate(folio);
} else {
spin_unlock_irq(&hugetlb_lock);
return 0;
@@ -2850,10 +2856,10 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
if (hstate_is_gigantic(h))
return -ENOMEM;
- if (page_count(head) && !isolate_hugetlb(head, list))
+ if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list))
ret = 0;
- else if (!page_count(head))
- ret = alloc_and_dissolve_huge_page(h, head, list);
+ else if (!folio_ref_count(folio))
+ ret = alloc_and_dissolve_huge_page(h, &folio->page, list);
return ret;
}
@@ -2864,6 +2870,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
+ struct folio *folio;
long map_chg, map_commit;
long gbl_chg;
int ret, idx;
@@ -2927,6 +2934,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
* a reservation exists for the allocation.
*/
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
+
if (!page) {
spin_unlock_irq(&hugetlb_lock);
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
@@ -2941,6 +2949,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
set_page_refcounted(page);
/* Fall through */
}
+ folio = page_folio(page);
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
@@ -2970,8 +2979,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
rsv_adjust = hugepage_subpool_put_pages(spool, 1);
hugetlb_acct_memory(h, -rsv_adjust);
if (deferred_reserve)
- hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
- pages_per_huge_page(h), page);
+ hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
+ pages_per_huge_page(h), folio);
}
return page;
@@ -4775,7 +4784,6 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
hugepage_add_new_anon_rmap(new_page, vma, addr);
set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
- ClearHPageRestoreReserve(new_page);
SetHPageMigratable(new_page);
}
@@ -5115,7 +5123,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
/*
* If the pte was wr-protected by uffd-wp in any of the
* swap forms, meanwhile the caller does not want to
@@ -5127,7 +5134,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
else
-#endif
huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
@@ -5156,13 +5162,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
-#endif
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, vma, true);
@@ -5476,8 +5480,6 @@ retry_avoidcopy:
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
- ClearHPageRestoreReserve(new_page);
-
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
@@ -5772,10 +5774,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (!pte_same(huge_ptep_get(ptep), old_pte))
goto backout;
- if (anon_rmap) {
- ClearHPageRestoreReserve(page);
+ if (anon_rmap)
hugepage_add_new_anon_rmap(page, vma, haddr);
- } else
+ else
page_dup_file_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
@@ -6149,6 +6150,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ptl = huge_pte_lock(h, dst_mm, dst_pte);
+ ret = -EIO;
+ if (PageHWPoison(page))
+ goto out_release_unlock;
+
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
@@ -6158,12 +6163,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
- if (page_in_pagecache) {
+ if (page_in_pagecache)
page_dup_file_rmap(page, true);
- } else {
- ClearHPageRestoreReserve(page);
+ else
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
- }
/*
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
@@ -6247,6 +6250,72 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
return false;
}
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & huge_page_mask(h);
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *pte, entry;
+
+ /*
+ * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+ * follow_hugetlb_page().
+ */
+ if (WARN_ON_ONCE(flags & FOLL_PIN))
+ return NULL;
+
+retry:
+ /*
+ * vma lock prevents racing with another thread doing a pmd unshare.
+ * This keeps pte as returned by huge_pte_offset valid.
+ */
+ hugetlb_vma_lock_read(vma);
+
+ pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+ if (!pte) {
+ hugetlb_vma_unlock_read(vma);
+ return NULL;
+ }
+
+ ptl = huge_pte_lock(h, mm, pte);
+ entry = huge_ptep_get(pte);
+ if (pte_present(entry)) {
+ page = pte_page(entry) +
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ /*
+ * Note that page may be a sub-page, and with vmemmap
+ * optimizations the page struct may be read only.
+ * try_grab_page() will increase the ref count on the
+ * head page, so this will be OK.
+ *
+ * try_grab_page() should always succeed here, because we hold
+ * the ptl lock and have verified pte_present().
+ */
+ if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ page = NULL;
+ goto out;
+ }
+ } else {
+ if (is_hugetlb_entry_migration(entry)) {
+ spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
+ __migration_entry_wait_huge(pte, ptl);
+ goto retry;
+ }
+ /*
+ * hwpoisoned entry is treated as no_page_table in
+ * follow_page_mask().
+ */
+ }
+out:
+ spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
+ return page;
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -7239,122 +7308,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
* These functions are overwritable if your architecture needs its own
* behavior.
*/
-struct page * __weak
-follow_huge_addr(struct mm_struct *mm, unsigned long address,
- int write)
-{
- return ERR_PTR(-EINVAL);
-}
-
-struct page * __weak
-follow_huge_pd(struct vm_area_struct *vma,
- unsigned long address, hugepd_t hpd, int flags, int pdshift)
-{
- WARN(1, "hugepd follow called with no support for hugepage directory format\n");
- return NULL;
-}
-
-struct page * __weak
-follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
-{
- struct hstate *h = hstate_vma(vma);
- struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t *ptep, pte;
-
- /*
- * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
- * follow_hugetlb_page().
- */
- if (WARN_ON_ONCE(flags & FOLL_PIN))
- return NULL;
-
-retry:
- ptep = huge_pte_offset(mm, address, huge_page_size(h));
- if (!ptep)
- return NULL;
-
- ptl = huge_pte_lock(h, mm, ptep);
- pte = huge_ptep_get(ptep);
- if (pte_present(pte)) {
- page = pte_page(pte) +
- ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
- /*
- * try_grab_page() should always succeed here, because: a) we
- * hold the pmd (ptl) lock, and b) we've just checked that the
- * huge pmd (head) page is present in the page tables. The ptl
- * prevents the head page and tail pages from being rearranged
- * in any way. So this page must be available at this point,
- * unless the page refcount overflowed:
- */
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
- page = NULL;
- goto out;
- }
- } else {
- if (is_hugetlb_entry_migration(pte)) {
- spin_unlock(ptl);
- __migration_entry_wait_huge(ptep, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
- }
-out:
- spin_unlock(ptl);
- return page;
-}
-
-struct page * __weak
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags)
-{
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t pte;
-
- if (WARN_ON_ONCE(flags & FOLL_PIN))
- return NULL;
-
-retry:
- ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
- if (!pud_huge(*pud))
- goto out;
- pte = huge_ptep_get((pte_t *)pud);
- if (pte_present(pte)) {
- page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
- page = NULL;
- goto out;
- }
- } else {
- if (is_hugetlb_entry_migration(pte)) {
- spin_unlock(ptl);
- __migration_entry_wait(mm, (pte_t *)pud, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
- }
-out:
- spin_unlock(ptl);
- return page;
-}
-
-struct page * __weak
-follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
-{
- if (flags & (FOLL_GET | FOLL_PIN))
- return NULL;
-
- return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
-}
-
int isolate_hugetlb(struct page *page, struct list_head *list)
{
int ret = 0;
@@ -7373,7 +7326,7 @@ unlock:
return ret;
}
-int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison)
{
int ret = 0;
@@ -7383,7 +7336,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
*hugetlb = true;
if (HPageFreed(page))
ret = 0;
- else if (HPageMigratable(page))
+ else if (HPageMigratable(page) || unpoison)
ret = get_page_unless_zero(page);
else
ret = -EBUSY;
@@ -7392,12 +7345,13 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
return ret;
}
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
int ret;
spin_lock_irq(&hugetlb_lock);
- ret = __get_huge_page_for_hwpoison(pfn, flags);
+ ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
spin_unlock_irq(&hugetlb_lock);
return ret;
}
@@ -7411,15 +7365,15 @@ void putback_active_hugepage(struct page *page)
put_page(page);
}
-void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
{
- struct hstate *h = page_hstate(oldpage);
+ struct hstate *h = folio_hstate(old_folio);
- hugetlb_cgroup_migrate(oldpage, newpage);
- set_page_owner_migrate_reason(newpage, reason);
+ hugetlb_cgroup_migrate(old_folio, new_folio);
+ set_page_owner_migrate_reason(&new_folio->page, reason);
/*
- * transfer temporary state of the new huge page. This is
+ * transfer temporary state of the new hugetlb folio. This is
* reverse to other transitions because the newpage is going to
* be final while the old one will be freed so it takes over
* the temporary status.
@@ -7428,12 +7382,14 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
* here as well otherwise the global surplus count will not match
* the per-node's.
*/
- if (HPageTemporary(newpage)) {
- int old_nid = page_to_nid(oldpage);
- int new_nid = page_to_nid(newpage);
+ if (folio_test_hugetlb_temporary(new_folio)) {
+ int old_nid = folio_nid(old_folio);
+ int new_nid = folio_nid(new_folio);
+
+
+ folio_set_hugetlb_temporary(old_folio);
+ folio_clear_hugetlb_temporary(new_folio);
- SetHPageTemporary(oldpage);
- ClearHPageTemporary(newpage);
/*
* There is no need to transfer the per-node surplus state
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index f61d132df52b..d9e4425d81ac 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -191,8 +191,9 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
struct page_counter *counter;
struct hugetlb_cgroup *page_hcg;
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
+ struct folio *folio = page_folio(page);
- page_hcg = hugetlb_cgroup_from_page(page);
+ page_hcg = hugetlb_cgroup_from_folio(folio);
/*
* We can have pages in active list without any cgroup
* ie, hugepage with less than 3 pages. We can safely
@@ -211,7 +212,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
/* Take the pages off the local counter */
page_counter_cancel(counter, nr_pages);
- set_hugetlb_cgroup(page, parent);
+ set_hugetlb_cgroup(folio, parent);
out:
return;
}
@@ -309,21 +310,21 @@ int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
/* Should be called with hugetlb_lock held */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page, bool rsvd)
+ struct folio *folio, bool rsvd)
{
if (hugetlb_cgroup_disabled() || !h_cg)
return;
- __set_hugetlb_cgroup(page, h_cg, rsvd);
+ __set_hugetlb_cgroup(folio, h_cg, rsvd);
if (!rsvd) {
unsigned long usage =
- h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
+ h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
/*
* This write is not atomic due to fetching usage and writing
* to it, but that's fine because we call this with
* hugetlb_lock held anyway.
*/
- WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
+ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
usage + nr_pages);
}
}
@@ -332,31 +333,35 @@ void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
struct page *page)
{
- __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false);
+ struct folio *folio = page_folio(page);
+
+ __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
}
void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
struct page *page)
{
- __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true);
+ struct folio *folio = page_folio(page);
+
+ __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
}
/*
* Should be called with hugetlb_lock held
*/
-static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page, bool rsvd)
+static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
+ struct folio *folio, bool rsvd)
{
struct hugetlb_cgroup *h_cg;
if (hugetlb_cgroup_disabled())
return;
lockdep_assert_held(&hugetlb_lock);
- h_cg = __hugetlb_cgroup_from_page(page, rsvd);
+ h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
if (unlikely(!h_cg))
return;
- __set_hugetlb_cgroup(page, NULL, rsvd);
+ __set_hugetlb_cgroup(folio, NULL, rsvd);
page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
rsvd),
@@ -366,27 +371,27 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
css_put(&h_cg->css);
else {
unsigned long usage =
- h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
+ h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
/*
* This write is not atomic due to fetching usage and writing
* to it, but that's fine because we call this with
* hugetlb_lock held anyway.
*/
- WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
+ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
usage - nr_pages);
}
}
-void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page)
+void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
+ struct folio *folio)
{
- __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false);
+ __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
}
-void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages,
- struct page *page)
+void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
+ struct folio *folio)
{
- __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true);
+ __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
}
static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
@@ -883,25 +888,25 @@ void __init hugetlb_cgroup_file_init(void)
* hugetlb_lock will make sure a parallel cgroup rmdir won't happen
* when we migrate hugepages
*/
-void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
+void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
{
struct hugetlb_cgroup *h_cg;
struct hugetlb_cgroup *h_cg_rsvd;
- struct hstate *h = page_hstate(oldhpage);
+ struct hstate *h = folio_hstate(old_folio);
if (hugetlb_cgroup_disabled())
return;
spin_lock_irq(&hugetlb_lock);
- h_cg = hugetlb_cgroup_from_page(oldhpage);
- h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage);
- set_hugetlb_cgroup(oldhpage, NULL);
- set_hugetlb_cgroup_rsvd(oldhpage, NULL);
+ h_cg = hugetlb_cgroup_from_folio(old_folio);
+ h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
+ set_hugetlb_cgroup(old_folio, NULL);
+ set_hugetlb_cgroup_rsvd(old_folio, NULL);
/* move the h_cg details to new cgroup */
- set_hugetlb_cgroup(newhpage, h_cg);
- set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd);
- list_move(&newhpage->lru, &h->hugepage_activelist);
+ set_hugetlb_cgroup(new_folio, h_cg);
+ set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
+ list_move(&new_folio->lru, &h->hugepage_activelist);
spin_unlock_irq(&hugetlb_lock);
return;
}
diff --git a/mm/internal.h b/mm/internal.h
index 6b7ef495b56d..cb4c663a714e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ struct folio_batch;
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
- __GFP_ATOMIC|__GFP_NOLOCKDEP)
+ __GFP_NOLOCKDEP)
/* The GFP flags allowed during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
@@ -106,9 +106,9 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
force_page_cache_ra(&ractl, nr_to_read);
}
-unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
-unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
void filemap_free_folio(struct address_space *mapping, struct folio *folio);
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
@@ -708,14 +708,6 @@ extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
-#ifdef CONFIG_MEMORY_FAILURE
-void clear_hwpoisoned_pages(struct page *memmap, int nr_pages);
-#else
-static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
-}
-#endif
-
extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index abbcc1b0eec5..a84491bc4867 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -261,14 +261,6 @@ struct kasan_stack_ring {
#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
-#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-/* Used in KUnit-compatible KASAN tests. */
-struct kunit_kasan_status {
- bool report_found;
- bool sync_fault;
-};
-#endif
-
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 6167e19056f0..7502f03c807c 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -5,8 +5,12 @@
* Author: Andrey Ryabinin <a.ryabinin@samsung.com>
*/
+#define pr_fmt(fmt) "kasan_test: " fmt
+
+#include <kunit/test.h>
#include <linux/bitops.h>
#include <linux/delay.h>
+#include <linux/io.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -14,21 +18,28 @@
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/random.h>
+#include <linux/set_memory.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/tracepoint.h>
#include <linux/uaccess.h>
-#include <linux/io.h>
#include <linux/vmalloc.h>
-#include <linux/set_memory.h>
+#include <trace/events/printk.h>
#include <asm/page.h>
-#include <kunit/test.h>
-
#include "kasan.h"
#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
+static bool multishot;
+
+/*
+ * Fields set based on lines observed in the console.
+ *
+ * report_found: set by probe_console() when a line containing
+ *               "BUG: KASAN: " is printed.
+ * async_fault:  set by probe_console() when a line containing
+ *               "Asynchronous fault: " is printed.
+ *
+ * Both flags are reset to false at the end of KUNIT_EXPECT_KASAN_FAIL().
+ */
+static struct {
+ bool report_found;
+ bool async_fault;
+} test_status;
+
/*
* Some tests use these global variables to store return values from function
* calls that could otherwise be eliminated by the compiler as dead code.
@@ -36,35 +47,61 @@
void *kasan_ptr_result;
int kasan_int_result;
-static struct kunit_resource resource;
-static struct kunit_kasan_status test_status;
-static bool multishot;
+/*
+ * Probe for console output: records in test_status which lines of interest
+ * have been printed. Attached to the "console" tracepoint by
+ * register_tracepoints().
+ *
+ * @ignore: probe data pointer, unused (the probe is registered with NULL).
+ * @buf:    console text to inspect.
+ * @len:    number of bytes of @buf that strnstr() may search.
+ *
+ * A buffer containing "BUG: KASAN: " sets report_found; only when that marker
+ * is absent is the buffer checked for "Asynchronous fault: ", which sets
+ * async_fault. The flags are written with WRITE_ONCE() and read elsewhere
+ * with READ_ONCE().
+ */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ if (strnstr(buf, "BUG: KASAN: ", len))
+ WRITE_ONCE(test_status.report_found, true);
+ else if (strnstr(buf, "Asynchronous fault: ", len))
+ WRITE_ONCE(test_status.async_fault, true);
+}
-/*
- * Temporarily enable multi-shot mode. Otherwise, KASAN would only report the
- * first detected bug and panic the kernel if panic_on_warn is enabled. For
- * hardware tag-based KASAN also allow tag checking to be reenabled for each
- * test, see the comment for KUNIT_EXPECT_KASAN_FAIL().
- */
-static int kasan_test_init(struct kunit *test)
+/*
+ * Callback passed to for_each_kernel_tracepoint() by kasan_suite_init():
+ * attaches probe_console() to the tracepoint named "console" and ignores
+ * every other tracepoint. A non-zero return from
+ * tracepoint_probe_register() triggers WARN_ON().
+ *
+ * @tp:     tracepoint currently being visited.
+ * @ignore: unused iterator cookie.
+ */
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ /* NOTE(review): presumably a build-time check that probe_console() has the console tracepoint's prototype - confirm. */
+ check_trace_callback_type_console(probe_console);
+ if (!strcmp(tp->name, "console"))
+ WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+/*
+ * Counterpart of register_tracepoints(), passed to
+ * for_each_kernel_tracepoint() by kasan_suite_exit(): detaches
+ * probe_console() from the tracepoint named "console". The return value of
+ * tracepoint_probe_unregister() is not checked.
+ *
+ * @tp:     tracepoint currently being visited.
+ * @ignore: unused iterator cookie.
+ */
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ if (!strcmp(tp->name, "console"))
+ tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+/*
+ * Suite-level setup, run once via .suite_init before any KASAN test case.
+ *
+ * Refuses to run (returns -1) when KASAN is disabled. Otherwise saves the
+ * current multi-shot setting in 'multishot' while enabling multi-shot mode,
+ * and hooks probe_console() onto the "console" tracepoint so that printed
+ * KASAN reports are reflected in test_status. Returns 0 on success.
+ */
+static int kasan_suite_init(struct kunit_suite *suite)
{
if (!kasan_enabled()) {
- kunit_err(test, "can't run KASAN tests with KASAN disabled");
+ /* Terminate the message with a newline so the log line is complete. */
+ pr_err("Can't run KASAN tests with KASAN disabled\n");
return -1;
}
+ /*
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only
+ * report the first detected bug and panic the kernel if panic_on_warn
+ * is enabled.
+ */
multishot = kasan_save_enable_multi_shot();
- test_status.report_found = false;
- test_status.sync_fault = false;
- kunit_add_named_resource(test, NULL, NULL, &resource,
- "kasan_status", &test_status);
+
+ /*
+ * Because we want to be able to build the test as a module, we need to
+ * iterate through all known tracepoints, since the static registration
+ * won't work here.
+ */
+ for_each_kernel_tracepoint(register_tracepoints, NULL);
return 0;
}
-static void kasan_test_exit(struct kunit *test)
+static void kasan_suite_exit(struct kunit_suite *suite)
{
kasan_restore_multi_shot(multishot);
- KUNIT_EXPECT_FALSE(test, test_status.report_found);
+ for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ tracepoint_synchronize_unregister();
+}
+
+static void kasan_test_exit(struct kunit *test)
+{
+ KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found));
}
/**
@@ -106,11 +143,12 @@ static void kasan_test_exit(struct kunit *test)
if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
kasan_sync_fault_possible()) { \
if (READ_ONCE(test_status.report_found) && \
- READ_ONCE(test_status.sync_fault)) \
+ !READ_ONCE(test_status.async_fault)) \
kasan_enable_tagging(); \
migrate_enable(); \
} \
WRITE_ONCE(test_status.report_found, false); \
+ WRITE_ONCE(test_status.async_fault, false); \
} while (0)
#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \
@@ -1103,6 +1141,67 @@ static void kmalloc_double_kzfree(struct kunit *test)
KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr));
}
+/*
+ * The two tests below check that Generic KASAN prints auxiliary stack traces
+ * for RCU callbacks and workqueues. The reports need to be inspected manually.
+ *
+ * These tests are still enabled for other KASAN modes to make sure that all
+ * modes report bad accesses in tested scenarios.
+ */
+
+/*
+ * Object used by the rcu_uaf test.
+ *
+ * i:   payload field that rcu_uaf_reclaim() reads after freeing the object.
+ * rcu: RCU head passed to call_rcu() by rcu_uaf().
+ *
+ * global_rcu_ptr holds the object allocated by rcu_uaf().
+ */
+static struct kasan_rcu_info {
+ int i;
+ struct rcu_head rcu;
+} *global_rcu_ptr;
+
+/*
+ * RCU callback queued by rcu_uaf(): frees the enclosing kasan_rcu_info and
+ * then deliberately reads its 'i' field, which is the use-after-free the
+ * test expects KASAN to report.
+ *
+ * @rp: the rcu_head embedded in the kasan_rcu_info being reclaimed.
+ */
+static void rcu_uaf_reclaim(struct rcu_head *rp)
+{
+ struct kasan_rcu_info *fp =
+ container_of(rp, struct kasan_rcu_info, rcu);
+
+ kfree(fp);
+ /* volatile cast keeps the otherwise unused read from being dropped. */
+ ((volatile struct kasan_rcu_info *)fp)->i;
+}
+
+/*
+ * Test case: use-after-free inside an RCU callback.
+ *
+ * Allocates a kasan_rcu_info, publishes it through global_rcu_ptr, queues
+ * rcu_uaf_reclaim() on it with call_rcu() and calls rcu_barrier(), all inside
+ * KUNIT_EXPECT_KASAN_FAIL() so that a KASAN report is expected.
+ */
+static void rcu_uaf(struct kunit *test)
+{
+ struct kasan_rcu_info *ptr;
+
+ ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ /* NOTE(review): the __rcu cast presumably silences sparse address-space warnings - confirm. */
+ global_rcu_ptr = rcu_dereference_protected(
+ (struct kasan_rcu_info __rcu *)ptr, NULL);
+
+ /* NOTE(review): rcu_barrier() presumably waits for the queued callback to finish inside the expectation window - confirm. */
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim);
+ rcu_barrier());
+}
+
+/*
+ * Work handler queued by workqueue_uaf(): frees its own work item, so the
+ * later read of that item in workqueue_uaf() touches freed memory.
+ *
+ * @work: the kmalloc()'ed work item being executed.
+ */
+static void workqueue_uaf_work(struct work_struct *work)
+{
+ kfree(work);
+}
+
+/*
+ * Test case: use-after-free of a work item freed by its own handler.
+ *
+ * Creates a workqueue, queues a kmalloc()'ed work item whose handler
+ * (workqueue_uaf_work()) frees it, destroys the workqueue, and then reads
+ * work->data inside KUNIT_EXPECT_KASAN_FAIL().
+ */
+static void workqueue_uaf(struct kunit *test)
+{
+ struct workqueue_struct *workqueue;
+ struct work_struct *work;
+
+ workqueue = create_workqueue("kasan_workqueue_test");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, workqueue);
+
+ work = kmalloc(sizeof(struct work_struct), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, work);
+
+ INIT_WORK(work, workqueue_uaf_work);
+ queue_work(workqueue, work);
+ /* NOTE(review): presumably drains pending work, so the handler has already freed 'work' - confirm. */
+ destroy_workqueue(workqueue);
+
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ((volatile struct work_struct *)work)->data);
+}
+
static void vmalloc_helpers_tags(struct kunit *test)
{
void *ptr;
@@ -1434,6 +1533,8 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kasan_bitops_generic),
KUNIT_CASE(kasan_bitops_tags),
KUNIT_CASE(kmalloc_double_kzfree),
+ KUNIT_CASE(rcu_uaf),
+ KUNIT_CASE(workqueue_uaf),
KUNIT_CASE(vmalloc_helpers_tags),
KUNIT_CASE(vmalloc_oob),
KUNIT_CASE(vmap_tags),
@@ -1447,9 +1548,10 @@ static struct kunit_case kasan_kunit_test_cases[] = {
static struct kunit_suite kasan_kunit_test_suite = {
.name = "kasan",
- .init = kasan_test_init,
.test_cases = kasan_kunit_test_cases,
.exit = kasan_test_exit,
+ .suite_init = kasan_suite_init,
+ .suite_exit = kasan_suite_exit,
};
kunit_test_suite(kasan_kunit_test_suite);
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
index e4ca82dc2c16..7be7bed456ef 100644
--- a/mm/kasan/kasan_test_module.c
+++ b/mm/kasan/kasan_test_module.c
@@ -62,64 +62,6 @@ static noinline void __init copy_user_test(void)
kfree(kmem);
}
-static struct kasan_rcu_info {
- int i;
- struct rcu_head rcu;
-} *global_rcu_ptr;
-
-static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp)
-{
- struct kasan_rcu_info *fp = container_of(rp,
- struct kasan_rcu_info, rcu);
-
- kfree(fp);
- ((volatile struct kasan_rcu_info *)fp)->i;
-}
-
-static noinline void __init kasan_rcu_uaf(void)
-{
- struct kasan_rcu_info *ptr;
-
- pr_info("use-after-free in kasan_rcu_reclaim\n");
- ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
- if (!ptr) {
- pr_err("Allocation failed\n");
- return;
- }
-
- global_rcu_ptr = rcu_dereference_protected(ptr, NULL);
- call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim);
-}
-
-static noinline void __init kasan_workqueue_work(struct work_struct *work)
-{
- kfree(work);
-}
-
-static noinline void __init kasan_workqueue_uaf(void)
-{
- struct workqueue_struct *workqueue;
- struct work_struct *work;
-
- workqueue = create_workqueue("kasan_wq_test");
- if (!workqueue) {
- pr_err("Allocation failed\n");
- return;
- }
- work = kmalloc(sizeof(struct work_struct), GFP_KERNEL);
- if (!work) {
- pr_err("Allocation failed\n");
- return;
- }
-
- INIT_WORK(work, kasan_workqueue_work);
- queue_work(workqueue, work);
- destroy_workqueue(workqueue);
-
- pr_info("use-after-free on workqueue\n");
- ((volatile struct work_struct *)work)->data;
-}
-
static int __init test_kasan_module_init(void)
{
/*
@@ -130,8 +72,6 @@ static int __init test_kasan_module_init(void)
bool multishot = kasan_save_enable_multi_shot();
copy_user_test();
- kasan_rcu_uaf();
- kasan_workqueue_uaf();
kasan_restore_multi_shot(multishot);
return -EAGAIN;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index df3602062bfd..31355851a5ec 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -30,8 +30,6 @@
#include <asm/sections.h>
-#include <kunit/test.h>
-
#include "kasan.h"
#include "../slab.h"
@@ -114,41 +112,12 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
#endif
-#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-static void update_kunit_status(bool sync)
-{
- struct kunit *test;
- struct kunit_resource *resource;
- struct kunit_kasan_status *status;
-
- test = current->kunit_test;
- if (!test)
- return;
-
- resource = kunit_find_named_resource(test, "kasan_status");
- if (!resource) {
- kunit_set_failure(test);
- return;
- }
-
- status = (struct kunit_kasan_status *)resource->data;
- WRITE_ONCE(status->report_found, true);
- WRITE_ONCE(status->sync_fault, sync);
-
- kunit_put_resource(resource);
-}
-#else
-static void update_kunit_status(bool sync) { }
-#endif
-
static DEFINE_SPINLOCK(report_lock);
static void start_report(unsigned long *flags, bool sync)
{
/* Respect the /proc/sys/kernel/traceoff_on_warning interface. */
disable_trace_on_warning();
- /* Update status of the currently running KASAN test. */
- update_kunit_status(sync);
/* Do not allow LOCKDEP mangling KASAN reports. */
lockdep_off();
/* Make sure we don't end up in loop. */
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 0e3648b603a6..2fba1f51f042 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -244,7 +244,7 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
static int __init kasan_memhotplug_init(void)
{
- hotplug_memory_notifier(kasan_mem_notifier, 0);
+ hotplug_memory_notifier(kasan_mem_notifier, DEFAULT_CALLBACK_PRI);
return 0;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index c19fcca9bc03..dc15c4a2a6ff 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -39,6 +39,7 @@
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/pagewalk.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -419,47 +420,74 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+/*
+ * pmd_entry callback of break_ksm_ops: reports whether the PTE mapping @addr
+ * refers to a KSM page.
+ *
+ * Returns 0 without taking the PTE lock when the PMD is a leaf or not
+ * present. Otherwise returns 1 if the PTE is present and maps a KSM page, or
+ * is a migration entry for a KSM page, and 0 in every other case. break_ksm()
+ * uses the result to decide whether another unshare fault is needed.
+ *
+ * @next is unused: only the single PTE at @addr is examined.
+ */
+static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int ret;
+
+ if (pmd_leaf(*pmd) || !pmd_present(*pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (pte_present(*pte)) {
+ page = vm_normal_page(walk->vma, addr, *pte);
+ } else if (!pte_none(*pte)) {
+ swp_entry_t entry = pte_to_swp_entry(*pte);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (is_migration_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
+ }
+ /* PageKsm() is evaluated before the PTE lock is dropped. */
+ ret = page && PageKsm(page);
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+}
+
+/* Page-walk ops used by break_ksm(); only the PMD-level callback is set. */
+static const struct mm_walk_ops break_ksm_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+};
+
/*
- * We use break_ksm to break COW on a ksm page: it's a stripped down
- *
- * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
- * put_page(page);
+ * We use break_ksm to break COW on a ksm page by triggering unsharing,
+ * such that the ksm page will get replaced by an exclusive anonymous page.
*
- * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
* mmap of /dev/mem, where we would not want to touch it.
*
- * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
+ * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page;
vm_fault_t ret = 0;
do {
+ int ksm_page;
+
cond_resched();
- page = follow_page(vma, addr,
- FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
- if (IS_ERR_OR_NULL(page))
- break;
- if (PageKsm(page))
- ret = handle_mm_fault(vma, addr,
- FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
- NULL);
- else
- ret = VM_FAULT_WRITE;
- put_page(page);
- } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
+ ksm_page = walk_page_range_vma(vma, addr, addr + 1,
+ &break_ksm_ops, NULL);
+ if (WARN_ON_ONCE(ksm_page < 0))
+ return ksm_page;
+ if (!ksm_page)
+ return 0;
+ ret = handle_mm_fault(vma, addr,
+ FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+ NULL);
+ } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
/*
- * We must loop because handle_mm_fault() may back out if there's
- * any difficulty e.g. if pte accessed bit gets updated concurrently.
- *
- * VM_FAULT_WRITE is what we have been hoping for: it indicates that
- * COW has been broken, even if the vma does not permit VM_WRITE;
- * but note that a concurrent fault might break PageKsm for us.
+ * We must loop until we no longer find a KSM page because
+ * handle_mm_fault() may back out if there's any difficulty e.g. if
+ * pte accessed bit gets updated concurrently.
*
* VM_FAULT_SIGBUS could occur if we race with truncation of the
* backing file, which also invalidates anonymous pages: that's
@@ -3211,7 +3239,7 @@ static int __init ksm_init(void)
#ifdef CONFIG_MEMORY_HOTREMOVE
/* There is no significance to this priority 100 */
- hotplug_memory_notifier(ksm_memory_callback, 100);
+ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
#endif
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d8549ae1b30..fd707dcd6d04 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1219,7 +1219,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
* cgroup root (root_mem_cgroup). So we have to handle
* dead_memcg from cgroup root separately.
*/
- if (last != root_mem_cgroup)
+ if (!mem_cgroup_is_root(last))
__invalidate_reclaim_iterators(root_mem_cgroup,
dead_memcg);
}
@@ -1243,7 +1243,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct mem_cgroup *iter;
int ret = 0;
- BUG_ON(memcg == root_mem_cgroup);
+ BUG_ON(mem_cgroup_is_root(memcg));
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1272,7 +1272,7 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
memcg = folio_memcg(folio);
if (!memcg)
- VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
+ VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
else
VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
@@ -2036,7 +2036,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
rcu_read_lock();
memcg = mem_cgroup_from_task(victim);
- if (memcg == root_mem_cgroup)
+ if (mem_cgroup_is_root(memcg))
goto out;
/*
@@ -2995,7 +2995,7 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg = NULL;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
objcg = rcu_dereference(memcg->objcg);
if (objcg && obj_cgroup_tryget(objcg))
break;
@@ -5648,15 +5648,21 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
+ unsigned long index;
+ struct folio *folio;
+
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!(mc.flags & MOVE_FILE))
return NULL;
- /* page is moved even if it's not RSS of this task(page-faulted). */
+ /* folio is moved even if it's not RSS of this task(page-faulted). */
/* shmem/tmpfs may report page out on swap: account for that too. */
- return find_get_incore_page(vma->vm_file->f_mapping,
- linear_page_index(vma, addr));
+ index = linear_page_index(vma, addr);
+ folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
+ if (!folio)
+ return NULL;
+ return folio_file_page(folio, index);
}
/**
@@ -7163,7 +7169,7 @@ void mem_cgroup_sk_alloc(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
- if (memcg == root_mem_cgroup)
+ if (mem_cgroup_is_root(memcg))
goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
@@ -7298,7 +7304,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
* The root cgroup cannot be destroyed, so it's refcount must
* always be >= 1.
*/
- if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
+ if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
VM_BUG_ON(1);
break;
}
@@ -7462,7 +7468,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
if (mem_cgroup_disabled() || do_memsw_account())
return nr_swap_pages;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
READ_ONCE(memcg->swap.max) -
page_counter_read(&memcg->swap));
@@ -7484,7 +7490,7 @@ bool mem_cgroup_swap_full(struct folio *folio)
if (!memcg)
return false;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
unsigned long usage = page_counter_read(&memcg->swap);
if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
@@ -7648,7 +7654,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
return true;
original_memcg = get_mem_cgroup_from_objcg(objcg);
- for (memcg = original_memcg; memcg != root_mem_cgroup;
+ for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
memcg = parent_mem_cgroup(memcg)) {
unsigned long max = READ_ONCE(memcg->zswap_max);
unsigned long pages;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..779a426d2cab 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -74,6 +74,19 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
static bool hw_memory_failure __read_mostly = false;
+/*
+ * Account one newly poisoned page: bumps the global num_poisoned_pages
+ * counter and passes @pfn to memblk_nr_poison_inc().
+ * NOTE(review): memblk_nr_poison_inc() is presumably per-memory-block
+ * accounting keyed by @pfn - confirm against its definition.
+ */
+inline void num_poisoned_pages_inc(unsigned long pfn)
+{
+ atomic_long_inc(&num_poisoned_pages);
+ memblk_nr_poison_inc(pfn);
+}
+
+/*
+ * Remove @i pages from the poisoned-page accounting: subtracts @i from the
+ * global num_poisoned_pages counter and, unless @pfn is -1UL, also calls
+ * memblk_nr_poison_sub(@pfn, @i).
+ * NOTE(review): -1UL looks like a "no specific pfn" sentinel that limits the
+ * update to the global counter - confirm against callers.
+ */
+inline void num_poisoned_pages_sub(unsigned long pfn, long i)
+{
+ atomic_long_sub(i, &num_poisoned_pages);
+ if (pfn != -1UL)
+ memblk_nr_poison_sub(pfn, i);
+}
+
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
@@ -115,7 +128,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
if (release)
put_page(page);
page_ref_inc(page);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(page_to_pfn(page));
return true;
}
@@ -1080,6 +1093,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1087,6 +1101,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1104,7 +1120,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
@@ -1179,14 +1195,16 @@ static struct page_state error_states[] = {
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
*/
-static void action_result(unsigned long pfn, enum mf_action_page_type type,
- enum mf_result result)
+static int action_result(unsigned long pfn, enum mf_action_page_type type,
+ enum mf_result result)
{
trace_memory_failure_event(pfn, type, result);
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(pfn);
pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
+
+ return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
static int page_action(struct page_state *ps, struct page *p,
@@ -1197,14 +1215,12 @@ static int page_action(struct page_state *ps, struct page *p,
/* page p should be unlocked after returning from ps->action(). */
result = ps->action(ps, p);
- action_result(pfn, ps->type, result);
-
/* Could do more checks here if page looks ok */
/*
* Could adjust zone counters here to correct for the missing page.
*/
- return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
+ return action_result(pfn, ps->type, result);
}
static inline bool PageHWPoisonTakenOff(struct page *page)
@@ -1244,7 +1260,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
int ret = 0;
bool hugetlb = false;
- ret = get_hwpoison_huge_page(head, &hugetlb);
+ ret = get_hwpoison_huge_page(head, &hugetlb, false);
if (hugetlb)
return ret;
@@ -1334,7 +1350,7 @@ static int __get_unpoison_page(struct page *page)
int ret = 0;
bool hugetlb = false;
- ret = get_hwpoison_huge_page(head, &hugetlb);
+ ret = get_hwpoison_huge_page(head, &hugetlb, true);
if (hugetlb)
return ret;
@@ -1696,6 +1712,8 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
if (move_flag)
SetPageHWPoison(p->page);
+ else
+ num_poisoned_pages_sub(page_to_pfn(p->page), 1);
kfree(p);
count++;
}
@@ -1731,7 +1749,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
llist_add(&raw_hwp->node, head);
/* the first error event will be counted in action_result(). */
if (ret)
- num_poisoned_pages_inc();
+ num_poisoned_pages_inc(page_to_pfn(page));
} else {
/*
* Failed to save raw error info. We no longer trace all
@@ -1785,7 +1803,8 @@ void hugetlb_clear_page_hwpoison(struct page *hpage)
* -EBUSY - the hugepage is busy (try to retry)
* -EHWPOISON - the hugepage is already hwpoisoned
*/
-int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+ bool *migratable_cleared)
{
struct page *page = pfn_to_page(pfn);
struct page *head = compound_head(page);
@@ -1815,6 +1834,15 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
goto out;
}
+ /*
+ * Clearing HPageMigratable for hwpoisoned hugepages to prevent them
+ * from being migrated by memory hotremove.
+ */
+ if (count_increased && HPageMigratable(head)) {
+ ClearHPageMigratable(head);
+ *migratable_cleared = true;
+ }
+
return ret;
out:
if (count_increased)
@@ -1834,10 +1862,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
struct page *p = pfn_to_page(pfn);
struct page *head;
unsigned long page_flags;
+ bool migratable_cleared = false;
*hugetlb = 1;
retry:
- res = get_huge_page_for_hwpoison(pfn, flags);
+ res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
if (res == 2) { /* fallback to normal page handling */
*hugetlb = 0;
return 0;
@@ -1853,8 +1882,7 @@ retry:
flags |= MF_NO_RETRY;
goto retry;
}
- action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
- return res;
+ return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
}
head = compound_head(p);
@@ -1862,6 +1890,8 @@ retry:
if (hwpoison_filter(p)) {
hugetlb_clear_page_hwpoison(head);
+ if (migratable_cleared)
+ SetHPageMigratable(head);
unlock_page(head);
if (res == 1)
put_page(head);
@@ -1880,22 +1910,17 @@ retry:
} else {
res = MF_FAILED;
}
- action_result(pfn, MF_MSG_FREE_HUGE, res);
- return res == MF_RECOVERED ? 0 : -EBUSY;
+ return action_result(pfn, MF_MSG_FREE_HUGE, res);
}
page_flags = head->flags;
if (!hwpoison_user_mappings(p, pfn, flags, head)) {
- action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
- res = -EBUSY;
- goto out;
+ unlock_page(head);
+ return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
}
return identify_page_state(pfn, p, page_flags);
-out:
- unlock_page(head);
- return res;
}
#else
@@ -1910,17 +1935,25 @@ static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
}
#endif /* CONFIG_HUGETLB_PAGE */
+/*
+ * Drop the extra refcount in case we come from madvise().
+ *
+ * Does nothing unless MF_COUNT_INCREASED is set in @flags. Otherwise looks
+ * up the struct page for @pfn and, if it is non-NULL, releases one reference
+ * with put_page().
+ */
+static void put_ref_page(unsigned long pfn, int flags)
+{
+ struct page *page;
+
+ if (!(flags & MF_COUNT_INCREASED))
+ return;
+
+ page = pfn_to_page(pfn);
+ if (page)
+ put_page(page);
+}
+
static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
struct dev_pagemap *pgmap)
{
- struct page *page = pfn_to_page(pfn);
int rc = -ENXIO;
- if (flags & MF_COUNT_INCREASED)
- /*
- * Drop the extra refcount in case we come from madvise().
- */
- put_page(page);
+ put_ref_page(pfn, flags);
/* device metadata space is not recoverable */
if (!pgmap_pfn_valid(pgmap, pfn))
@@ -2052,16 +2085,13 @@ try_again:
}
res = MF_FAILED;
}
- action_result(pfn, MF_MSG_BUDDY, res);
- res = res == MF_RECOVERED ? 0 : -EBUSY;
+ res = action_result(pfn, MF_MSG_BUDDY, res);
} else {
- action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
}
goto unlock_mutex;
} else if (res < 0) {
- action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
goto unlock_mutex;
}
}
@@ -2082,8 +2112,7 @@ try_again:
*/
SetPageHasHWPoisoned(hpage);
if (try_to_split_thp_page(p) < 0) {
- action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
goto unlock_mutex;
}
VM_BUG_ON_PAGE(!page_count(p), p);
@@ -2116,8 +2145,7 @@ try_again:
retry = false;
goto try_again;
}
- action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
goto unlock_page;
}
@@ -2157,8 +2185,7 @@ try_again:
* Abort on fail: __filemap_remove_folio() assumes unmapped page.
*/
if (!hwpoison_user_mappings(p, pfn, flags, p)) {
- action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
goto unlock_page;
}
@@ -2166,8 +2193,7 @@ try_again:
* Torn down by someone else?
*/
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
- action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
- res = -EBUSY;
+ res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
goto unlock_page;
}
@@ -2314,6 +2340,7 @@ int unpoison_memory(unsigned long pfn)
int ret = -EBUSY;
int freeit = 0;
unsigned long count = 1;
+ bool huge = false;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -2362,6 +2389,7 @@ int unpoison_memory(unsigned long pfn)
ret = get_hwpoison_page(p, MF_UNPOISON);
if (!ret) {
if (PageHuge(p)) {
+ huge = true;
count = free_raw_hwp_pages(page, false);
if (count == 0) {
ret = -EBUSY;
@@ -2377,6 +2405,7 @@ int unpoison_memory(unsigned long pfn)
pfn, &unpoison_rs);
} else {
if (PageHuge(p)) {
+ huge = true;
count = free_raw_hwp_pages(page, false);
if (count == 0) {
ret = -EBUSY;
@@ -2396,7 +2425,8 @@ int unpoison_memory(unsigned long pfn)
unlock_mutex:
mutex_unlock(&mf_mutex);
if (!ret || freeit) {
- num_poisoned_pages_sub(count);
+ if (!huge)
+ num_poisoned_pages_sub(pfn, 1);
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
page_to_pfn(p), &unpoison_rs);
}
@@ -2513,12 +2543,6 @@ static int soft_offline_in_use_page(struct page *page)
return ret;
}
-static void put_ref_page(struct page *page)
-{
- if (page)
- put_page(page);
-}
-
/**
* soft_offline_page - Soft offline a page.
* @pfn: pfn to soft-offline
@@ -2547,19 +2571,17 @@ int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
bool try_again = true;
- struct page *page, *ref_page = NULL;
-
- WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
+ struct page *page;
- if (!pfn_valid(pfn))
+ if (!pfn_valid(pfn)) {
+ WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
return -ENXIO;
- if (flags & MF_COUNT_INCREASED)
- ref_page = pfn_to_page(pfn);
+ }
/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
page = pfn_to_online_page(pfn);
if (!page) {
- put_ref_page(ref_page);
+ put_ref_page(pfn, flags);
return -EIO;
}
@@ -2567,7 +2589,7 @@ int soft_offline_page(unsigned long pfn, int flags)
if (PageHWPoison(page)) {
pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
- put_ref_page(ref_page);
+ put_ref_page(pfn, flags);
mutex_unlock(&mf_mutex);
return 0;
}
@@ -2599,26 +2621,3 @@ retry:
return ret;
}
-
-void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
- int i, total = 0;
-
- /*
- * A further optimization is to have per section refcounted
- * num_poisoned_pages. But that would need more space per memmap, so
- * for now just do a quick global check to speed up this routine in the
- * absence of bad pages.
- */
- if (atomic_long_read(&num_poisoned_pages) == 0)
- return;
-
- for (i = 0; i < nr_pages; i++) {
- if (PageHWPoison(&memmap[i])) {
- total++;
- ClearPageHWPoison(&memmap[i]);
- }
- }
- if (total)
- num_poisoned_pages_sub(total);
-}
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index fa8c9d07f9ce..939e200c283b 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -664,7 +664,7 @@ static int __init memory_tier_init(void)
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
- hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
+ hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
return 0;
}
subsys_initcall(memory_tier_init);
diff --git a/mm/memory.c b/mm/memory.c
index f88c351aecd4..c5599a9279b1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -162,58 +162,11 @@ static int __init init_zero_pfn(void)
}
early_initcall(init_zero_pfn);
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
+void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
- trace_rss_stat(mm, member, count);
+ trace_rss_stat(mm, member);
}
-#if defined(SPLIT_RSS_COUNTING)
-
-void sync_mm_rss(struct mm_struct *mm)
-{
- int i;
-
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- if (current->rss_stat.count[i]) {
- add_mm_counter(mm, i, current->rss_stat.count[i]);
- current->rss_stat.count[i] = 0;
- }
- }
- current->rss_stat.events = 0;
-}
-
-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
-{
- struct task_struct *task = current;
-
- if (likely(task->mm == mm))
- task->rss_stat.count[member] += val;
- else
- add_mm_counter(mm, member, val);
-}
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
-
-/* sync counter once per 64 page faults */
-#define TASK_RSS_EVENTS_THRESH (64)
-static void check_sync_rss_stat(struct task_struct *task)
-{
- if (unlikely(task != current))
- return;
- if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
- sync_mm_rss(task->mm);
-}
-#else /* SPLIT_RSS_COUNTING */
-
-#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
-#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
-
-static void check_sync_rss_stat(struct task_struct *task)
-{
-}
-
-#endif /* SPLIT_RSS_COUNTING */
-
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -1393,12 +1346,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte,
struct zap_details *details, pte_t pteval)
{
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (zap_drop_file_uffd_wp(details))
return;
pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
-#endif
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1860,7 +1811,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+ inc_mm_counter(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, vma, false);
set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
@@ -2848,10 +2799,16 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
return same;
}
-static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
- struct vm_fault *vmf)
+/*
+ * Return:
+ * 0: copy succeeded
+ * -EHWPOISON: copy failed due to hwpoison in source page
+ * -EAGAIN: copy failed (some other reason)
+ */
+static inline int __wp_page_copy_user(struct page *dst, struct page *src,
+ struct vm_fault *vmf)
{
- bool ret;
+ int ret;
void *kaddr;
void __user *uaddr;
bool locked = false;
@@ -2860,8 +2817,11 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
unsigned long addr = vmf->address;
if (likely(src)) {
- copy_user_highpage(dst, src, addr, vma);
- return true;
+ if (copy_mc_user_highpage(dst, src, addr, vma)) {
+ memory_failure_queue(page_to_pfn(src), 0);
+ return -EHWPOISON;
+ }
+ return 0;
}
/*
@@ -2888,7 +2848,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
* and update local tlb only
*/
update_mmu_tlb(vma, addr, vmf->pte);
- ret = false;
+ ret = -EAGAIN;
goto pte_unlock;
}
@@ -2913,7 +2873,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/* The PTE changed under us, update local tlb */
update_mmu_tlb(vma, addr, vmf->pte);
- ret = false;
+ ret = -EAGAIN;
goto pte_unlock;
}
@@ -2932,7 +2892,7 @@ warn:
}
}
- ret = true;
+ ret = 0;
pte_unlock:
if (locked)
@@ -3104,6 +3064,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
pte_t entry;
int page_copied = 0;
struct mmu_notifier_range range;
+ int ret;
delayacct_wpcopy_start();
@@ -3121,19 +3082,21 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (!new_page)
goto oom;
- if (!__wp_page_copy_user(new_page, old_page, vmf)) {
+ ret = __wp_page_copy_user(new_page, old_page, vmf);
+ if (ret) {
/*
* COW failed, if the fault was solved by other,
* it's fine. If not, userspace would re-fault on
* the same address and we will handle the fault
* from the second attempt.
+ * The -EHWPOISON case will not be retried.
*/
put_page(new_page);
if (old_page)
put_page(old_page);
delayacct_wpcopy_end();
- return 0;
+ return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
}
kmsan_copy_page_meta(new_page, old_page);
}
@@ -3156,12 +3119,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter_fast(mm,
- mm_counter_file(old_page));
- inc_mm_counter_fast(mm, MM_ANONPAGES);
+ dec_mm_counter(mm, mm_counter_file(old_page));
+ inc_mm_counter(mm, MM_ANONPAGES);
}
} else {
- inc_mm_counter_fast(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
@@ -3242,7 +3204,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
delayacct_wpcopy_end();
- return (page_copied && !unshare) ? VM_FAULT_WRITE : 0;
+ return 0;
oom_free_new:
put_page(new_page);
oom:
@@ -3306,14 +3268,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
return finish_mkwrite_fault(vmf);
}
wp_page_reuse(vmf);
- return VM_FAULT_WRITE;
+ return 0;
}
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
- vm_fault_t ret = VM_FAULT_WRITE;
+ vm_fault_t ret = 0;
get_page(vmf->page);
@@ -3464,7 +3426,7 @@ reuse:
return 0;
}
wp_page_reuse(vmf);
- return VM_FAULT_WRITE;
+ return 0;
} else if (unshare) {
/* No anonymous page -> nothing to do. */
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3968,8 +3930,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (should_try_to_free_swap(folio, vma, vmf->flags))
folio_free_swap(folio);
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
/*
@@ -3983,7 +3945,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (vmf->flags & FAULT_FLAG_WRITE) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
- ret |= VM_FAULT_WRITE;
}
rmap_flags |= RMAP_EXCLUSIVE;
}
@@ -4149,7 +4110,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_MISSING);
}
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address);
lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
@@ -4339,11 +4300,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
entry = pte_mkuffd_wp(pte_wrprotect(entry));
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
- inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+ inc_mm_counter(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, vma, false);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
@@ -5195,9 +5156,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
count_vm_event(PGFAULT);
count_memcg_event_mm(vma->vm_mm, PGFAULT);
- /* do counter updates before entering really critical section. */
- check_sync_rss_stat(current);
-
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
diff --git a/mm/mempool.c b/mm/mempool.c
index 96488b13a1ef..0f3107b28e6b 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -526,7 +526,7 @@ EXPORT_SYMBOL(mempool_free_slab);
*/
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
{
- size_t size = (size_t)pool_data;
+ size_t size = kmalloc_size_roundup((size_t)pool_data);
return kmalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kmalloc);
diff --git a/mm/memremap.c b/mm/memremap.c
index 421bec3a29ee..09b20a337db9 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -94,19 +94,6 @@ bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
return false;
}
-static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
-{
- const struct range *range = &pgmap->ranges[range_id];
-
- return (range->start + range_len(range)) >> PAGE_SHIFT;
-}
-
-static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
-{
- return (pfn_end(pgmap, range_id) -
- pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
-}
-
static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
struct range *range = &pgmap->ranges[range_id];
@@ -138,10 +125,6 @@ void memunmap_pages(struct dev_pagemap *pgmap)
int i;
percpu_ref_kill(&pgmap->ref);
- if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
- pgmap->type != MEMORY_DEVICE_COHERENT)
- for (i = 0; i < pgmap->nr_range; i++)
- percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
wait_for_completion(&pgmap->done);
@@ -267,9 +250,6 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), pgmap);
- if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
- pgmap->type != MEMORY_DEVICE_COHERENT)
- percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
return 0;
err_add_memory:
@@ -469,8 +449,10 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
void free_zone_device_page(struct page *page)
{
- if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
- return;
+ struct dev_pagemap *pgmap = page->pgmap;
+
+ /* wake filesystem 'break dax layouts' waiters */
+ wake_up_var(page);
mem_cgroup_uncharge(page_folio(page));
@@ -505,45 +487,65 @@ void free_zone_device_page(struct page *page)
* to clear page->mapping.
*/
page->mapping = NULL;
- page->pgmap->ops->page_free(page);
-
- if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
- page->pgmap->type != MEMORY_DEVICE_COHERENT)
- /*
- * Reset the page count to 1 to prepare for handing out the page
- * again.
- */
- set_page_count(page, 1);
- else
- put_dev_pagemap(page->pgmap);
+ if (pgmap->ops && pgmap->ops->page_free)
+ pgmap->ops->page_free(page);
+ put_dev_pagemap(page->pgmap);
}
-void zone_device_page_init(struct page *page)
+static unsigned long pgmap_offset_to_pfn(struct dev_pagemap *pgmap,
+ pgoff_t pgmap_offset)
{
- /*
- * Drivers shouldn't be allocating pages after calling
- * memunmap_pages().
- */
- WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
- set_page_count(page, 1);
- lock_page(page);
+ u64 sum = 0, offset = PFN_PHYS(pgmap_offset);
+ int i;
+
+ for (i = 0; i < pgmap->nr_range; i++) {
+ struct range *range = &pgmap->ranges[i];
+
+ if (offset >= sum && offset < (sum + range_len(range)))
+ return PHYS_PFN(range->start + offset - sum);
+ sum += range_len(range);
+ }
+
+ return -1;
}
-EXPORT_SYMBOL_GPL(zone_device_page_init);
-#ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_page_refs(struct page *page, int refs)
+/**
+ * pgmap_request_folio - activate a folio of a given order in @pgmap
+ * @pgmap: host page map of the folio to activate
+ * @pgmap_offset: page-offset into the pgmap to request
+ * @order: expected folio_order() of the folio
+ *
+ * Caller is responsible for @pgmap remaining live for the duration of
+ * this call. The order (size) of the folios in the pgmap is assumed
+ * stable before this call.
+ */
+struct folio *pgmap_request_folio(struct dev_pagemap *pgmap,
+ pgoff_t pgmap_offset, int order)
{
- if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
- return false;
+ unsigned long pfn = pgmap_offset_to_pfn(pgmap, pgmap_offset);
+ struct page *page = pfn_to_page(pfn);
+ struct folio *folio;
+ int v;
- /*
- * fsdax page refcounts are 1-based, rather than 0-based: if
- * refcount is 1, then the page is free and the refcount is
- * stable because nobody holds a reference on the page.
- */
- if (page_ref_sub_return(page, refs) == 1)
- wake_up_var(&page->_refcount);
- return true;
+ if (WARN_ON_ONCE(page->pgmap != pgmap))
+ return NULL;
+
+ if (WARN_ON_ONCE(percpu_ref_is_dying(&pgmap->ref)))
+ return NULL;
+
+ folio = page_folio(page);
+ if (WARN_ON_ONCE(folio_order(folio) != order))
+ return NULL;
+
+ v = folio_ref_inc_return(folio);
+ if (v > 1)
+ return folio;
+
+ if (WARN_ON_ONCE(!percpu_ref_tryget(&pgmap->ref))) {
+ folio_put(folio);
+ return NULL;
+ }
+
+ return folio;
}
-EXPORT_SYMBOL(__put_devmap_managed_page_refs);
-#endif /* CONFIG_FS_DAX */
+EXPORT_SYMBOL_GPL(pgmap_request_folio);
diff --git a/mm/migrate.c b/mm/migrate.c
index ac8041963443..967fc0fb1153 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1362,7 +1362,7 @@ put_anon:
put_anon_vma(anon_vma);
if (rc == MIGRATEPAGE_SUCCESS) {
- move_hugetlb_state(hpage, new_hpage, reason);
+ move_hugetlb_state(src, dst, reason);
put_new_page = NULL;
}
@@ -1520,9 +1520,22 @@ thp_subpage_migration:
if (is_thp) {
nr_thp_failed++;
/* THP NUMA faulting doesn't split THP to retry. */
- if (!nosplit && !try_split_thp(page, &thp_split_pages)) {
- nr_thp_split++;
- break;
+ if (!nosplit) {
+ int ret = try_split_thp(page, &thp_split_pages);
+
+ if (!ret) {
+ nr_thp_split++;
+ break;
+ } else if (reason == MR_LONGTERM_PIN &&
+ ret == -EAGAIN) {
+ /*
+ * Try again to split THP to mitigate
+ * the failure of longterm pinning.
+ */
+ thp_retry++;
+ nr_retry_pages += nr_subpages;
+ break;
+ }
}
} else if (!no_subpage_counting) {
nr_failed++;
@@ -1634,7 +1647,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
nid = folio_nid(folio);
if (folio_test_hugetlb(folio)) {
- struct hstate *h = page_hstate(&folio->page);
+ struct hstate *h = folio_hstate(folio);
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
diff --git a/mm/mincore.c b/mm/mincore.c
index fa200c14185f..a085a2aeabd8 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -52,7 +52,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
{
unsigned char present = 0;
- struct page *page;
+ struct folio *folio;
/*
* When tmpfs swaps out a page from a file, any process mapping that
@@ -60,10 +60,10 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- page = find_get_incore_page(mapping, index);
- if (page) {
- present = PageUptodate(page);
- put_page(page);
+ folio = filemap_get_incore_folio(mapping, index);
+ if (folio) {
+ present = folio_test_uptodate(folio);
+ folio_put(folio);
}
return present;
@@ -190,8 +190,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
unsigned long end;
int err;
- vma = find_vma(current->mm, addr);
- if (!vma || addr < vma->vm_start)
+ vma = vma_lookup(current->mm, addr);
+ if (!vma)
return -ENOMEM;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
if (!can_do_mincore(vma)) {
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 0d7b2bd2454a..c1883362e71d 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -178,16 +178,10 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block compute_batch_nb __meminitdata = {
- .notifier_call = mm_compute_batch_notifier,
- .priority = IPC_CALLBACK_PRI, /* use lowest priority */
-};
-
static int __init mm_compute_batch_init(void)
{
mm_compute_batch(sysctl_overcommit_memory);
- register_hotmemory_notifier(&compute_batch_nb);
-
+ hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
return 0;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 2def55555e05..c697771d406b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3749,13 +3749,9 @@ static int reserve_mem_notifier(struct notifier_block *nb,
return NOTIFY_OK;
}
-static struct notifier_block reserve_mem_nb = {
- .notifier_call = reserve_mem_notifier,
-};
-
static int __meminit init_reserve_notifier(void)
{
- if (register_hotmemory_notifier(&reserve_mem_nb))
+ if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI))
pr_err("Failed registering memory add/remove notifier for admin reserve\n");
return 0;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 668bfaa6ed2a..8d770855b591 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -267,7 +267,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
} else {
/* It must be an none page, or what else?.. */
WARN_ON_ONCE(!pte_none(oldpte));
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
/*
* For file-backed mem, we need to be able to
@@ -279,7 +278,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
make_pte_marker(PTE_MARKER_UFFD_WP));
pages++;
}
-#endif
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
@@ -756,8 +754,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
* If a permission is not passed to mprotect(), it must be
* cleared from the VMA.
*/
- mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
- VM_FLAGS_CLEAR;
+ mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;
new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
newflags = calc_vm_prot_bits(prot, new_vma_pkey);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 218b28ee49ed..fa15f04ff31d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4070,12 +4070,12 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
free_pages))
return true;
/*
- * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+ * Ignore watermark boosting for GFP_HIGH order-0 allocations
* when checking the min watermark. The min watermark is the
* point where boosting is ignored so that kswapd is woken up
* when below the low watermark.
*/
- if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+ if (unlikely(!order && (alloc_flags & ALLOC_HARDER) && z->watermark_boost
&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
mark = z->_watermark[WMARK_MIN];
return __zone_watermark_ok(z, order, mark, highest_zoneidx,
@@ -4834,12 +4834,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (unless __GFP_NOMEMALLOC) and ALLOC_HIGH.
*/
alloc_flags |= (__force int)
(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
- if (gfp_mask & __GFP_ATOMIC) {
+ if (gfp_mask & __GFP_HIGH) {
/*
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
@@ -5033,14 +5033,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned int zonelist_iter_cookie;
int reserve_flags;
- /*
- * We also sanity check to catch abuse of atomic reserves being used by
- * callers that are not in atomic context.
- */
- if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
- (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
- gfp_mask &= ~__GFP_ATOMIC;
-
restart:
compaction_retries = 0;
no_progress_loops = 0;
@@ -6796,6 +6788,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
{
__init_single_page(page, pfn, zone_idx, nid);
+ set_page_count(page, 0);
/*
* Mark page reserved as it will need to wait for onlining
@@ -6828,14 +6821,6 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
-
- /*
- * ZONE_DEVICE pages are released directly to the driver page allocator
- * which will set the page count to 1 when allocating the page.
- */
- if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- pgmap->type == MEMORY_DEVICE_COHERENT)
- set_page_count(page, 0);
}
/*
diff --git a/mm/page_ext.c b/mm/page_ext.c
index affe80243b6d..b2ff5c9129f4 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -513,7 +513,7 @@ void __init page_ext_init(void)
cond_resched();
}
}
- hotplug_memory_notifier(page_ext_callback, 0);
+ hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
pr_info("allocated %ld bytes of page_ext\n", total_usage);
invoke_init_callbacks();
return;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2ff3a5bebceb..7f1c9b274906 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -517,6 +517,26 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
return walk_pgd_range(start, end, &walk);
}
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = vma->vm_mm,
+ .vma = vma,
+ .private = private,
+ };
+
+ if (start >= end || !walk.mm)
+ return -EINVAL;
+ if (start < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ mmap_assert_locked(walk.mm);
+ return __walk_page_range(start, end, &walk);
+}
+
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private)
{
@@ -526,18 +546,11 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
.vma = vma,
.private = private,
};
- int err;
if (!walk.mm)
return -EINVAL;
mmap_assert_locked(walk.mm);
-
- err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
- if (err > 0)
- return 0;
- if (err < 0)
- return err;
return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 2ec925e5fa6a..3b2d18bbdc44 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -315,8 +315,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
enomem_failure:
/*
- * dst->anon_vma is dropped here otherwise its degree can be incorrectly
- * decremented in unlink_anon_vmas().
+ * dst->anon_vma is dropped here otherwise its num_active_vmas can
+ * be incorrectly decremented in unlink_anon_vmas().
* We can safely do this because callers of anon_vma_clone() don't care
* about dst->anon_vma if anon_vma_clone() failed.
*/
@@ -1801,7 +1801,7 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
return vma_is_temporary_stack(vma);
}
-static int page_not_mapped(struct folio *folio)
+static int folio_not_mapped(struct folio *folio)
{
return !folio_mapped(folio);
}
@@ -1822,7 +1822,7 @@ void try_to_unmap(struct folio *folio, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
- .done = page_not_mapped,
+ .done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
};
@@ -2150,7 +2150,7 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_migrate_one,
.arg = (void *)flags,
- .done = page_not_mapped,
+ .done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
};
@@ -2297,7 +2297,7 @@ static bool folio_make_device_exclusive(struct folio *folio,
};
struct rmap_walk_control rwc = {
.rmap_one = page_make_device_exclusive_one,
- .done = page_not_mapped,
+ .done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
.arg = &args,
};
@@ -2571,7 +2571,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
atomic_set(compound_mapcount_ptr(page), 0);
atomic_set(compound_pincount_ptr(page), 0);
-
+ ClearHPageRestoreReserve(page);
__page_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index 33071ad8b684..bc9b84602eec 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -922,21 +922,18 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
folio_batch_init(&fbatch);
index = start;
- while (index < end && find_lock_entries(mapping, index, end - 1,
+ while (index < end && find_lock_entries(mapping, &index, end - 1,
&fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
- index = indices[i];
-
if (xa_is_value(folio)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
- index, folio);
+ indices[i], folio);
continue;
}
- index += folio_nr_pages(folio) - 1;
if (!unfalloc || !folio_test_uptodate(folio))
truncate_inode_folio(mapping, folio);
@@ -945,7 +942,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
- index++;
}
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
@@ -977,7 +973,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &fbatch,
+ if (!find_get_entries(mapping, &index, end - 1, &fbatch,
indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
@@ -989,13 +985,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
- index = indices[i];
if (xa_is_value(folio)) {
if (unfalloc)
continue;
- if (shmem_free_swap(mapping, index, folio)) {
+ if (shmem_free_swap(mapping, indices[i], folio)) {
/* Swap was replaced by page: retry */
- index--;
+ index = indices[i];
break;
}
nr_swaps_freed++;
@@ -1008,19 +1003,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (folio_mapping(folio) != mapping) {
/* Page was replaced by swap: retry */
folio_unlock(folio);
- index--;
+ index = indices[i];
break;
}
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
folio);
truncate_inode_folio(mapping, folio);
}
- index = folio->index + folio_nr_pages(folio) - 1;
folio_unlock(folio);
}
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
- index++;
}
spin_lock_irq(&info->lock);
@@ -1833,7 +1826,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
struct folio *folio;
- pgoff_t hindex = index;
+ pgoff_t hindex;
gfp_t huge_gfp;
int error;
int once = 0;
@@ -1871,7 +1864,6 @@ repeat:
}
if (folio) {
- hindex = folio->index;
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
if (folio_test_uptodate(folio))
@@ -3910,6 +3902,7 @@ EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
+ .open = generic_file_open,
.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
diff --git a/mm/slub.c b/mm/slub.c
index aaf62f98a5b3..a870cd9fdc64 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4763,11 +4763,6 @@ static int slab_memory_callback(struct notifier_block *self,
return ret;
}
-static struct notifier_block slab_memory_callback_nb = {
- .notifier_call = slab_memory_callback,
- .priority = SLAB_CALLBACK_PRI,
-};
-
/********************************************************************
* Basic setup of slabs
*******************************************************************/
@@ -4833,7 +4828,7 @@ void __init kmem_cache_init(void)
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
- register_hotmemory_notifier(&slab_memory_callback_nb);
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
diff --git a/mm/sparse.c b/mm/sparse.c
index e5a8a3a0edd7..2779b419ef2a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -926,8 +926,6 @@ void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
unsigned long nr_pages, unsigned long map_offset,
struct vmem_altmap *altmap)
{
- clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
- nr_pages - map_offset);
section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 955930f41d20..5063e6510963 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -43,8 +43,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>
-/* How many pages do we try to swap or page in/out together? */
+/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
+const int page_cluster_max = 31;
/* Protecting only lru_rotate.fbatch which requires disabling interrupts */
struct lru_rotate {
@@ -295,8 +296,20 @@ void folio_rotate_reclaimable(struct folio *folio)
}
}
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
+void lru_note_cost(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated)
{
+ unsigned long cost;
+
+ /*
+ * Reflect the relative cost of incurring IO and spending CPU
+ * time on rotations. This doesn't attempt to make a precise
+ * comparison, it just says: if reloads are about comparable
+ * between the LRU lists, or rotations are overwhelmingly
+ * different between them, adjust scan balance for CPU work.
+ */
+ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+
do {
unsigned long lrusize;
@@ -310,9 +323,9 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
- lruvec->file_cost += nr_pages;
+ lruvec->file_cost += cost;
else
- lruvec->anon_cost += nr_pages;
+ lruvec->anon_cost += cost;
/*
* Decay previous events
@@ -335,10 +348,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
} while ((lruvec = parent_lruvec(lruvec)));
}
-void lru_note_cost_folio(struct folio *folio)
+void lru_note_cost_refault(struct folio *folio)
{
lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
- folio_nr_pages(folio));
+ folio_nr_pages(folio), 0);
}
static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
@@ -1003,8 +1016,6 @@ void release_pages(struct page **pages, int nr)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_page(&folio->page))
- continue;
if (folio_put_testzero(folio))
free_zone_device_page(&folio->page);
continue;
diff --git a/mm/swap.h b/mm/swap.h
index cc08c459c619..f78065c8ef52 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,7 +41,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index);
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
@@ -105,9 +106,10 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
}
static inline
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index)
{
- return find_get_page(mapping, index);
+ return filemap_get_folio(mapping, index);
}
static inline bool add_to_swap(struct folio *folio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 438d0676c5be..40fe6f23e105 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -373,30 +373,28 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
}
/**
- * find_get_incore_page - Find and get a page from the page or swap caches.
+ * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
* @mapping: The address_space to search.
* @index: The page cache index.
*
- * This differs from find_get_page() in that it will also look for the
- * page in the swap cache.
+ * This differs from filemap_get_folio() in that it will also look for the
+ * folio in the swap cache.
*
- * Return: The found page or %NULL.
+ * Return: The found folio or %NULL.
*/
-struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+struct folio *filemap_get_incore_folio(struct address_space *mapping,
+ pgoff_t index)
{
swp_entry_t swp;
struct swap_info_struct *si;
- struct page *page = pagecache_get_page(mapping, index,
- FGP_ENTRY | FGP_HEAD, 0);
+ struct folio *folio = __filemap_get_folio(mapping, index, FGP_ENTRY, 0);
- if (!page)
- return page;
- if (!xa_is_value(page))
- return find_subpage(page, index);
+ if (!xa_is_value(folio))
+ goto out;
if (!shmem_mapping(mapping))
return NULL;
- swp = radix_to_swp_entry(page);
+ swp = radix_to_swp_entry(folio);
/* There might be swapin error entries in shmem mapping. */
if (non_swap_entry(swp))
return NULL;
@@ -404,9 +402,11 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
si = get_swap_device(swp);
if (!si)
return NULL;
- page = find_get_page(swap_address_space(swp), swp_offset(swp));
+ index = swp_offset(swp);
+ folio = filemap_get_folio(swap_address_space(swp), index);
put_swap_device(si);
- return page;
+out:
+ return folio;
}
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
diff --git a/mm/truncate.c b/mm/truncate.c
index c0be77e5c008..c7bfd247a651 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -361,9 +361,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
folio_batch_init(&fbatch);
index = start;
- while (index < end && find_lock_entries(mapping, index, end - 1,
+ while (index < end && find_lock_entries(mapping, &index, end - 1,
&fbatch, indices)) {
- index = indices[folio_batch_count(&fbatch) - 1] + 1;
truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
for (i = 0; i < folio_batch_count(&fbatch); i++)
truncate_cleanup_folio(fbatch.folios[i]);
@@ -401,7 +400,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = start;
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &fbatch,
+ if (!find_get_entries(mapping, &index, end - 1, &fbatch,
indices)) {
/* If all gone from start onwards, we're done */
if (index == start)
@@ -415,21 +414,18 @@ void truncate_inode_pages_range(struct address_space *mapping,
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */
- index = indices[i];
if (xa_is_value(folio))
continue;
folio_lock(folio);
- VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
folio_wait_writeback(folio);
truncate_inode_folio(mapping, folio);
folio_unlock(folio);
- index = folio_index(folio) + folio_nr_pages(folio) - 1;
}
truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
folio_batch_release(&fbatch);
- index++;
}
}
EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -510,20 +506,17 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
int i;
folio_batch_init(&fbatch);
- while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
+ while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing folio->index */
- index = indices[i];
if (xa_is_value(folio)) {
count += invalidate_exceptional_entry(mapping,
- index,
- folio);
+ indices[i], folio);
continue;
}
- index += folio_nr_pages(folio) - 1;
ret = mapping_evict_folio(mapping, folio);
folio_unlock(folio);
@@ -542,7 +535,6 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
- index++;
}
return count;
}
@@ -641,16 +633,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
folio_batch_init(&fbatch);
index = start;
- while (find_get_entries(mapping, index, end, &fbatch, indices)) {
+ while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing folio->index */
- index = indices[i];
if (xa_is_value(folio)) {
if (!invalidate_exceptional_entry2(mapping,
- index, folio))
+ indices[i], folio))
ret = -EBUSY;
continue;
}
@@ -660,13 +651,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* If folio is mapped, before taking its lock,
* zap the rest of the file in one hit.
*/
- unmap_mapping_pages(mapping, index,
- (1 + end - index), false);
+ unmap_mapping_pages(mapping, indices[i],
+ (1 + end - indices[i]), false);
did_range_unmap = 1;
}
folio_lock(folio);
- VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
if (folio->mapping != mapping) {
folio_unlock(folio);
continue;
@@ -689,7 +680,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
- index++;
}
/*
* For DAX we invalidate page tables after invalidating page cache. We
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ccaa461998f3..ca71de7c9d77 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -43,6 +43,9 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmalloc.h>
+
#include "internal.h"
#include "pgalloc-track.h"
@@ -1620,6 +1623,8 @@ retry:
size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
+ trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
@@ -1725,6 +1730,7 @@ static void purge_fragmented_blocks_allcpus(void);
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
unsigned long resched_threshold;
+ unsigned int num_purged_areas = 0;
struct list_head local_purge_list;
struct vmap_area *va, *n_va;
@@ -1736,7 +1742,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
spin_unlock(&purge_vmap_area_lock);
if (unlikely(list_empty(&local_purge_list)))
- return false;
+ goto out;
start = min(start,
list_first_entry(&local_purge_list,
@@ -1771,12 +1777,16 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
va->va_start, va->va_end);
atomic_long_sub(nr, &vmap_lazy_nr);
+ num_purged_areas++;
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
cond_resched_lock(&free_vmap_area_lock);
}
spin_unlock(&free_vmap_area_lock);
- return true;
+
+out:
+ trace_purge_vmap_area_lazy(start, end, num_purged_areas);
+ return num_purged_areas > 0;
}
/*
@@ -1811,6 +1821,8 @@ static void drain_vmap_area_work(struct work_struct *work)
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
+ unsigned long nr_lazy_max = lazy_max_pages();
+ unsigned long va_start = va->va_start;
unsigned long nr_lazy;
spin_lock(&vmap_area_lock);
@@ -1828,8 +1840,10 @@ static void free_vmap_area_noflush(struct vmap_area *va)
&purge_vmap_area_root, &purge_vmap_area_list);
spin_unlock(&purge_vmap_area_lock);
+ trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
+
/* After this point, we may free va at any time */
- if (unlikely(nr_lazy > lazy_max_pages()))
+ if (unlikely(nr_lazy > nr_lazy_max))
schedule_work(&drain_vmap_work);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04d8b88e5216..18f6497994ec 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2499,7 +2499,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout);
+ lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
mem_cgroup_uncharge_list(&folio_list);
free_unref_page_list(&folio_list);
@@ -2639,6 +2639,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
+ if (nr_rotated)
+ lru_note_cost(lruvec, file, 0, nr_rotated);
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
@@ -5844,8 +5846,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ bool proportional_reclaim;
struct blk_plug plug;
- bool scan_adjusted;
if (lru_gen_enabled()) {
lru_gen_shrink_lruvec(lruvec, sc);
@@ -5868,8 +5870,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* abort proportional reclaim if either the file or anon lru has already
* dropped to zero at the first pass.
*/
- scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
- sc->priority == DEF_PRIORITY);
+ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -5889,7 +5891,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
cond_resched();
- if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+ if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
continue;
/*
@@ -5940,8 +5942,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
-
- scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
diff --git a/mm/workingset.c b/mm/workingset.c
index ae7e984b23c6..d2d02978588c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -493,7 +493,7 @@ void workingset_refault(struct folio *folio, void *shadow)
if (workingset) {
folio_set_workingset(folio);
/* XXX: Move to lru_cache_add() when it supports new vs putback */
- lru_note_cost_folio(folio);
+ lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
out:
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 1e5e66ae5a52..4e187202e77a 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -702,6 +702,17 @@ sub find_standard_signature {
return "";
}
+our $obsolete_archives = qr{(?xi:
+ \Qfreedesktop.org/archives/dri-devel\E |
+ \Qlists.infradead.org\E |
+ \Qlkml.org\E |
+ \Qmail-archive.com\E |
+ \Qmailman.alsa-project.org/pipermail\E |
+ \Qmarc.info\E |
+ \Qozlabs.org/pipermail\E |
+ \Qspinics.net\E
+)};
+
our @typeListMisordered = (
qr{char\s+(?:un)?signed},
qr{int\s+(?:(?:un)?signed\s+)?short\s},
@@ -3324,6 +3335,12 @@ sub process {
$last_git_commit_id_linenr = $linenr if ($line =~ /\bcommit\s*$/i);
}
+# Check for mailing list archives other than lore.kernel.org
+ if ($rawline =~ m{\b$obsolete_archives}) {
+ WARN("PREFER_LORE_ARCHIVE",
+ "Use lore.kernel.org archive links when possible - see https://lore.kernel.org/lists.html\n" . $herecurr);
+ }
+
# Check for added, moved or deleted files
if (!$reported_maintainer_file && !$in_commit_log &&
($line =~ /^(?:new|deleted) file mode\s*\d+\s*$/ ||
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index ebfab2ca1702..4a06d83f2ac5 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -640,7 +640,6 @@ static const struct {
{ "__GFP_HIGHMEM", "HM" },
{ "GFP_DMA32", "D32" },
{ "__GFP_HIGH", "H" },
- { "__GFP_ATOMIC", "_A" },
{ "__GFP_IO", "I" },
{ "__GFP_FS", "F" },
{ "__GFP_NOWARN", "NWR" },
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 35082671928a..3620c9e5f7c5 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -1,7 +1,8 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * maple_tree.c: Userspace shim for maple tree test-suite
- * Copyright (c) 2018 Liam R. Howlett <Liam.Howlett@Oracle.com>
+ * maple_tree.c: Userspace testing for maple tree test-suite
+ * Copyright (c) 2018-2022 Oracle Corporation
+ * Author: Liam R. Howlett <Liam.Howlett@Oracle.com>
*/
#define CONFIG_DEBUG_MAPLE_TREE
diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index a1fa2eff8192..af490acc5348 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -8,5 +8,6 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh
TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh
TEST_PROGS += debugfs_duplicate_context_creation.sh
TEST_PROGS += sysfs.sh
+TEST_PROGS += reclaim.sh lru_sort.sh
include ../lib.mk
diff --git a/tools/testing/selftests/damon/lru_sort.sh b/tools/testing/selftests/damon/lru_sort.sh
new file mode 100644
index 000000000000..61b80197c896
--- /dev/null
+++ b/tools/testing/selftests/damon/lru_sort.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Smoke test for DAMON_LRU_SORT: toggling the module's 'enabled' parameter
+# must start and stop exactly one kdamond.
+
+# Exit status that the kselftest framework treats as SKIP.
+ksft_skip=4
+
+# Print the number of processes pgrep matches for "kdamond".
+count_kdamonds()
+{
+	pgrep kdamond | wc -l
+}
+
+if [ $EUID -ne 0 ]; then
+	echo "Run as root"
+	exit $ksft_skip
+fi
+
+damon_lru_sort_enabled="/sys/module/damon_lru_sort/parameters/enabled"
+if [ ! -f "$damon_lru_sort_enabled" ]; then
+	echo "No 'enabled' file. Maybe DAMON_LRU_SORT not built"
+	exit $ksft_skip
+fi
+
+# A kdamond that is already running would make the counts below meaningless.
+running=$(count_kdamonds)
+if [ "$running" -ne 0 ]; then
+	echo "Another kdamond is running"
+	exit $ksft_skip
+fi
+
+echo Y > "$damon_lru_sort_enabled"
+running=$(count_kdamonds)
+if [ "$running" -ne 1 ]; then
+	echo "kdamond is not turned on"
+	exit 1
+fi
+
+echo N > "$damon_lru_sort_enabled"
+running=$(count_kdamonds)
+if [ "$running" -ne 0 ]; then
+	echo "kdamond is not turned off"
+	exit 1
+fi
diff --git a/tools/testing/selftests/damon/reclaim.sh b/tools/testing/selftests/damon/reclaim.sh
new file mode 100644
index 000000000000..78dbc2334cbe
--- /dev/null
+++ b/tools/testing/selftests/damon/reclaim.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Smoke test for DAMON_RECLAIM: toggling the module's 'enabled' parameter
+# must start and stop exactly one kdamond.
+
+# Exit status that the kselftest framework treats as SKIP.
+ksft_skip=4
+
+# Print the number of processes pgrep matches for "kdamond".
+count_kdamonds()
+{
+	pgrep kdamond | wc -l
+}
+
+if [ $EUID -ne 0 ]; then
+	echo "Run as root"
+	exit $ksft_skip
+fi
+
+damon_reclaim_enabled="/sys/module/damon_reclaim/parameters/enabled"
+if [ ! -f "$damon_reclaim_enabled" ]; then
+	echo "No 'enabled' file. Maybe DAMON_RECLAIM not built"
+	exit $ksft_skip
+fi
+
+# A kdamond that is already running would make the counts below meaningless.
+running=$(count_kdamonds)
+if [ "$running" -ne 0 ]; then
+	echo "Another kdamond is running"
+	exit $ksft_skip
+fi
+
+echo Y > "$damon_reclaim_enabled"
+
+running=$(count_kdamonds)
+if [ "$running" -ne 1 ]; then
+	echo "kdamond is not turned on"
+	exit 1
+fi
+
+echo N > "$damon_reclaim_enabled"
+running=$(count_kdamonds)
+if [ "$running" -ne 0 ]; then
+	echo "kdamond is not turned off"
+	exit 1
+fi
diff --git a/tools/testing/selftests/proc/proc-uptime-002.c b/tools/testing/selftests/proc/proc-uptime-002.c
index e7ceabed7f51..7d0aa22bdc12 100644
--- a/tools/testing/selftests/proc/proc-uptime-002.c
+++ b/tools/testing/selftests/proc/proc-uptime-002.c
@@ -17,6 +17,7 @@
// while shifting across CPUs.
#undef NDEBUG
#include <assert.h>
+#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <stdlib.h>
@@ -54,7 +55,7 @@ int main(void)
len += sizeof(unsigned long);
free(m);
m = malloc(len);
- } while (sys_sched_getaffinity(0, len, m) == -EINVAL);
+ } while (sys_sched_getaffinity(0, len, m) == -1 && errno == EINVAL);
fd = open("/proc/uptime", O_RDONLY);
assert(fd >= 0);
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
index 7b9dc2426f18..8a536c731e3c 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
+anon_cow
hugepage-mmap
hugepage-mremap
hugepage-shm
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 163c2fde3cb3..0986bd60c19f 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -1,7 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
# Makefile for vm selftests
-LOCAL_HDRS += $(top_srcdir)/mm/gup_test.h
+LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h
+
+include local_config.mk
uname_M := $(shell uname -m 2>/dev/null || echo not)
MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
@@ -25,7 +27,8 @@ MAKEFLAGS += --no-builtin-rules
CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
LDLIBS = -lrt -lpthread
-TEST_GEN_FILES = compaction_test
+TEST_GEN_FILES = anon_cow
+TEST_GEN_FILES += compaction_test
TEST_GEN_FILES += gup_test
TEST_GEN_FILES += hmm-tests
TEST_GEN_FILES += hugetlb-madvise
@@ -52,6 +55,7 @@ TEST_GEN_FILES += userfaultfd
TEST_GEN_PROGS += soft-dirty
TEST_GEN_PROGS += split_huge_page_test
TEST_GEN_FILES += ksm_tests
+TEST_GEN_PROGS += ksm_functional_tests
ifeq ($(MACHINE),x86_64)
CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
@@ -95,7 +99,9 @@ TEST_FILES += va_128TBswitch.sh
include ../lib.mk
+$(OUTPUT)/anon_cow: vm_util.c
$(OUTPUT)/khugepaged: vm_util.c
+$(OUTPUT)/ksm_functional_tests: vm_util.c
$(OUTPUT)/madv_populate: vm_util.c
$(OUTPUT)/soft-dirty: vm_util.c
$(OUTPUT)/split_huge_page_test: vm_util.c
@@ -150,8 +156,25 @@ warn_32bit_failure:
endif
endif
+# ANON_COW_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
+$(OUTPUT)/anon_cow: LDLIBS += $(ANON_COW_EXTRA_LIBS)
+
$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
$(OUTPUT)/migration: LDLIBS += -lnuma
+
+local_config.mk local_config.h: check_config.sh
+ /bin/sh ./check_config.sh $(CC)
+
+EXTRA_CLEAN += local_config.mk local_config.h
+
+ifeq ($(ANON_COW_EXTRA_LIBS),)
+all: warn_missing_liburing
+
+warn_missing_liburing:
+ @echo ; \
+ echo "Warning: missing liburing support. Some COW tests will be skipped." ; \
+ echo
+endif
diff --git a/tools/testing/selftests/vm/anon_cow.c b/tools/testing/selftests/vm/anon_cow.c
new file mode 100644
index 000000000000..705bd0b3db11
--- /dev/null
+++ b/tools/testing/selftests/vm/anon_cow.c
@@ -0,0 +1,1126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * COW (Copy On Write) tests for anonymous memory.
+ *
+ * Copyright 2022, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <assert.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+
+#include "local_config.h"
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+#include <liburing.h>
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+#include "../../../../mm/gup_test.h"
+#include "../kselftest.h"
+#include "vm_util.h"
+
+static size_t pagesize;
+static int pagemap_fd;
+static size_t thpsize;
+static int nr_hugetlbsizes;
+static size_t hugetlbsizes[10];
+static int gup_fd;
+
+/*
+ * Probe the PMD-sized THP size from sysfs and record it in thpsize.
+ *
+ * thpsize is left untouched when the sysfs file cannot be opened, when the
+ * read does not yield a short, terminable string, or when the parsed value
+ * is zero or smaller than the base page size.
+ */
+static void detect_thpsize(void)
+{
+	char text[15];
+	size_t detected;
+	int nread;
+	int fd;
+
+	fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size",
+		  O_RDONLY);
+	if (fd < 0)
+		return;
+
+	nread = pread(fd, text, sizeof(text), 0);
+	/* Need at least one byte, and room left for the terminating NUL. */
+	if (nread <= 0 || nread >= sizeof(text))
+		goto out_close;
+
+	text[nread] = 0;
+	detected = strtoul(text, NULL, 10);
+	if (detected > 0 && detected >= pagesize) {
+		thpsize = detected;
+		ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
+			       thpsize / 1024);
+	}
+
+out_close:
+	close(fd);
+}
+
+/*
+ * Scan /sys/kernel/mm/hugepages/ for "hugepages-<N>kB" directories and
+ * append each size (in bytes) to hugetlbsizes[], stopping once the array
+ * is full or the directory is exhausted.
+ */
+static void detect_hugetlbsizes(void)
+{
+	struct dirent *dent;
+	size_t size_kb;
+	DIR *hugepages_dir;
+
+	hugepages_dir = opendir("/sys/kernel/mm/hugepages/");
+	if (!hugepages_dir)
+		return;
+
+	while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes) &&
+	       (dent = readdir(hugepages_dir)) != NULL) {
+		/* Only directories whose name parses as a size are recorded. */
+		if (dent->d_type == DT_DIR &&
+		    sscanf(dent->d_name, "hugepages-%zukB", &size_kb) == 1) {
+			hugetlbsizes[nr_hugetlbsizes++] = size_kb * 1024;
+			ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n",
+				       size_kb);
+		}
+	}
+	closedir(hugepages_dir);
+}
+
+/*
+ * Return true only if pagemap_is_swapped() reports every page of the range
+ * as swapped. The range is walked in pagesize steps starting at @addr.
+ */
+static bool range_is_swapped(void *addr, size_t size)
+{
+	char *cur = addr;
+	size_t remaining = size;
+
+	while (remaining) {
+		if (!pagemap_is_swapped(pagemap_fd, cur))
+			return false;
+		cur += pagesize;
+		remaining -= pagesize;
+	}
+	return true;
+}
+
+/*
+ * Pair of pipes used to synchronize a parent with its forked child.
+ * Each array holds the two descriptors filled in by pipe().
+ */
+struct comm_pipes {
+ /* The child writes one byte here; the parent reads it. */
+ int child_ready[2];
+ /* The parent writes one byte here; the child reads it. */
+ int parent_ready[2];
+};
+
+/*
+ * Create both synchronization pipes of @comm_pipes.
+ *
+ * Returns 0 on success or the negated errno of the failing pipe() call.
+ * When the second pipe() fails, the descriptors of the first pipe are
+ * closed again before returning.
+ */
+static int setup_comm_pipes(struct comm_pipes *comm_pipes)
+{
+	int err;
+
+	if (pipe(comm_pipes->child_ready) < 0)
+		return -errno;
+	if (pipe(comm_pipes->parent_ready) < 0) {
+		/* Save errno first: close() is allowed to overwrite it. */
+		err = errno;
+		close(comm_pipes->child_ready[0]);
+		close(comm_pipes->child_ready[1]);
+		return -err;
+	}
+
+	return 0;
+}
+
+/* Close all four descriptors held in @comm_pipes. */
+static void close_comm_pipes(struct comm_pipes *comm_pipes)
+{
+	int *const pipes[] = {
+		comm_pipes->child_ready,
+		comm_pipes->parent_ready,
+	};
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		close(pipes[i][0]);
+		close(pipes[i][1]);
+	}
+}
+
+/*
+ * Child callback: snapshot @mem, tell the parent we are ready, wait until
+ * the parent signals that it modified its copy, then check that our view
+ * of @mem still matches the snapshot.
+ *
+ * Returns 0 if the content is unchanged, 1 if it differs, or -ENOMEM if
+ * the snapshot buffer could not be allocated.
+ */
+static int child_memcmp_fn(char *mem, size_t size,
+			   struct comm_pipes *comm_pipes)
+{
+	char *old = malloc(size);
+	char buf;
+
+	/* Backup the original content. */
+	if (old)
+		memcpy(old, mem, size);
+
+	/*
+	 * Wait until the parent modified the page. The handshake is done even
+	 * when the allocation failed: the parent blocks until it receives the
+	 * "ready" byte, so skipping it would hang the parent.
+	 */
+	write(comm_pipes->child_ready[1], "0", 1);
+	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+		;
+
+	if (!old)
+		return -ENOMEM;
+
+	/*
+	 * See if we still read the old values. Collapse the result to 0/1:
+	 * only the low 8 bits of an exit status reach the parent, and
+	 * memcmp() merely guarantees the sign of its result.
+	 */
+	return memcmp(old, mem, size) ? 1 : 0;
+}
+
+/*
+ * Child callback: snapshot @mem, pin it read-only by vmsplice()ing it into
+ * a private pipe, unmap it, wait until the parent signals that it modified
+ * its copy, then read the data back from the pipe and compare it with the
+ * snapshot.
+ *
+ * Returns 0 if the data read from the pipe matches the snapshot, 1 if it
+ * differs, or a negative errno value if a setup step or read() failed.
+ */
+static int child_vmsplice_memcmp_fn(char *mem, size_t size,
+				    struct comm_pipes *comm_pipes)
+{
+	struct iovec iov = {
+		.iov_base = mem,
+		.iov_len = size,
+	};
+	ssize_t cur, total, transferred = 0;
+	char *old, *new;
+	int ret = 0;
+	int fds[2];
+	char buf;
+
+	old = malloc(size);
+	new = malloc(size);
+	if (!old || !new) {
+		ret = -ENOMEM;
+		goto sync;
+	}
+
+	/* Backup the original content. */
+	memcpy(old, mem, size);
+
+	if (pipe(fds) < 0) {
+		ret = -errno;
+		goto sync;
+	}
+
+	/* Trigger a read-only pin. */
+	transferred = vmsplice(fds[1], &iov, 1, 0);
+	if (transferred < 0) {
+		ret = -errno;
+		goto sync;
+	}
+	if (transferred == 0) {
+		ret = -EINVAL;
+		goto sync;
+	}
+
+	/* Unmap it from our page tables. */
+	if (munmap(mem, size) < 0)
+		ret = -errno;
+
+sync:
+	/*
+	 * Wait until the parent modified it. The handshake is done even after
+	 * a setup failure: the parent blocks until it receives the "ready"
+	 * byte, so returning without it would hang the parent.
+	 */
+	write(comm_pipes->child_ready[1], "0", 1);
+	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+		;
+
+	if (ret)
+		return ret;
+
+	/* See if we still read the old values via the pipe. */
+	for (total = 0; total < transferred; total += cur) {
+		cur = read(fds[0], new + total, transferred - total);
+		if (cur < 0)
+			return -errno;
+	}
+
+	/*
+	 * Collapse the result to 0/1: only the low 8 bits of an exit status
+	 * reach the parent, and memcmp() merely guarantees the sign.
+	 */
+	return memcmp(old, new, transferred) ? 1 : 0;
+}
+
+/*
+ * Callback run in the forked child by do_test_cow_in_parent(); its return
+ * value is passed to exit() and so becomes the child's exit status.
+ */
+typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
+
+/*
+ * Fork a child that runs @fn on the range, overwrite the range with 0xff
+ * in the parent once the child reported ready, and pass the test only if
+ * the child exits normally with status 0.
+ */
+static void do_test_cow_in_parent(char *mem, size_t size, child_fn fn)
+{
+	struct comm_pipes comm_pipes;
+	int status;
+	char token;
+	pid_t pid;
+
+	if (setup_comm_pipes(&comm_pipes)) {
+		ksft_test_result_fail("pipe() failed\n");
+		return;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		ksft_test_result_fail("fork() failed\n");
+		goto out_close;
+	}
+	if (pid == 0)
+		exit(fn(mem, size, &comm_pipes));
+
+	/* Block until the child reports that it is ready. */
+	while (read(comm_pipes.child_ready[0], &token, 1) != 1)
+		;
+
+	/* Modify the page, then let the child continue. */
+	memset(mem, 0xff, size);
+	write(comm_pipes.parent_ready[1], "0", 1);
+
+	wait(&status);
+	/* Anything other than a normal exit with status 0 counts as a leak. */
+	ksft_test_result(WIFEXITED(status) && !WEXITSTATUS(status),
+			 "No leak from parent into child\n");
+out_close:
+	close_comm_pipes(&comm_pipes);
+}
+
+/*
+ * Parent modifies the range after fork(); the child compares its own view
+ * of the memory against a snapshot (child_memcmp_fn).
+ */
+static void test_cow_in_parent(char *mem, size_t size)
+{
+ do_test_cow_in_parent(mem, size, child_memcmp_fn);
+}
+
+/*
+ * Parent modifies the range after fork(); the child vmsplice()s the range
+ * into a pipe, unmaps it, and compares the data read back from the pipe
+ * against a snapshot (child_vmsplice_memcmp_fn).
+ */
+static void test_vmsplice_in_child(char *mem, size_t size)
+{
+ do_test_cow_in_parent(mem, size, child_vmsplice_memcmp_fn);
+}
+
+/*
+ * vmsplice() the range into a pipe in the parent, either before or after
+ * fork(), unmap it, let the child overwrite its copy with 0xff, and verify
+ * that the data the parent reads back from the pipe still matches the
+ * snapshot taken at the start.
+ *
+ * @mem, @size: range under test; it is munmap()ed in the parent here.
+ * @before_fork: call vmsplice() before fork() instead of after it.
+ */
+static void do_test_vmsplice_in_parent(char *mem, size_t size,
+				       bool before_fork)
+{
+	struct iovec iov = {
+		.iov_base = mem,
+		.iov_len = size,
+	};
+	ssize_t cur, total, transferred = 0;
+	struct comm_pipes comm_pipes;
+	char *old, *new;
+	int ret, fds[2];
+	char buf;
+
+	old = malloc(size);
+	new = malloc(size);
+	if (!old || !new) {
+		ksft_test_result_fail("malloc() failed\n");
+		goto free;
+	}
+
+	/* Backup the original content. */
+	memcpy(old, mem, size);
+
+	ret = setup_comm_pipes(&comm_pipes);
+	if (ret) {
+		ksft_test_result_fail("pipe() failed\n");
+		goto free;
+	}
+
+	if (pipe(fds) < 0) {
+		ksft_test_result_fail("pipe() failed\n");
+		goto close_comm_pipes;
+	}
+
+	if (before_fork) {
+		transferred = vmsplice(fds[1], &iov, 1, 0);
+		if (transferred <= 0) {
+			ksft_test_result_fail("vmsplice() failed\n");
+			goto close_pipe;
+		}
+	}
+
+	ret = fork();
+	if (ret < 0) {
+		ksft_test_result_fail("fork() failed\n");
+		goto close_pipe;
+	} else if (!ret) {
+		write(comm_pipes.child_ready[1], "0", 1);
+		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+			;
+		/* Modify page content in the child. */
+		memset(mem, 0xff, size);
+		exit(0);
+	}
+
+	if (!before_fork) {
+		transferred = vmsplice(fds[1], &iov, 1, 0);
+		if (transferred <= 0) {
+			ksft_test_result_fail("vmsplice() failed\n");
+			/*
+			 * The child blocks until it gets the "go" byte, so
+			 * release it before reaping it.
+			 */
+			write(comm_pipes.parent_ready[1], "0", 1);
+			wait(&ret);
+			goto close_pipe;
+		}
+	}
+
+	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+		;
+	if (munmap(mem, size) < 0) {
+		ksft_test_result_fail("munmap() failed\n");
+		/* Do not leave the child behind blocked and unreaped. */
+		write(comm_pipes.parent_ready[1], "0", 1);
+		wait(&ret);
+		goto close_pipe;
+	}
+	write(comm_pipes.parent_ready[1], "0", 1);
+
+	/* Wait until the child is done writing. */
+	wait(&ret);
+	if (!WIFEXITED(ret)) {
+		ksft_test_result_fail("wait() failed\n");
+		goto close_pipe;
+	}
+
+	/* See if we still read the old values. */
+	for (total = 0; total < transferred; total += cur) {
+		cur = read(fds[0], new + total, transferred - total);
+		if (cur < 0) {
+			ksft_test_result_fail("read() failed\n");
+			goto close_pipe;
+		}
+	}
+
+	ksft_test_result(!memcmp(old, new, transferred),
+			 "No leak from child into parent\n");
+close_pipe:
+	close(fds[0]);
+	close(fds[1]);
+close_comm_pipes:
+	close_comm_pipes(&comm_pipes);
+free:
+	/* free(NULL) is a no-op, so this also covers a failed allocation. */
+	free(old);
+	free(new);
+}
+
+/* Parent-side vmsplice() test with the vmsplice() call made before fork(). */
+static void test_vmsplice_before_fork(char *mem, size_t size)
+{
+ do_test_vmsplice_in_parent(mem, size, true);
+}
+
+/* Parent-side vmsplice() test with the vmsplice() call made after fork(). */
+static void test_vmsplice_after_fork(char *mem, size_t size)
+{
+ do_test_vmsplice_in_parent(mem, size, false);
+}
+
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+/*
+ * Register the range as an io_uring fixed buffer, then either fork() a
+ * child that stays alive (@use_fork) or cycle the range through
+ * mprotect(PROT_READ) with softdirty cleared. Afterwards modify the range,
+ * write it to a temporary file through the fixed buffer, and verify that
+ * the file content matches what is now in memory.
+ *
+ * Failures of io_uring_queue_init() and io_uring_register_buffers() are
+ * reported as skips; every other failure is reported as a test failure.
+ */
+static void do_test_iouring(char *mem, size_t size, bool use_fork)
+{
+	struct comm_pipes comm_pipes;
+	struct io_uring_cqe *cqe;
+	struct io_uring_sqe *sqe;
+	struct io_uring ring;
+	ssize_t cur, total;
+	struct iovec iov;
+	/* One-byte target for the pipe handshake reads. */
+	char buf;
+	char *tmp;
+	int ret, fd;
+	FILE *file;
+
+	ret = setup_comm_pipes(&comm_pipes);
+	if (ret) {
+		ksft_test_result_fail("pipe() failed\n");
+		return;
+	}
+
+	file = tmpfile();
+	if (!file) {
+		ksft_test_result_fail("tmpfile() failed\n");
+		goto close_comm_pipes;
+	}
+	fd = fileno(file);
+	/* fileno() reports failure as -1; descriptor 0 is a valid result. */
+	assert(fd >= 0);
+
+	tmp = malloc(size);
+	if (!tmp) {
+		ksft_test_result_fail("malloc() failed\n");
+		goto close_file;
+	}
+
+	/* Skip on errors, as we might just lack kernel support. */
+	ret = io_uring_queue_init(1, &ring, 0);
+	if (ret < 0) {
+		ksft_test_result_skip("io_uring_queue_init() failed\n");
+		goto free_tmp;
+	}
+
+	/*
+	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
+	 * | FOLL_LONGTERM the range.
+	 *
+	 * Skip on errors, as we might just lack kernel support or might not
+	 * have sufficient MEMLOCK permissions.
+	 */
+	iov.iov_base = mem;
+	iov.iov_len = size;
+	ret = io_uring_register_buffers(&ring, &iov, 1);
+	if (ret) {
+		ksft_test_result_skip("io_uring_register_buffers() failed\n");
+		goto queue_exit;
+	}
+
+	if (use_fork) {
+		/*
+		 * fork() and keep the child alive until we're done. Note that
+		 * we expect the pinned page to not get shared with the child.
+		 */
+		ret = fork();
+		if (ret < 0) {
+			ksft_test_result_fail("fork() failed\n");
+			goto unregister_buffers;
+		} else if (!ret) {
+			write(comm_pipes.child_ready[1], "0", 1);
+			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+				;
+			exit(0);
+		}
+
+		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+			;
+	} else {
+		/*
+		 * Map the page R/O into the page table. Enable softdirty
+		 * tracking to stop the page from getting mapped R/W immediately
+		 * again by mprotect() optimizations. Note that we don't have an
+		 * easy way to test if that worked (the pagemap does not export
+		 * if the page is mapped R/O vs. R/W).
+		 */
+		ret = mprotect(mem, size, PROT_READ);
+		clear_softdirty();
+		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
+		if (ret) {
+			ksft_test_result_fail("mprotect() failed\n");
+			goto unregister_buffers;
+		}
+	}
+
+	/*
+	 * Modify the page and write page content as observed by the fixed
+	 * buffer pin to the file so we can verify it.
+	 */
+	memset(mem, 0xff, size);
+	sqe = io_uring_get_sqe(&ring);
+	if (!sqe) {
+		ksft_test_result_fail("io_uring_get_sqe() failed\n");
+		goto quit_child;
+	}
+	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
+
+	ret = io_uring_submit(&ring);
+	if (ret < 0) {
+		ksft_test_result_fail("io_uring_submit() failed\n");
+		goto quit_child;
+	}
+
+	ret = io_uring_wait_cqe(&ring, &cqe);
+	if (ret < 0) {
+		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
+		goto quit_child;
+	}
+
+	if (cqe->res != size) {
+		ksft_test_result_fail("write_fixed failed\n");
+		goto quit_child;
+	}
+	io_uring_cqe_seen(&ring, cqe);
+
+	/* Read back the file content to the temporary buffer. */
+	total = 0;
+	while (total < size) {
+		cur = pread(fd, tmp + total, size - total, total);
+		if (cur < 0) {
+			ksft_test_result_fail("pread() failed\n");
+			goto quit_child;
+		}
+		total += cur;
+	}
+
+	/* Finally, check if we read what we expected. */
+	ksft_test_result(!memcmp(mem, tmp, size),
+			 "Longterm R/W pin is reliable\n");
+
+quit_child:
+	if (use_fork) {
+		/* Release the child from its wait, then reap it. */
+		write(comm_pipes.parent_ready[1], "0", 1);
+		wait(&ret);
+	}
+unregister_buffers:
+	io_uring_unregister_buffers(&ring);
+queue_exit:
+	io_uring_queue_exit(&ring);
+free_tmp:
+	free(tmp);
+close_file:
+	fclose(file);
+close_comm_pipes:
+	close_comm_pipes(&comm_pipes);
+}
+
+/*
+ * io_uring fixed-buffer test without fork(): the range is cycled through
+ * mprotect(PROT_READ) after the buffer was registered.
+ */
+static void test_iouring_ro(char *mem, size_t size)
+{
+ do_test_iouring(mem, size, false);
+}
+
+/*
+ * io_uring fixed-buffer test with a fork() after the buffer was registered;
+ * the child is kept alive until the check is done.
+ */
+static void test_iouring_fork(char *mem, size_t size)
+{
+ do_test_iouring(mem, size, true);
+}
+
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+/* Scenarios that do_test_ro_pin() sets up before taking the R/O pin. */
+enum ro_pin_test {
+ /* fork() a child; it is still waiting for the parent when the pin is taken. */
+ RO_PIN_TEST_SHARED,
+ /* fork() a child, then tell it to quit and wait for it before pinning. */
+ RO_PIN_TEST_PREVIOUSLY_SHARED,
+ /* No fork(): mprotect(PROT_READ), clear_softdirty(), then back to R/W. */
+ RO_PIN_TEST_RO_EXCLUSIVE,
+};
+
+static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
+ bool fast)
+{
+ struct pin_longterm_test args;
+ struct comm_pipes comm_pipes;
+ char *tmp, buf;
+ __u64 tmp_val;
+ int ret;
+
+ if (gup_fd < 0) {
+ ksft_test_result_skip("gup_test not available\n");
+ return;
+ }
+
+ tmp = malloc(size);
+ if (!tmp) {
+ ksft_test_result_fail("malloc() failed\n");
+ return;
+ }
+
+ ret = setup_comm_pipes(&comm_pipes);
+ if (ret) {
+ ksft_test_result_fail("pipe() failed\n");
+ goto free_tmp;
+ }
+
+ switch (test) {
+ case RO_PIN_TEST_SHARED:
+ case RO_PIN_TEST_PREVIOUSLY_SHARED:
+ /*
+ * Share the pages with our child. As the pages are not pinned,
+ * this should just work.
+ */
+ ret = fork();
+ if (ret < 0) {
+ ksft_test_result_fail("fork() failed\n");
+ goto close_comm_pipes;
+ } else if (!ret) {
+ write(comm_pipes.child_ready[1], "0", 1);
+ while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+ ;
+ exit(0);
+ }
+
+ /* Wait until our child is ready. */
+ while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+ ;
+
+ if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
+ /*
+ * Tell the child to quit now and wait until it quit.
+ * The pages should now be mapped R/O into our page
+ * tables, but they are no longer shared.
+ */
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ if (!WIFEXITED(ret))
+ ksft_print_msg("[INFO] wait() failed\n");
+ }
+ break;
+ case RO_PIN_TEST_RO_EXCLUSIVE:
+ /*
+ * Map the page R/O into the page table. Enable softdirty
+ * tracking to stop the page from getting mapped R/W immediately
+ * again by mprotect() optimizations. Note that we don't have an
+ * easy way to test if that worked (the pagemap does not export
+ * if the page is mapped R/O vs. R/W).
+ */
+ ret = mprotect(mem, size, PROT_READ);
+ clear_softdirty();
+ ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
+ if (ret) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto close_comm_pipes;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ /* Take a R/O pin. This should trigger unsharing. */
+ args.addr = (__u64)mem;
+ args.size = size;
+ args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
+ ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
+ if (ret) {
+ if (errno == EINVAL)
+ ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
+ else
+ ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
+ goto wait;
+ }
+
+ /* Modify the page. */
+ memset(mem, 0xff, size);
+
+ /*
+ * Read back the content via the pin to the temporary buffer and
+ * test if we observed the modification.
+ */
+ tmp_val = (__u64)tmp;
+ ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
+ if (ret)
+ ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
+ else
+ ksft_test_result(!memcmp(mem, tmp, size),
+ "Longterm R/O pin is reliable\n");
+
+ ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
+ if (ret)
+ ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
+wait:
+ switch (test) {
+ case RO_PIN_TEST_SHARED:
+ write(comm_pipes.parent_ready[1], "0", 1);
+ wait(&ret);
+ if (!WIFEXITED(ret))
+ ksft_print_msg("[INFO] wait() failed\n");
+ break;
+ default:
+ break;
+ }
+close_comm_pipes:
+ close_comm_pipes(&comm_pipes);
+free_tmp:
+ free(tmp);
+}
+
+static void test_ro_pin_on_shared(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
+}
+
+static void test_ro_fast_pin_on_shared(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
+}
+
+static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
+}
+
+static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
+}
+
+static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
+}
+
+static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
+{
+ do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
+}
+
+typedef void (*test_fn)(char *mem, size_t size);
+
+static void do_run_with_base_page(test_fn fn, bool swapout)
+{
+ char *mem;
+ int ret;
+
+ mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return;
+ }
+
+ ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
+ /* Ignore if not around on a kernel. */
+ if (ret && errno != EINVAL) {
+ ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
+ goto munmap;
+ }
+
+ /* Populate a base page. */
+ memset(mem, 0, pagesize);
+
+ if (swapout) {
+ madvise(mem, pagesize, MADV_PAGEOUT);
+ if (!pagemap_is_swapped(pagemap_fd, mem)) {
+ ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+ goto munmap;
+ }
+ }
+
+ fn(mem, pagesize);
+munmap:
+ munmap(mem, pagesize);
+}
+
+static void run_with_base_page(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with base page\n", desc);
+ do_run_with_base_page(fn, false);
+}
+
+static void run_with_base_page_swap(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
+ do_run_with_base_page(fn, true);
+}
+
+enum thp_run {
+ THP_RUN_PMD,
+ THP_RUN_PMD_SWAPOUT,
+ THP_RUN_PTE,
+ THP_RUN_PTE_SWAPOUT,
+ THP_RUN_SINGLE_PTE,
+ THP_RUN_SINGLE_PTE_SWAPOUT,
+ THP_RUN_PARTIAL_MREMAP,
+ THP_RUN_PARTIAL_SHARED,
+};
+
+static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
+{
+ char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
+ size_t size, mmap_size, mremap_size;
+ int ret;
+
+ /* For alignment purposes, we need twice the thp size. */
+ mmap_size = 2 * thpsize;
+ mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mmap_mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return;
+ }
+
+ /* We need a THP-aligned memory area. */
+ mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+
+ ret = madvise(mem, thpsize, MADV_HUGEPAGE);
+ if (ret) {
+ ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+ goto munmap;
+ }
+
+ /*
+ * Try to populate a THP. Touch the first sub-page and test if we get
+ * another sub-page populated automatically.
+ */
+ mem[0] = 0;
+ if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
+ ksft_test_result_skip("Did not get a THP populated\n");
+ goto munmap;
+ }
+ memset(mem, 0, thpsize);
+
+ size = thpsize;
+ switch (thp_run) {
+ case THP_RUN_PMD:
+ case THP_RUN_PMD_SWAPOUT:
+ break;
+ case THP_RUN_PTE:
+ case THP_RUN_PTE_SWAPOUT:
+ /*
+ * Trigger PTE-mapping the THP by temporarily mapping a single
+ * subpage R/O.
+ */
+ ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+ if (ret) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+ ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+ if (ret) {
+ ksft_test_result_fail("mprotect() failed\n");
+ goto munmap;
+ }
+ break;
+ case THP_RUN_SINGLE_PTE:
+ case THP_RUN_SINGLE_PTE_SWAPOUT:
+ /*
+ * Discard all but a single subpage of that PTE-mapped THP. What
+ * remains is a single PTE mapping a single subpage.
+ */
+ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
+ if (ret) {
+ ksft_test_result_fail("MADV_DONTNEED failed\n");
+ goto munmap;
+ }
+ size = pagesize;
+ break;
+ case THP_RUN_PARTIAL_MREMAP:
+ /*
+ * Remap half of the THP. We need some new memory location
+ * for that.
+ */
+ mremap_size = thpsize / 2;
+ mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mremap_mem == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ goto munmap;
+ }
+ tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
+ if (tmp != mremap_mem) {
+ ksft_test_result_fail("mremap() failed\n");
+ goto munmap;
+ }
+ size = mremap_size;
+ break;
+ case THP_RUN_PARTIAL_SHARED:
+ /*
+ * Share the first page of the THP with a child and quit the
+ * child. This will result in some parts of the THP never
+ * have been shared.
+ */
+ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
+ if (ret) {
+ ksft_test_result_fail("MADV_DONTFORK failed\n");
+ goto munmap;
+ }
+ ret = fork();
+ if (ret < 0) {
+ ksft_test_result_fail("fork() failed\n");
+ goto munmap;
+ } else if (!ret) {
+ exit(0);
+ }
+ wait(&ret);
+ /* Allow for sharing all pages again. */
+ ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
+ if (ret) {
+ ksft_test_result_fail("MADV_DOFORK failed\n");
+ goto munmap;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
+ switch (thp_run) {
+ case THP_RUN_PMD_SWAPOUT:
+ case THP_RUN_PTE_SWAPOUT:
+ case THP_RUN_SINGLE_PTE_SWAPOUT:
+ madvise(mem, size, MADV_PAGEOUT);
+ if (!range_is_swapped(mem, size)) {
+ ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+ goto munmap;
+ }
+ break;
+ default:
+ break;
+ }
+
+ fn(mem, size);
+munmap:
+ munmap(mmap_mem, mmap_size);
+ if (mremap_mem != MAP_FAILED)
+ munmap(mremap_mem, mremap_size);
+}
+
+static void run_with_thp(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PMD);
+}
+
+static void run_with_thp_swap(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
+}
+
+static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PTE);
+}
+
+static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
+}
+
+static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
+}
+
+static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
+}
+
+static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
+}
+
+static void run_with_partial_shared_thp(test_fn fn, const char *desc)
+{
+ ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
+ do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
+}
+
+static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
+{
+ int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
+ char *mem, *dummy;
+
+ ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
+ hugetlbsize / 1024);
+
+ flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
+
+ mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (mem == MAP_FAILED) {
+ ksft_test_result_skip("need more free huge pages\n");
+ return;
+ }
+
+ /* Populate a huge page. */
+ memset(mem, 0, hugetlbsize);
+
+ /*
+ * We need a total of two hugetlb pages to handle COW/unsharing
+ * properly, otherwise we might get zapped by a SIGBUS.
+ */
+ dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (dummy == MAP_FAILED) {
+ ksft_test_result_skip("need more free huge pages\n");
+ goto munmap;
+ }
+ munmap(dummy, hugetlbsize);
+
+ fn(mem, hugetlbsize);
+munmap:
+ munmap(mem, hugetlbsize);
+}
+
+struct test_case {
+ const char *desc;
+ test_fn fn;
+};
+
+static const struct test_case test_cases[] = {
+ /*
+ * Basic COW tests for fork() without any GUP. If we miss to break COW,
+ * either the child can observe modifications by the parent or the
+ * other way around.
+ */
+ {
+ "Basic COW after fork()",
+ test_cow_in_parent,
+ },
+ /*
+ * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
+ * we miss to break COW, the child observes modifications by the parent.
+ * This is CVE-2020-29374 reported by Jann Horn.
+ */
+ {
+ "vmsplice() + unmap in child",
+ test_vmsplice_in_child
+ },
+ /*
+ * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
+ * fork(); modify in the child. If we miss to break COW, the parent
+ * observes modifications by the child.
+ */
+ {
+ "vmsplice() before fork(), unmap in parent after fork()",
+ test_vmsplice_before_fork,
+ },
+ /*
+ * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
+ * child. If we miss to break COW, the parent observes modifications by
+ * the child.
+ */
+ {
+ "vmsplice() + unmap in parent after fork()",
+ test_vmsplice_after_fork,
+ },
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+ /*
+ * Take a R/W longterm pin and then map the page R/O into the page
+ * table to trigger a write fault on next access. When modifying the
+ * page, the page content must be visible via the pin.
+ */
+ {
+ "R/O-mapping a page registered as iouring fixed buffer",
+ test_iouring_ro,
+ },
+ /*
+ * Take a R/W longterm pin and then fork() a child. When modifying the
+ * page, the page content must be visible via the pin. We expect the
+ * pinned page to not get shared with the child.
+ */
+ {
+ "fork() with an iouring fixed buffer",
+ test_iouring_fork,
+ },
+
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+ /*
+ * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
+ * When modifying the page via the page table, the page content change
+ * must be visible via the pin.
+ */
+ {
+ "R/O GUP pin on R/O-mapped shared page",
+ test_ro_pin_on_shared,
+ },
+ /* Same as above, but using GUP-fast. */
+ {
+ "R/O GUP-fast pin on R/O-mapped shared page",
+ test_ro_fast_pin_on_shared,
+ },
+ /*
+ * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
+ * was previously shared. When modifying the page via the page table,
+ * the page content change must be visible via the pin.
+ */
+ {
+ "R/O GUP pin on R/O-mapped previously-shared page",
+ test_ro_pin_on_ro_previously_shared,
+ },
+ /* Same as above, but using GUP-fast. */
+ {
+ "R/O GUP-fast pin on R/O-mapped previously-shared page",
+ test_ro_fast_pin_on_ro_previously_shared,
+ },
+ /*
+ * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
+ * When modifying the page via the page table, the page content change
+ * must be visible via the pin.
+ */
+ {
+ "R/O GUP pin on R/O-mapped exclusive page",
+ test_ro_pin_on_ro_exclusive,
+ },
+ /* Same as above, but using GUP-fast. */
+ {
+ "R/O GUP-fast pin on R/O-mapped exclusive page",
+ test_ro_fast_pin_on_ro_exclusive,
+ },
+};
+
+static void run_test_case(struct test_case const *test_case)
+{
+ int i;
+
+ run_with_base_page(test_case->fn, test_case->desc);
+ run_with_base_page_swap(test_case->fn, test_case->desc);
+ if (thpsize) {
+ run_with_thp(test_case->fn, test_case->desc);
+ run_with_thp_swap(test_case->fn, test_case->desc);
+ run_with_pte_mapped_thp(test_case->fn, test_case->desc);
+ run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
+ run_with_single_pte_of_thp(test_case->fn, test_case->desc);
+ run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
+ run_with_partial_mremap_thp(test_case->fn, test_case->desc);
+ run_with_partial_shared_thp(test_case->fn, test_case->desc);
+ }
+ for (i = 0; i < nr_hugetlbsizes; i++)
+ run_with_hugetlb(test_case->fn, test_case->desc,
+ hugetlbsizes[i]);
+}
+
+static void run_test_cases(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_cases); i++)
+ run_test_case(&test_cases[i]);
+}
+
+static int tests_per_test_case(void)
+{
+ int tests = 2 + nr_hugetlbsizes;
+
+ if (thpsize)
+ tests += 8;
+ return tests;
+}
+
+int main(int argc, char **argv)
+{
+ int nr_test_cases = ARRAY_SIZE(test_cases);
+ int err;
+
+ pagesize = getpagesize();
+ detect_thpsize();
+ detect_hugetlbsizes();
+
+ ksft_print_header();
+ ksft_set_plan(nr_test_cases * tests_per_test_case());
+
+ gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_fail_msg("opening pagemap failed\n");
+
+ run_test_cases();
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh
new file mode 100644
index 000000000000..9a44c6520925
--- /dev/null
+++ b/tools/testing/selftests/vm/check_config.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Probe for libraries and create header files to record the results. Both C
+# header files and Makefile include fragments are created.
+
+OUTPUT_H_FILE=local_config.h
+OUTPUT_MKFILE=local_config.mk
+
+# Validate the compiler argument before creating any temporary files.
+CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
+tmpname=$(mktemp)
+tmpfile_c=${tmpname}.c
+tmpfile_o=${tmpname}.o
+
+# liburing
+echo "#include <sys/types.h>" > $tmpfile_c
+echo "#include <liburing.h>" >> $tmpfile_c
+echo "int func(void) { return 0; }" >> $tmpfile_c
+$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
+
+if [ -f $tmpfile_o ]; then
+ echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE
+ echo "ANON_COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE
+else
+ echo "// No liburing support found" > $OUTPUT_H_FILE
+ echo "# No liburing support found, so:" > $OUTPUT_MKFILE
+ echo "ANON_COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE
+fi
+
+rm -f ${tmpname} ${tmpname}.*
diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c
index e63a0214f639..e53b5eaa8fce 100644
--- a/tools/testing/selftests/vm/hugepage-mremap.c
+++ b/tools/testing/selftests/vm/hugepage-mremap.c
@@ -22,6 +22,7 @@
#include <sys/syscall.h> /* Definition of SYS_* constants */
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
+#include <string.h>
#define DEFAULT_LENGTH_MB 10UL
#define MB_TO_BYTES(x) (x * 1024 * 1024)
@@ -108,26 +109,23 @@ static void register_region_with_uffd(char *addr, size_t len)
int main(int argc, char *argv[])
{
size_t length = 0;
+ int ret = 0, fd;
- if (argc != 2 && argc != 3) {
- printf("Usage: %s [length_in_MB] <hugetlb_file>\n", argv[0]);
+ if (argc >= 2 && !strcmp(argv[1], "-h")) {
+ printf("Usage: %s [length_in_MB]\n", argv[0]);
exit(1);
}
/* Read memory length as the first arg if valid, otherwise fallback to
* the default length.
*/
- if (argc == 3)
- length = argc > 2 ? (size_t)atoi(argv[1]) : 0UL;
+ if (argc >= 2)
+ length = (size_t)atoi(argv[1]);
+ else
+ length = DEFAULT_LENGTH_MB;
- length = length > 0 ? length : DEFAULT_LENGTH_MB;
length = MB_TO_BYTES(length);
-
- int ret = 0;
-
- /* last arg is the hugetlb file name */
- int fd = open(argv[argc-1], O_CREAT | O_RDWR, 0755);
-
+ fd = memfd_create(argv[0], MFD_HUGETLB);
if (fd < 0) {
perror("Open failed");
exit(1);
@@ -185,7 +183,6 @@ int main(int argc, char *argv[])
}
close(fd);
- unlink(argv[argc-1]);
return ret;
}
diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c
index 3c9943131881..f96435b70986 100644
--- a/tools/testing/selftests/vm/hugetlb-madvise.c
+++ b/tools/testing/selftests/vm/hugetlb-madvise.c
@@ -12,6 +12,7 @@
* directory.
*/
+#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
@@ -19,7 +20,6 @@
#define __USE_GNU
#include <fcntl.h>
-#define USAGE "USAGE: %s <hugepagefile_name>\n"
#define MIN_FREE_PAGES 20
#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */
@@ -103,11 +103,6 @@ int main(int argc, char **argv)
int fd;
int ret;
- if (argc != 2) {
- printf(USAGE, argv[0]);
- exit(1);
- }
-
huge_page_size = default_huge_page_size();
if (!huge_page_size) {
printf("Unable to determine huge page size, exiting!\n");
@@ -125,9 +120,9 @@ int main(int argc, char **argv)
exit(1);
}
- fd = open(argv[1], O_CREAT | O_RDWR, 0755);
+ fd = memfd_create(argv[0], MFD_HUGETLB);
if (fd < 0) {
- perror("Open failed");
+ perror("memfd_create() failed");
exit(1);
}
@@ -406,6 +401,5 @@ int main(int argc, char **argv)
(void)munmap(addr2, NR_HUGE_PAGES * huge_page_size);
close(fd);
- unlink(argv[1]);
return 0;
}
diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c
new file mode 100644
index 000000000000..96644be68962
--- /dev/null
+++ b/tools/testing/selftests/vm/ksm_functional_tests.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KSM functional tests
+ *
+ * Copyright 2022, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define KiB 1024u
+#define MiB (1024 * KiB)
+
+static int ksm_fd;
+static int ksm_full_scans_fd;
+static int pagemap_fd;
+static size_t pagesize;
+
+static bool range_maps_duplicates(char *addr, unsigned long size)
+{
+ unsigned long offs_a, offs_b, pfn_a, pfn_b;
+
+ /*
+ * There is no easy way to check if there are KSM pages mapped into
+ * this range. We only check that the range does not map the same PFN
+ * twice by comparing each pair of mapped pages.
+ */
+ for (offs_a = 0; offs_a < size; offs_a += pagesize) {
+ pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
+ /* Page not present or PFN not exposed by the kernel. */
+ if (pfn_a == -1ull || !pfn_a)
+ continue;
+
+ for (offs_b = offs_a + pagesize; offs_b < size;
+ offs_b += pagesize) {
+ pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b);
+ if (pfn_b == -1ull || !pfn_b)
+ continue;
+ if (pfn_a == pfn_b)
+ return true;
+ }
+ }
+ return false;
+}
+
+static long ksm_get_full_scans(void)
+{
+ char buf[10];
+ ssize_t ret;
+
+ ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
+ if (ret <= 0)
+ return -errno;
+ buf[ret] = 0;
+
+ return strtol(buf, NULL, 10);
+}
+
+static int ksm_merge(void)
+{
+ long start_scans, end_scans;
+
+ /* Wait for two full scans such that any possible merging happened. */
+ start_scans = ksm_get_full_scans();
+ if (start_scans < 0)
+ return start_scans;
+ if (write(ksm_fd, "1", 1) != 1)
+ return -errno;
+ do {
+ end_scans = ksm_get_full_scans();
+ if (end_scans < 0)
+ return end_scans;
+ } while (end_scans < start_scans + 2);
+
+ return 0;
+}
+
+static char *mmap_and_merge_range(char val, unsigned long size)
+{
+ char *map;
+
+ map = mmap(NULL, size, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANON, -1, 0);
+ if (map == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return MAP_FAILED;
+ }
+
+ /* Don't use THP. Ignore if THP are not around on a kernel. */
+ if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) {
+ ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
+ goto unmap;
+ }
+
+ /* Make sure each page contains the same values to merge them. */
+ memset(map, val, size);
+ if (madvise(map, size, MADV_MERGEABLE)) {
+ ksft_test_result_fail("MADV_MERGEABLE failed\n");
+ goto unmap;
+ }
+
+ /* Run KSM to trigger merging and wait. */
+ if (ksm_merge()) {
+ ksft_test_result_fail("Running KSM failed\n");
+ goto unmap;
+ }
+ return map;
+unmap:
+ munmap(map, size);
+ return MAP_FAILED;
+}
+
+static void test_unmerge(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size);
+ if (map == MAP_FAILED)
+ return;
+
+ if (madvise(map, size, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto unmap;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
+static void test_unmerge_discarded(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size);
+ if (map == MAP_FAILED)
+ return;
+
+ /* Discard half of all mapped pages so we have pte_none() entries. */
+ if (madvise(map, size / 2, MADV_DONTNEED)) {
+ ksft_test_result_fail("MADV_DONTNEED failed\n");
+ goto unmap;
+ }
+
+ if (madvise(map, size, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto unmap;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
+#ifdef __NR_userfaultfd
+static void test_unmerge_uffd_wp(void)
+{
+ struct uffdio_writeprotect uffd_writeprotect;
+ struct uffdio_register uffdio_register;
+ const unsigned int size = 2 * MiB;
+ struct uffdio_api uffdio_api;
+ char *map;
+ int uffd;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size);
+ if (map == MAP_FAILED)
+ return;
+
+ /* See if UFFD is around. */
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd < 0) {
+ ksft_test_result_skip("__NR_userfaultfd failed\n");
+ goto unmap;
+ }
+
+ /* See if UFFD-WP is around. */
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
+ ksft_test_result_fail("UFFDIO_API failed\n");
+ goto close_uffd;
+ }
+ if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
+ ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n");
+ goto close_uffd;
+ }
+
+ /* Register UFFD-WP, no need for an actual handler. */
+ uffdio_register.range.start = (unsigned long) map;
+ uffdio_register.range.len = size;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
+ ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n");
+ goto close_uffd;
+ }
+
+ /* Write-protect the range using UFFD-WP. */
+ uffd_writeprotect.range.start = (unsigned long) map;
+ uffd_writeprotect.range.len = size;
+ uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
+ if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
+ ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n");
+ goto close_uffd;
+ }
+
+ if (madvise(map, size, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto close_uffd;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+close_uffd:
+ close(uffd);
+unmap:
+ munmap(map, size);
+}
+#endif
+
+int main(int argc, char **argv)
+{
+ unsigned int tests = 2;
+ int err;
+
+#ifdef __NR_userfaultfd
+ tests++;
+#endif
+
+ ksft_print_header();
+ ksft_set_plan(tests);
+
+ pagesize = getpagesize();
+
+ ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
+ if (ksm_fd < 0)
+ ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n");
+ ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
+ if (ksm_full_scans_fd < 0)
+ ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n");
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
+
+ test_unmerge();
+ test_unmerge_discarded();
+#ifdef __NR_userfaultfd
+ test_unmerge_uffd_wp();
+#endif
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c
index 0d85be2350fa..f9eb4d67e0dd 100644
--- a/tools/testing/selftests/vm/ksm_tests.c
+++ b/tools/testing/selftests/vm/ksm_tests.c
@@ -40,6 +40,7 @@ enum ksm_test_name {
CHECK_KSM_NUMA_MERGE,
KSM_MERGE_TIME,
KSM_MERGE_TIME_HUGE_PAGES,
+ KSM_UNMERGE_TIME,
KSM_COW_TIME
};
@@ -108,7 +109,10 @@ static void print_help(void)
" -P evaluate merging time and speed.\n"
" For this test, the size of duplicated memory area (in MiB)\n"
" must be provided using -s option\n"
- " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
+ " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
+ " For this test, the size of duplicated memory area (in MiB)\n"
+ " must be provided using -s option\n"
+ " -D evaluate unmerging time and speed when disabling KSM.\n"
" For this test, the size of duplicated memory area (in MiB)\n"
" must be provided using -s option\n"
" -C evaluate the time required to break COW of merged pages.\n\n");
@@ -188,6 +192,16 @@ static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time,
return 0;
}
+static int ksm_unmerge_pages(void *addr, size_t size,
+ struct timespec start_time, int timeout)
+{
+ if (madvise(addr, size, MADV_UNMERGEABLE)) {
+ perror("madvise");
+ return 1;
+ }
+ return 0;
+}
+
static bool assert_ksm_pages_count(long dupl_page_count)
{
unsigned long max_page_sharing, pages_sharing, pages_shared;
@@ -560,6 +574,53 @@ err_out:
return KSFT_FAIL;
}
+static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size)
+{
+ void *map_ptr;
+ struct timespec start_time, end_time;
+ unsigned long scan_time_ns;
+
+ map_size *= MB;
+
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+ if (!map_ptr)
+ return KSFT_FAIL;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+ goto err_out;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout))
+ goto err_out;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+
+ scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ printf("Total size: %lu MiB\n", map_size / MB);
+ printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+ scan_time_ns % NSEC_PER_SEC);
+ printf("Average speed: %.3f MiB/s\n", (map_size / MB) /
+ ((double)scan_time_ns / NSEC_PER_SEC));
+
+ munmap(map_ptr, map_size);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, map_size);
+ return KSFT_FAIL;
+}
+
static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
{
void *map_ptr;
@@ -644,7 +705,7 @@ int main(int argc, char *argv[])
bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
long size_MB = 0;
- while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCH")) != -1) {
+ while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) {
switch (opt) {
case 'a':
prot = str_to_prot(optarg);
@@ -701,6 +762,9 @@ int main(int argc, char *argv[])
case 'H':
test_name = KSM_MERGE_TIME_HUGE_PAGES;
break;
+ case 'D':
+ test_name = KSM_UNMERGE_TIME;
+ break;
case 'C':
test_name = KSM_COW_TIME;
break;
@@ -762,6 +826,14 @@ int main(int argc, char *argv[])
ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
ksm_scan_limit_sec, size_MB);
break;
+ case KSM_UNMERGE_TIME:
+ if (size_MB == 0) {
+ printf("Option '-s' is required.\n");
+ return KSFT_FAIL;
+ }
+ ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, size_MB);
+ break;
case KSM_COW_TIME:
ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
page_size);
diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c
index 715a42e8e2cd..60547245e479 100644
--- a/tools/testing/selftests/vm/madv_populate.c
+++ b/tools/testing/selftests/vm/madv_populate.c
@@ -27,14 +27,6 @@
static size_t pagesize;
-static bool pagemap_is_populated(int fd, char *start)
-{
- uint64_t entry = pagemap_get_entry(fd, start);
-
- /* Present or swapped. */
- return entry & 0xc000000000000000ull;
-}
-
static void sense_support(void)
{
char *addr;
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index e780e76c26b8..af35dd3bc589 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -1,22 +1,86 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
-#please run as root
+# Please run as root
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
-mnt=./huge
exitcode=0
-#get huge pagesize and freepages from /proc/meminfo
-while read -r name size unit; do
- if [ "$name" = "HugePages_Free:" ]; then
- freepgs="$size"
+# Print the help text, including every category accepted by -t, and exit 0.
+usage() {
+ cat <<EOF
+usage: ${BASH_SOURCE[0]:-$0} [ -h | -t "<categories>"]
+ -t: specify the categories of tests to run
+ -h: display this message
+
+The default behavior is to run all tests.
+
+Alternatively, specific groups of tests can be run by passing a string
+to the -t argument containing one or more of the following categories
+separated by spaces:
+- mmap
+ tests for mmap(2)
+- gup_test
+ tests for gup using gup_test interface
+- userfaultfd
+ tests for userfaultfd(2)
+- compaction
+ a test for the patch "Allow compaction of unevictable pages"
+- mlock
+ tests for mlock(2)
+- mremap
+ tests for mremap(2)
+- hugetlb
+ hugetlb tests
+- hugevm
+ tests for very large virtual address space
+- vmalloc
+ vmalloc smoke tests
+- hmm
+ hmm smoke tests
+- madv_populate
+ test madvise(2) MADV_POPULATE_{READ,WRITE} options
+- memfd_secret
+ test memfd_secret(2)
+- process_mrelease
+ test process_mrelease(2)
+- ksm
+ ksm tests that do not require >=2 NUMA nodes
+- ksm_numa
+ ksm tests that require >=2 NUMA nodes
+- pkey
+ memory protection key tests
+- soft_dirty
+ test soft dirty page bit semantics
+- anon_cow
+ test anonymous copy-on-write semantics
+example: ./run_vmtests.sh -t "hmm mmap ksm"
+EOF
+ exit 0
+}
+
+
+while getopts "ht:" OPT; do
+ case ${OPT} in
+ "h") usage ;;
+ "t") VM_SELFTEST_ITEMS=${OPTARG} ;;
+ esac
+done
+shift $((OPTIND -1))
+
+# default behavior: run all tests
+VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default}
+
+test_selected() { # $1: category name; returns 0 when that category should run, 1 otherwise
+ if [ "$VM_SELFTEST_ITEMS" == "default" ]; then
+ # No -t argument was given (VM_SELFTEST_ITEMS kept its "default" fallback): run all tests
+ return 0
fi
- if [ "$name" = "Hugepagesize:" ]; then
- hpgsize_KB="$size"
+ # Otherwise select the test only if category $1 appears in the space-separated -t list
+ if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then # both sides padded with spaces so only whole words match
+ return 0
+ else
+ return 1
fi
-done < /proc/meminfo
+}
# Simple hugetlbfs tests have a hardcoded minimum requirement of
# huge pages totaling 256MB (262144KB) in size. The userfaultfd
@@ -28,7 +92,17 @@ hpgsize_MB=$((hpgsize_KB / 1024))
half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128))
needmem_KB=$((half_ufd_size_MB * 2 * 1024))
-#set proper nr_hugepages
+# get huge pagesize and freepages from /proc/meminfo
+while read -r name size unit; do
+ if [ "$name" = "HugePages_Free:" ]; then
+ freepgs="$size"
+ fi
+ if [ "$name" = "Hugepagesize:" ]; then
+ hpgsize_KB="$size"
+ fi
+done < /proc/meminfo
+
+# set proper nr_hugepages
if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
needpgs=$((needmem_KB / hpgsize_KB))
@@ -57,144 +131,143 @@ else
exit 1
fi
-#filter 64bit architectures
+# filter 64bit architectures
ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64"
if [ -z "$ARCH" ]; then
ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/')
fi
VADDR64=0
-echo "$ARCH64STR" | grep "$ARCH" && VADDR64=1
+echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1
# Usage: run_test [test binary] [arbitrary test arguments...]
run_test() {
- local title="running $*"
- local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
- printf "%s\n%s\n%s\n" "$sep" "$title" "$sep"
-
- "$@"
- local ret=$?
- if [ $ret -eq 0 ]; then
- echo "[PASS]"
- elif [ $ret -eq $ksft_skip ]; then
- echo "[SKIP]"
- exitcode=$ksft_skip
- else
- echo "[FAIL]"
- exitcode=1
- fi
+ if test_selected ${CATEGORY}; then # run only when the caller's CATEGORY passes test_selected
+ echo "running: $1"
+ local title="running $*"
+ local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) # title with every printable/space character turned into '-'
+ printf "%s\n%s\n%s\n" "$sep" "$title" "$sep"
+
+ "$@" # execute the test command with all of its arguments
+ local ret=$?
+ if [ $ret -eq 0 ]; then
+ echo "[PASS]"
+ elif [ $ret -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip # NOTE(review): also replaces an earlier failure code of 1 -- confirm intended
+ else
+ echo "[FAIL]"
+ exitcode=1
+ fi
+ fi # test_selected
}
-mkdir "$mnt"
-mount -t hugetlbfs none "$mnt"
-
-run_test ./hugepage-mmap
+CATEGORY="hugetlb" run_test ./hugepage-mmap
shmmax=$(cat /proc/sys/kernel/shmmax)
shmall=$(cat /proc/sys/kernel/shmall)
echo 268435456 > /proc/sys/kernel/shmmax
echo 4194304 > /proc/sys/kernel/shmall
-run_test ./hugepage-shm
+CATEGORY="hugetlb" run_test ./hugepage-shm
echo "$shmmax" > /proc/sys/kernel/shmmax
echo "$shmall" > /proc/sys/kernel/shmall
-run_test ./map_hugetlb
-
-run_test ./hugepage-mremap "$mnt"/huge_mremap
-rm -f "$mnt"/huge_mremap
-
-run_test ./hugepage-vmemmap
+CATEGORY="hugetlb" run_test ./map_hugetlb
+CATEGORY="hugetlb" run_test ./hugepage-mremap
+CATEGORY="hugetlb" run_test ./hugepage-vmemmap
+CATEGORY="hugetlb" run_test ./hugetlb-madvise
-run_test ./hugetlb-madvise "$mnt"/madvise-test
-rm -f "$mnt"/madvise-test
-
-echo "NOTE: The above hugetlb tests provide minimal coverage. Use"
-echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
-echo " hugetlb regression testing."
+if test_selected "hugetlb"; then
+ echo "NOTE: These hugetlb tests provide minimal coverage. Use"
+ echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
+ echo " hugetlb regression testing."
+fi
-run_test ./map_fixed_noreplace
+CATEGORY="mmap" run_test ./map_fixed_noreplace
# get_user_pages_fast() benchmark
-run_test ./gup_test -u
+CATEGORY="gup_test" run_test ./gup_test -u
# pin_user_pages_fast() benchmark
-run_test ./gup_test -a
+CATEGORY="gup_test" run_test ./gup_test -a
# Dump pages 0, 19, and 4096, using pin_user_pages:
-run_test ./gup_test -ct -F 0x1 0 19 0x1000
+CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000
uffd_mods=("" ":dev")
for mod in "${uffd_mods[@]}"; do
- run_test ./userfaultfd anon${mod} 20 16
+ CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16
# Hugetlb tests require source and destination huge pages. Pass in half
# the size ($half_ufd_size_MB), which is used for *each*.
- run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
- run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 "$mnt"/uffd-test
- rm -f "$mnt"/uffd-test
- run_test ./userfaultfd shmem${mod} 20 16
+ CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
+ CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32
+ CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16
done
#cleanup
-umount "$mnt"
-rm -rf "$mnt"
echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
-run_test ./compaction_test
+CATEGORY="compaction" run_test ./compaction_test
-run_test sudo -u nobody ./on-fault-limit
+CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
-run_test ./map_populate
+CATEGORY="mmap" run_test ./map_populate
-run_test ./mlock-random-test
+CATEGORY="mlock" run_test ./mlock-random-test
-run_test ./mlock2-tests
+CATEGORY="mlock" run_test ./mlock2-tests
-run_test ./mrelease_test
+CATEGORY="process_mrelease" run_test ./mrelease_test
-run_test ./mremap_test
+CATEGORY="mremap" run_test ./mremap_test
-run_test ./thuge-gen
+CATEGORY="hugetlb" run_test ./thuge-gen
if [ $VADDR64 -ne 0 ]; then
- run_test ./virtual_address_range
+ CATEGORY="hugevm" run_test ./virtual_address_range
# virtual address 128TB switch test
- run_test ./va_128TBswitch.sh
+ CATEGORY="hugevm" run_test ./va_128TBswitch.sh
fi # VADDR64
# vmalloc stability smoke test
-run_test ./test_vmalloc.sh smoke
+CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke
-run_test ./mremap_dontunmap
+CATEGORY="mremap" run_test ./mremap_dontunmap
-run_test ./test_hmm.sh smoke
+CATEGORY="hmm" run_test ./test_hmm.sh smoke
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
-run_test ./madv_populate
+CATEGORY="madv_populate" run_test ./madv_populate
-run_test ./memfd_secret
+CATEGORY="memfd_secret" run_test ./memfd_secret
# KSM MADV_MERGEABLE test with 10 identical pages
-run_test ./ksm_tests -M -p 10
+CATEGORY="ksm" run_test ./ksm_tests -M -p 10
# KSM unmerge test
-run_test ./ksm_tests -U
+CATEGORY="ksm" run_test ./ksm_tests -U
# KSM test with 10 zero pages and use_zero_pages = 0
-run_test ./ksm_tests -Z -p 10 -z 0
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0
# KSM test with 10 zero pages and use_zero_pages = 1
-run_test ./ksm_tests -Z -p 10 -z 1
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1
# KSM test with 2 NUMA nodes and merge_across_nodes = 1
-run_test ./ksm_tests -N -m 1
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1
# KSM test with 2 NUMA nodes and merge_across_nodes = 0
-run_test ./ksm_tests -N -m 0
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0
+
+# KSM functional tests; tagged "ksm" so that -t "ksm" selects them
+CATEGORY="ksm" run_test ./ksm_functional_tests
# protection_keys tests
if [ -x ./protection_keys_32 ]
then
- run_test ./protection_keys_32
+ CATEGORY="pkey" run_test ./protection_keys_32
fi
if [ -x ./protection_keys_64 ]
then
- run_test ./protection_keys_64
+ CATEGORY="pkey" run_test ./protection_keys_64
fi
-run_test ./soft-dirty
+CATEGORY="soft_dirty" run_test ./soft-dirty
+
+# COW tests for anonymous memory
+CATEGORY="anon_cow" run_test ./anon_cow
exit $exitcode
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 297f250c1d95..7f22844ed704 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -93,10 +93,8 @@ static volatile bool test_uffdio_zeropage_eexist = true;
static bool test_uffdio_wp = true;
/* Whether to test uffd minor faults */
static bool test_uffdio_minor = false;
-
static bool map_shared;
-static int shm_fd;
-static int huge_fd;
+static int mem_fd;
static unsigned long long *count_verify;
static int uffd = -1;
static int uffd_flags, finished, *pipefd;
@@ -143,7 +141,7 @@ const char *examples =
"# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
"./userfaultfd hugetlb 256 50\n\n"
"# Run the same hugetlb test but using shared file:\n"
- "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
+ "./userfaultfd hugetlb_shared 256 50\n\n"
"# 10MiB-~6GiB 999 bounces anonymous test, "
"continue forever unless an error triggers\n"
"while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
@@ -260,35 +258,21 @@ static void hugetlb_release_pages(char *rel_area)
static void hugetlb_allocate_area(void **alloc_area, bool is_src)
{
+ off_t size = nr_pages * page_size;
+ off_t offset = is_src ? 0 : size;
void *area_alias = NULL;
char **alloc_area_alias;
- if (!map_shared)
- *alloc_area = mmap(NULL,
- nr_pages * page_size,
- PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
- (is_src ? 0 : MAP_NORESERVE),
- -1,
- 0);
- else
- *alloc_area = mmap(NULL,
- nr_pages * page_size,
- PROT_READ | PROT_WRITE,
- MAP_SHARED |
- (is_src ? 0 : MAP_NORESERVE),
- huge_fd,
- is_src ? 0 : nr_pages * page_size);
+ *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+ (is_src ? 0 : MAP_NORESERVE),
+ mem_fd, offset);
if (*alloc_area == MAP_FAILED)
err("mmap of hugetlbfs file failed");
if (map_shared) {
- area_alias = mmap(NULL,
- nr_pages * page_size,
- PROT_READ | PROT_WRITE,
- MAP_SHARED,
- huge_fd,
- is_src ? 0 : nr_pages * page_size);
+ area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, mem_fd, offset);
if (area_alias == MAP_FAILED)
err("mmap of hugetlb file alias failed");
}
@@ -334,14 +318,14 @@ static void shmem_allocate_area(void **alloc_area, bool is_src)
}
*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
- shm_fd, offset);
+ mem_fd, offset);
if (*alloc_area == MAP_FAILED)
err("mmap of memfd failed");
if (test_collapse && *alloc_area != p)
err("mmap of memfd failed at %p", p);
area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
- shm_fd, offset);
+ mem_fd, offset);
if (area_alias == MAP_FAILED)
err("mmap of memfd alias failed");
if (test_collapse && area_alias != p_alias)
@@ -1841,21 +1825,17 @@ int main(int argc, char **argv)
}
nr_pages = nr_pages_per_cpu * nr_cpus;
- if (test_type == TEST_HUGETLB && map_shared) {
- if (argc < 5)
- usage();
- huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
- if (huge_fd < 0)
- err("Open of %s failed", argv[4]);
- if (ftruncate(huge_fd, 0))
- err("ftruncate %s to size 0 failed", argv[4]);
- } else if (test_type == TEST_SHMEM) {
- shm_fd = memfd_create(argv[0], 0);
- if (shm_fd < 0)
+ if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) {
+ unsigned int memfd_flags = 0;
+
+ if (test_type == TEST_HUGETLB)
+ memfd_flags = MFD_HUGETLB;
+ mem_fd = memfd_create(argv[0], memfd_flags);
+ if (mem_fd < 0)
err("memfd_create");
- if (ftruncate(shm_fd, nr_pages * page_size * 2))
+ if (ftruncate(mem_fd, nr_pages * page_size * 2))
err("ftruncate");
- if (fallocate(shm_fd,
+ if (fallocate(mem_fd,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
nr_pages * page_size * 2))
err("fallocate");
diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c
index f11f8adda521..710571902743 100644
--- a/tools/testing/selftests/vm/vm_util.c
+++ b/tools/testing/selftests/vm/vm_util.c
@@ -28,6 +28,31 @@ bool pagemap_is_softdirty(int fd, char *start)
return entry & 0x0080000000000000ull;
}
+bool pagemap_is_swapped(int fd, char *start)
+{
+ uint64_t entry = pagemap_get_entry(fd, start);
+
+ return entry & 0x4000000000000000ull; /* bit 62 of the entry: swapped */
+}
+
+bool pagemap_is_populated(int fd, char *start)
+{
+ uint64_t entry = pagemap_get_entry(fd, start);
+
+ /* Populated means present (bit 63) or swapped (bit 62). */
+ return entry & 0xc000000000000000ull;
+}
+
+unsigned long pagemap_get_pfn(int fd, char *start)
+{
+ uint64_t entry = pagemap_get_entry(fd, start);
+
+ /* If the page is present (bit 63), the PFN is in bits 0-54. */
+ if (entry & 0x8000000000000000ull)
+ return entry & 0x007fffffffffffffull;
+ return -1ull; /* not present: all-ones sentinel instead of a PFN */
+}
+
void clear_softdirty(void)
{
int ret;
diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h
index 5c35de454e08..1995ee911ef2 100644
--- a/tools/testing/selftests/vm/vm_util.h
+++ b/tools/testing/selftests/vm/vm_util.h
@@ -4,6 +4,9 @@
uint64_t pagemap_get_entry(int fd, char *start);
bool pagemap_is_softdirty(int fd, char *start);
+bool pagemap_is_swapped(int fd, char *start);
+bool pagemap_is_populated(int fd, char *start);
+unsigned long pagemap_get_pfn(int fd, char *start);
void clear_softdirty(void);
bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
uint64_t read_pmd_pagesize(void);