summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2022-12-02 14:38:54 +1100
committerStephen Rothwell <sfr@canb.auug.org.au>2022-12-02 14:38:54 +1100
commitc44c7c2acd597ad856856451bc01a3ee20bd8ca5 (patch)
treed80060447b377c64be6fb58b988d1abef34bca0e
parent2942033c43e9f27201db72b23d5149ed78afdb32 (diff)
parent124483111b85e345ef4c4d3e5e389c77949e747b (diff)
downloadlinux-c44c7c2acd597ad856856451bc01a3ee20bd8ca5.tar.gz
linux-c44c7c2acd597ad856856451bc01a3ee20bd8ca5.tar.xz
Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
# Conflicts: # mm/migrate.c
-rw-r--r--.mailmap2
-rw-r--r--arch/arm64/mm/mmu.c55
-rw-r--r--arch/loongarch/Kconfig2
-rw-r--r--arch/loongarch/include/asm/pgalloc.h13
-rw-r--r--arch/loongarch/include/asm/pgtable.h15
-rw-r--r--arch/loongarch/include/asm/sparsemem.h8
-rw-r--r--arch/loongarch/kernel/numa.c4
-rw-r--r--arch/loongarch/mm/init.c45
-rw-r--r--arch/loongarch/mm/pgtable.c23
-rw-r--r--arch/mips/include/asm/pgalloc.h10
-rw-r--r--arch/mips/include/asm/pgtable-64.h8
-rw-r--r--arch/mips/kvm/mmu.c3
-rw-r--r--arch/mips/mm/pgtable-32.c9
-rw-r--r--arch/mips/mm/pgtable-64.c18
-rw-r--r--arch/mips/mm/pgtable.c2
-rw-r--r--arch/nios2/include/asm/pgalloc.h5
-rw-r--r--arch/x86/mm/init_64.c92
-rw-r--r--drivers/rapidio/devices/rio_mport_cdev.c7
-rw-r--r--fs/dax.c220
-rw-r--r--fs/fuse/dev.c55
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hugetlbfs/inode.c28
-rw-r--r--fs/ocfs2/ocfs2.h3
-rw-r--r--fs/userfaultfd.c24
-rw-r--r--fs/xfs/xfs_ioctl.c4
-rw-r--r--fs/xfs/xfs_iomap.c6
-rw-r--r--fs/xfs/xfs_iops.c4
-rw-r--r--fs/xfs/xfs_reflink.c8
-rw-r--r--include/linux/dax.h2
-rw-r--r--include/linux/eventfd.h2
-rw-r--r--include/linux/hugetlb.h71
-rw-r--r--include/linux/io-mapping.h4
-rw-r--r--include/linux/mm.h25
-rw-r--r--include/linux/mm_types.h8
-rw-r--r--include/linux/page-flags.h2
-rw-r--r--include/linux/pagemap.h2
-rw-r--r--include/linux/pagewalk.h5
-rw-r--r--include/linux/pgtable.h4
-rw-r--r--include/linux/rmap.h4
-rw-r--r--include/linux/swap.h1
-rw-r--r--include/linux/swapops.h6
-rw-r--r--include/trace/events/huge_memory.h38
-rw-r--r--kernel/kcsan/Makefile1
-rw-r--r--kernel/locking/Makefile1
-rw-r--r--kernel/relay.c8
-rw-r--r--lib/maple_tree.c2
-rw-r--r--mm/fadvise.c2
-rw-r--r--mm/filemap.c61
-rw-r--r--mm/folio-compat.c6
-rw-r--r--mm/gup.c55
-rw-r--r--mm/huge_memory.c2
-rw-r--r--mm/hugetlb.c498
-rw-r--r--mm/kasan/kasan.h12
-rw-r--r--mm/kasan/kasan_test.c4
-rw-r--r--mm/kasan/report.c53
-rw-r--r--mm/khugepaged.c18
-rw-r--r--mm/kmsan/instrumentation.c8
-rw-r--r--mm/ksm.c78
-rw-r--r--mm/madvise.c30
-rw-r--r--mm/memory.c9
-rw-r--r--mm/migrate.c34
-rw-r--r--mm/mmap.c3
-rw-r--r--mm/page_alloc.c5
-rw-r--r--mm/page_vma_mapped.c7
-rw-r--r--mm/pagewalk.c33
-rw-r--r--mm/sparse-vmemmap.c73
-rw-r--r--mm/truncate.c2
-rw-r--r--mm/userfaultfd.c6
-rw-r--r--mm/workingset.c5
-rw-r--r--mm/z3fold.c36
-rw-r--r--mm/zbud.c32
-rw-r--r--mm/zpool.c10
-rw-r--r--mm/zsmalloc.c342
-rw-r--r--mm/zswap.c35
-rw-r--r--tools/testing/radix-tree/maple.c5
-rw-r--r--tools/testing/selftests/damon/Makefile2
-rw-r--r--tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh58
-rw-r--r--tools/testing/selftests/vm/Makefile2
-rw-r--r--tools/testing/selftests/vm/ksm_functional_tests.c279
-rw-r--r--tools/testing/selftests/vm/ksm_tests.c76
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests.sh209
-rw-r--r--tools/testing/selftests/vm/vm_util.c10
-rw-r--r--tools/testing/selftests/vm/vm_util.h1
83 files changed, 2014 insertions, 948 deletions
diff --git a/.mailmap b/.mailmap
index bbb6a077d227..514a5a6c264d 100644
--- a/.mailmap
+++ b/.mailmap
@@ -228,6 +228,7 @@ Juha Yrjola <at solidboot.com>
Juha Yrjola <juha.yrjola@nokia.com>
Juha Yrjola <juha.yrjola@solidboot.com>
Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
+Iskren Chernev <me@iskren.info> <iskren.chernev@gmail.com>
Kalle Valo <kvalo@kernel.org> <kvalo@codeaurora.org>
Kalyan Thota <quic_kalyant@quicinc.com> <kalyan_t@codeaurora.org>
Kay Sievers <kay.sievers@vrfy.org>
@@ -287,6 +288,7 @@ Matthew Wilcox <willy@infradead.org> <willy@linux.intel.com>
Matthew Wilcox <willy@infradead.org> <willy@parisc-linux.org>
Matthias Fuchs <socketcan@esd.eu> <matthias.fuchs@esd.eu>
Matthieu CASTET <castet.matthieu@free.fr>
+Matti Vaittinen <mazziesaccount@gmail.com> <matti.vaittinen@fi.rohmeurope.com>
Matt Ranostay <matt.ranostay@konsulko.com> <matt@ranostay.consulting>
Matt Ranostay <mranostay@gmail.com> Matthew Ranostay <mranostay@embeddedalley.com>
Matt Ranostay <mranostay@gmail.com> <matt.ranostay@intel.com>
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 6f0ff8b70d16..12915f379c22 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1137,53 +1137,28 @@ static void free_empty_tables(unsigned long addr, unsigned long end,
}
#endif
+void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+ pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
+}
+
+int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
+ unsigned long addr, unsigned long next)
+{
+ vmemmap_verify((pte_t *)pmdp, node, addr, next);
+ return 1;
+}
+
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap)
{
- unsigned long addr = start;
- unsigned long next;
- pgd_t *pgdp;
- p4d_t *p4dp;
- pud_t *pudp;
- pmd_t *pmdp;
-
WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES))
return vmemmap_populate_basepages(start, end, node, altmap);
-
- do {
- next = pmd_addr_end(addr, end);
-
- pgdp = vmemmap_pgd_populate(addr, node);
- if (!pgdp)
- return -ENOMEM;
-
- p4dp = vmemmap_p4d_populate(pgdp, addr, node);
- if (!p4dp)
- return -ENOMEM;
-
- pudp = vmemmap_pud_populate(p4dp, addr, node);
- if (!pudp)
- return -ENOMEM;
-
- pmdp = pmd_offset(pudp, addr);
- if (pmd_none(READ_ONCE(*pmdp))) {
- void *p = NULL;
-
- p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
- if (!p) {
- if (vmemmap_populate_basepages(addr, next, node, altmap))
- return -ENOMEM;
- continue;
- }
-
- pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
- } else
- vmemmap_verify((pte_t *)pmdp, node, addr, next);
- } while (addr = next, addr != end);
-
- return 0;
+ else
+ return vmemmap_populate_hugepages(start, end, node, altmap);
}
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index ca26529a5a07..9cc8b84f7eb0 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -53,6 +53,7 @@ config LOONGARCH
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+ select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_NO_INSTR
select BUILDTIME_TABLE_SORT
@@ -501,6 +502,7 @@ config ARCH_FLATMEM_ENABLE
config ARCH_SPARSEMEM_ENABLE
def_bool y
+ select SPARSEMEM_VMEMMAP_ENABLE
help
Say Y to support efficient handling of sparse physical memory,
for architectures which are either NUMA (Non-Uniform Memory Access)
diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h
index 4bfeb3c9c9ac..af1d1e4a6965 100644
--- a/arch/loongarch/include/asm/pgalloc.h
+++ b/arch/loongarch/include/asm/pgalloc.h
@@ -42,15 +42,6 @@ static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
extern void pagetable_init(void);
-/*
- * Initialize a new pmd table with invalid pointers.
- */
-extern void pmd_init(unsigned long page, unsigned long pagetable);
-
-/*
- * Initialize a new pgd / pmd table with invalid pointers.
- */
-extern void pgd_init(unsigned long page);
extern pgd_t *pgd_alloc(struct mm_struct *mm);
#define __pte_free_tlb(tlb, pte, address) \
@@ -76,7 +67,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
}
pmd = (pmd_t *)page_address(pg);
- pmd_init((unsigned long)pmd, (unsigned long)invalid_pte_table);
+ pmd_init(pmd);
return pmd;
}
@@ -92,7 +83,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
pud = (pud_t *) __get_free_page(GFP_KERNEL);
if (pud)
- pud_init((unsigned long)pud, (unsigned long)invalid_pmd_table);
+ pud_init(pud);
return pud;
}
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 754ee51e1ba8..7a34e900d8c1 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -11,6 +11,7 @@
#include <linux/compiler.h>
#include <asm/addrspace.h>
+#include <asm/page.h>
#include <asm/pgtable-bits.h>
#if CONFIG_PGTABLE_LEVELS == 2
@@ -59,6 +60,7 @@
#include <linux/mm_types.h>
#include <linux/mmzone.h>
#include <asm/fixmap.h>
+#include <asm/sparsemem.h>
struct mm_struct;
struct vm_area_struct;
@@ -86,7 +88,10 @@ extern unsigned long zero_page_mask;
#define VMALLOC_START MODULES_END
#define VMALLOC_END \
(vm_map_base + \
- min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE)
+ min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE - VMEMMAP_SIZE)
+
+#define vmemmap ((struct page *)((VMALLOC_END + PMD_SIZE) & PMD_MASK))
+#define VMEMMAP_END ((unsigned long)vmemmap + VMEMMAP_SIZE - 1)
#define pte_ERROR(e) \
pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
@@ -237,11 +242,11 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pm
#define pfn_pmd(pfn, prot) __pmd(((pfn) << _PFN_SHIFT) | pgprot_val(prot))
/*
- * Initialize a new pgd / pmd table with invalid pointers.
+ * Initialize a new pgd / pud / pmd table with invalid pointers.
*/
-extern void pgd_init(unsigned long page);
-extern void pud_init(unsigned long page, unsigned long pagetable);
-extern void pmd_init(unsigned long page, unsigned long pagetable);
+extern void pgd_init(void *addr);
+extern void pud_init(void *addr);
+extern void pmd_init(void *addr);
/*
* Non-present pages: high 40 bits are offset, next 8 bits type,
diff --git a/arch/loongarch/include/asm/sparsemem.h b/arch/loongarch/include/asm/sparsemem.h
index 3d18cdf1b069..8d4af6aff8a8 100644
--- a/arch/loongarch/include/asm/sparsemem.h
+++ b/arch/loongarch/include/asm/sparsemem.h
@@ -11,8 +11,16 @@
#define SECTION_SIZE_BITS 29 /* 2^29 = Largest Huge Page Size */
#define MAX_PHYSMEM_BITS 48
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#define VMEMMAP_SIZE (sizeof(struct page) * (1UL << (cpu_pabits + 1 - PAGE_SHIFT)))
+#endif
+
#endif /* CONFIG_SPARSEMEM */
+#ifndef VMEMMAP_SIZE
+#define VMEMMAP_SIZE 0 /* 1, For FLATMEM; 2, For SPARSEMEM without VMEMMAP. */
+#endif
+
#ifdef CONFIG_MEMORY_HOTPLUG
int memory_add_physaddr_to_nid(u64 addr);
#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c
index a13f92593cfd..eb5d3a4c8a7a 100644
--- a/arch/loongarch/kernel/numa.c
+++ b/arch/loongarch/kernel/numa.c
@@ -78,7 +78,7 @@ void __init pcpu_populate_pte(unsigned long addr)
new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
pgd_populate(&init_mm, pgd, new);
#ifndef __PAGETABLE_PUD_FOLDED
- pud_init((unsigned long)new, (unsigned long)invalid_pmd_table);
+ pud_init(new);
#endif
}
@@ -89,7 +89,7 @@ void __init pcpu_populate_pte(unsigned long addr)
new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
pud_populate(&init_mm, pud, new);
#ifndef __PAGETABLE_PMD_FOLDED
- pmd_init((unsigned long)new, (unsigned long)invalid_pte_table);
+ pmd_init(new);
#endif
}
diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c
index 080061793c85..e018aed34586 100644
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@@ -22,7 +22,7 @@
#include <linux/pfn.h>
#include <linux/hardirq.h>
#include <linux/gfp.h>
-#include <linux/initrd.h>
+#include <linux/hugetlb.h>
#include <linux/mmzone.h>
#include <asm/asm-offsets.h>
@@ -152,6 +152,45 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+ pmd_t entry;
+
+ entry = pfn_pmd(virt_to_pfn(p), PAGE_KERNEL);
+ pmd_val(entry) |= _PAGE_HUGE | _PAGE_HGLOBAL;
+ set_pmd_at(&init_mm, addr, pmd, entry);
+}
+
+int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
+ unsigned long addr, unsigned long next)
+{
+ int huge = pmd_val(*pmd) & _PAGE_HUGE;
+
+ if (huge)
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+
+ return huge;
+}
+
+int __meminit vmemmap_populate(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap)
+{
+#if CONFIG_PGTABLE_LEVELS == 2
+ return vmemmap_populate_basepages(start, end, node, NULL);
+#else
+ return vmemmap_populate_hugepages(start, end, node, NULL);
+#endif
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
+{
+}
+#endif
+#endif
+
static pte_t *fixmap_pte(unsigned long addr)
{
pgd_t *pgd;
@@ -168,7 +207,7 @@ static pte_t *fixmap_pte(unsigned long addr)
new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
pgd_populate(&init_mm, pgd, new);
#ifndef __PAGETABLE_PUD_FOLDED
- pud_init((unsigned long)new, (unsigned long)invalid_pmd_table);
+ pud_init(new);
#endif
}
@@ -179,7 +218,7 @@ static pte_t *fixmap_pte(unsigned long addr)
new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
pud_populate(&init_mm, pud, new);
#ifndef __PAGETABLE_PMD_FOLDED
- pmd_init((unsigned long)new, (unsigned long)invalid_pte_table);
+ pmd_init(new);
#endif
}
diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c
index ee179ccd3e3f..36a6dc0148ae 100644
--- a/arch/loongarch/mm/pgtable.c
+++ b/arch/loongarch/mm/pgtable.c
@@ -16,7 +16,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
ret = (pgd_t *) __get_free_page(GFP_KERNEL);
if (ret) {
init = pgd_offset(&init_mm, 0UL);
- pgd_init((unsigned long)ret);
+ pgd_init(ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
}
@@ -25,7 +25,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(pgd_alloc);
-void pgd_init(unsigned long page)
+void pgd_init(void *addr)
{
unsigned long *p, *end;
unsigned long entry;
@@ -38,7 +38,7 @@ void pgd_init(unsigned long page)
entry = (unsigned long)invalid_pte_table;
#endif
- p = (unsigned long *) page;
+ p = (unsigned long *)addr;
end = p + PTRS_PER_PGD;
do {
@@ -56,11 +56,12 @@ void pgd_init(unsigned long page)
EXPORT_SYMBOL_GPL(pgd_init);
#ifndef __PAGETABLE_PMD_FOLDED
-void pmd_init(unsigned long addr, unsigned long pagetable)
+void pmd_init(void *addr)
{
unsigned long *p, *end;
+ unsigned long pagetable = (unsigned long)invalid_pte_table;
- p = (unsigned long *) addr;
+ p = (unsigned long *)addr;
end = p + PTRS_PER_PMD;
do {
@@ -79,9 +80,10 @@ EXPORT_SYMBOL_GPL(pmd_init);
#endif
#ifndef __PAGETABLE_PUD_FOLDED
-void pud_init(unsigned long addr, unsigned long pagetable)
+void pud_init(void *addr)
{
unsigned long *p, *end;
+ unsigned long pagetable = (unsigned long)invalid_pmd_table;
p = (unsigned long *)addr;
end = p + PTRS_PER_PUD;
@@ -98,6 +100,7 @@ void pud_init(unsigned long addr, unsigned long pagetable)
p[-1] = pagetable;
} while (p != end);
}
+EXPORT_SYMBOL_GPL(pud_init);
#endif
pmd_t mk_pmd(struct page *page, pgprot_t prot)
@@ -119,12 +122,12 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
void __init pagetable_init(void)
{
/* Initialize the entire pgd. */
- pgd_init((unsigned long)swapper_pg_dir);
- pgd_init((unsigned long)invalid_pg_dir);
+ pgd_init(swapper_pg_dir);
+ pgd_init(invalid_pg_dir);
#ifndef __PAGETABLE_PUD_FOLDED
- pud_init((unsigned long)invalid_pud_table, (unsigned long)invalid_pmd_table);
+ pud_init(invalid_pud_table);
#endif
#ifndef __PAGETABLE_PMD_FOLDED
- pmd_init((unsigned long)invalid_pmd_table, (unsigned long)invalid_pte_table);
+ pmd_init(invalid_pmd_table);
#endif
}
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index 796035784c73..f72e737dda21 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -33,7 +33,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
/*
* Initialize a new pmd table with invalid pointers.
*/
-extern void pmd_init(unsigned long page, unsigned long pagetable);
+extern void pmd_init(void *addr);
#ifndef __PAGETABLE_PMD_FOLDED
@@ -44,9 +44,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
#endif
/*
- * Initialize a new pgd / pmd table with invalid pointers.
+ * Initialize a new pgd table with invalid pointers.
*/
-extern void pgd_init(unsigned long page);
+extern void pgd_init(void *addr);
extern pgd_t *pgd_alloc(struct mm_struct *mm);
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
@@ -77,7 +77,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
}
pmd = (pmd_t *)page_address(pg);
- pmd_init((unsigned long)pmd, (unsigned long)invalid_pte_table);
+ pmd_init(pmd);
return pmd;
}
@@ -93,7 +93,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_TABLE_ORDER);
if (pud)
- pud_init((unsigned long)pud, (unsigned long)invalid_pmd_table);
+ pud_init(pud);
return pud;
}
diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 436c29d698fa..c6310192b654 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -313,11 +313,11 @@ static inline pmd_t *pud_pgtable(pud_t pud)
#endif
/*
- * Initialize a new pgd / pmd table with invalid pointers.
+ * Initialize a new pgd / pud / pmd table with invalid pointers.
*/
-extern void pgd_init(unsigned long page);
-extern void pud_init(unsigned long page, unsigned long pagetable);
-extern void pmd_init(unsigned long page, unsigned long pagetable);
+extern void pgd_init(void *addr);
+extern void pud_init(void *addr);
+extern void pmd_init(void *addr);
/*
* Non-present pages: high 40 bits are offset, next 8 bits type,
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 74cd64a24d05..e8c08988ed37 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -122,8 +122,7 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
if (!cache)
return NULL;
new_pmd = kvm_mmu_memory_cache_alloc(cache);
- pmd_init((unsigned long)new_pmd,
- (unsigned long)invalid_pte_table);
+ pmd_init(new_pmd);
pud_populate(NULL, pud, new_pmd);
}
pmd = pmd_offset(pud, addr);
diff --git a/arch/mips/mm/pgtable-32.c b/arch/mips/mm/pgtable-32.c
index 61891af25019..f57fb69472f8 100644
--- a/arch/mips/mm/pgtable-32.c
+++ b/arch/mips/mm/pgtable-32.c
@@ -13,9 +13,9 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
-void pgd_init(unsigned long page)
+void pgd_init(void *addr)
{
- unsigned long *p = (unsigned long *) page;
+ unsigned long *p = (unsigned long *)addr;
int i;
for (i = 0; i < USER_PTRS_PER_PGD; i+=8) {
@@ -61,9 +61,8 @@ void __init pagetable_init(void)
#endif
/* Initialize the entire pgd. */
- pgd_init((unsigned long)swapper_pg_dir);
- pgd_init((unsigned long)swapper_pg_dir
- + sizeof(pgd_t) * USER_PTRS_PER_PGD);
+ pgd_init(swapper_pg_dir);
+ pgd_init(&swapper_pg_dir[USER_PTRS_PER_PGD]);
pgd_base = swapper_pg_dir;
diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c
index 7536f7804c44..b4386a0e2ef8 100644
--- a/arch/mips/mm/pgtable-64.c
+++ b/arch/mips/mm/pgtable-64.c
@@ -13,7 +13,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
-void pgd_init(unsigned long page)
+void pgd_init(void *addr)
{
unsigned long *p, *end;
unsigned long entry;
@@ -26,7 +26,7 @@ void pgd_init(unsigned long page)
entry = (unsigned long)invalid_pte_table;
#endif
- p = (unsigned long *) page;
+ p = (unsigned long *) addr;
end = p + PTRS_PER_PGD;
do {
@@ -43,11 +43,12 @@ void pgd_init(unsigned long page)
}
#ifndef __PAGETABLE_PMD_FOLDED
-void pmd_init(unsigned long addr, unsigned long pagetable)
+void pmd_init(void *addr)
{
unsigned long *p, *end;
+ unsigned long pagetable = (unsigned long)invalid_pte_table;
- p = (unsigned long *) addr;
+ p = (unsigned long *)addr;
end = p + PTRS_PER_PMD;
do {
@@ -66,9 +67,10 @@ EXPORT_SYMBOL_GPL(pmd_init);
#endif
#ifndef __PAGETABLE_PUD_FOLDED
-void pud_init(unsigned long addr, unsigned long pagetable)
+void pud_init(void *addr)
{
unsigned long *p, *end;
+ unsigned long pagetable = (unsigned long)invalid_pmd_table;
p = (unsigned long *)addr;
end = p + PTRS_PER_PUD;
@@ -108,12 +110,12 @@ void __init pagetable_init(void)
pgd_t *pgd_base;
/* Initialize the entire pgd. */
- pgd_init((unsigned long)swapper_pg_dir);
+ pgd_init(swapper_pg_dir);
#ifndef __PAGETABLE_PUD_FOLDED
- pud_init((unsigned long)invalid_pud_table, (unsigned long)invalid_pmd_table);
+ pud_init(invalid_pud_table);
#endif
#ifndef __PAGETABLE_PMD_FOLDED
- pmd_init((unsigned long)invalid_pmd_table, (unsigned long)invalid_pte_table);
+ pmd_init(invalid_pmd_table);
#endif
pgd_base = swapper_pg_dir;
/*
diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c
index 3b7590660a04..b13314be5d0e 100644
--- a/arch/mips/mm/pgtable.c
+++ b/arch/mips/mm/pgtable.c
@@ -15,7 +15,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER);
if (ret) {
init = pgd_offset(&init_mm, 0UL);
- pgd_init((unsigned long)ret);
+ pgd_init(ret);
memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
}
diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index 3c4ae74d5798..ecd1657bb2ce 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -26,11 +26,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
set_pmd(pmd, __pmd((unsigned long)page_address(pte)));
}
-/*
- * Initialize a new pmd table with invalid pointers.
- */
-extern void pmd_init(unsigned long page, unsigned long pagetable);
-
extern pgd_t *pgd_alloc(struct mm_struct *mm);
#define __pte_free_tlb(tlb, pte, addr) \
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e8db4edd7cc9..a190aae8ceaf 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1492,72 +1492,44 @@ static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
-static int __meminit vmemmap_populate_hugepages(unsigned long start,
- unsigned long end, int node, struct vmem_altmap *altmap)
+void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+ unsigned long addr, unsigned long next)
{
- unsigned long addr;
- unsigned long next;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
-
- for (addr = start; addr < end; addr = next) {
- next = pmd_addr_end(addr, end);
-
- pgd = vmemmap_pgd_populate(addr, node);
- if (!pgd)
- return -ENOMEM;
-
- p4d = vmemmap_p4d_populate(pgd, addr, node);
- if (!p4d)
- return -ENOMEM;
-
- pud = vmemmap_pud_populate(p4d, addr, node);
- if (!pud)
- return -ENOMEM;
-
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
- void *p;
-
- p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
- if (p) {
- pte_t entry;
-
- entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, __pmd(pte_val(entry)));
+ pte_t entry;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, __pmd(pte_val(entry)));
+
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
- /* check to see if we have contiguous blocks */
- if (p_end != p || node_start != node) {
- if (p_start)
- pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
- addr_start, addr_end-1, p_start, p_end-1, node_start);
- addr_start = addr;
- node_start = node;
- p_start = p;
- }
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
- addr_end = addr + PMD_SIZE;
- p_end = p + PMD_SIZE;
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE))
+ vmemmap_use_new_sub_pmd(addr, next);
+}
- if (!IS_ALIGNED(addr, PMD_SIZE) ||
- !IS_ALIGNED(next, PMD_SIZE))
- vmemmap_use_new_sub_pmd(addr, next);
+int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
+ unsigned long addr, unsigned long next)
+{
+ int large = pmd_large(*pmd);
- continue;
- } else if (altmap)
- return -ENOMEM; /* no fallback */
- } else if (pmd_large(*pmd)) {
- vmemmap_verify((pte_t *)pmd, node, addr, next);
- vmemmap_use_sub_pmd(addr, next);
- continue;
- }
- if (vmemmap_populate_basepages(addr, next, node, NULL))
- return -ENOMEM;
+ if (pmd_large(*pmd)) {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+ vmemmap_use_sub_pmd(addr, next);
}
- return 0;
+
+ return large;
}
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index 3cc83997a1f8..fecf523f36d8 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -1904,10 +1904,6 @@ static int mport_cdev_open(struct inode *inode, struct file *filp)
priv->md = chdev;
- mutex_lock(&chdev->file_mutex);
- list_add_tail(&priv->list, &chdev->file_list);
- mutex_unlock(&chdev->file_mutex);
-
INIT_LIST_HEAD(&priv->db_filters);
INIT_LIST_HEAD(&priv->pw_filters);
spin_lock_init(&priv->fifo_lock);
@@ -1926,6 +1922,9 @@ static int mport_cdev_open(struct inode *inode, struct file *filp)
spin_lock_init(&priv->req_lock);
mutex_init(&priv->dma_lock);
#endif
+ mutex_lock(&chdev->file_mutex);
+ list_add_tail(&priv->list, &chdev->file_list);
+ mutex_unlock(&chdev->file_mutex);
filp->private_data = priv;
goto out;
diff --git a/fs/dax.c b/fs/dax.c
index 1c6867810cbd..a57e320e7971 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
-static inline bool dax_mapping_is_cow(struct address_space *mapping)
+static inline bool dax_mapping_is_shared(struct page *page)
{
- return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
+ return (unsigned long)page->mapping == PAGE_MAPPING_DAX_SHARED;
}
/*
- * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
+ * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
+ * refcount.
*/
-static inline void dax_mapping_set_cow(struct page *page)
+static inline void dax_mapping_set_shared(struct page *page)
{
- if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
+ if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_SHARED) {
/*
* Reset the index if the page was already mapped
* regularly before.
*/
if (page->mapping)
- page->index = 1;
- page->mapping = (void *)PAGE_MAPPING_DAX_COW;
+ page->share = 1;
+ page->mapping = (void *)PAGE_MAPPING_DAX_SHARED;
}
- page->index++;
+ page->share++;
+}
+
+static inline unsigned long dax_mapping_decrease_shared(struct page *page)
+{
+ return --page->share;
}
/*
- * When it is called in dax_insert_entry(), the cow flag will indicate that
+ * When it is called in dax_insert_entry(), the shared flag will indicate that
* whether this entry is shared by multiple files. If so, set the page->mapping
- * FS_DAX_MAPPING_COW, and use page->index as refcount.
+ * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
*/
static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address, bool cow)
+ struct vm_area_struct *vma, unsigned long address, bool shared)
{
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
@@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- if (cow) {
- dax_mapping_set_cow(page);
+ if (shared) {
+ dax_mapping_set_shared(page);
} else {
WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
@@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- if (dax_mapping_is_cow(page->mapping)) {
- /* keep the CoW flag if this page is still shared */
- if (page->index-- > 0)
+ if (dax_mapping_is_shared(page)) {
+ /* keep the shared flag if this page is still shared */
+ if (dax_mapping_decrease_shared(page) > 0)
continue;
} else
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
@@ -840,12 +846,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
(iter->iomap.flags & IOMAP_F_DIRTY);
}
-static bool dax_fault_is_cow(const struct iomap_iter *iter)
-{
- return (iter->flags & IOMAP_WRITE) &&
- (iter->iomap.flags & IOMAP_F_SHARED);
-}
-
/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
@@ -859,13 +859,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *new_entry = dax_make_entry(pfn, flags);
- bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
- bool cow = dax_fault_is_cow(iter);
+ bool write = iter->flags & IOMAP_WRITE;
+ bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
+ bool shared = iter->iomap.flags & IOMAP_F_SHARED;
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
+ if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -877,12 +878,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
xas_reset(xas);
xas_lock_irq(xas);
- if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
- cow);
+ shared);
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -902,7 +903,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
- if (cow)
+ if (write && shared)
xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irq(xas);
@@ -1086,7 +1087,7 @@ out:
}
/**
- * dax_iomap_cow_copy - Copy the data from source to destination before write
+ * dax_iomap_copy_around - Copy the data from source to destination before write
* @pos: address to do copy from.
* @length: size of copy operation.
* @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
@@ -1095,35 +1096,50 @@ out:
*
* This can be called from two places. Either during DAX write fault (page
* aligned), to copy the length size data to daddr. Or, while doing normal DAX
- * write operation, dax_iomap_actor() might call this to do the copy of either
+ * write operation, dax_iomap_iter() might call this to do the copy of either
* start or end unaligned address. In the latter case the rest of the copy of
- * aligned ranges is taken care by dax_iomap_actor() itself.
+ * aligned ranges is taken care by dax_iomap_iter() itself.
+ * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
+ * area to make sure no old data remains.
*/
-static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
+static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
const struct iomap *srcmap, void *daddr)
{
loff_t head_off = pos & (align_size - 1);
size_t size = ALIGN(head_off + length, align_size);
loff_t end = pos + length;
loff_t pg_end = round_up(end, align_size);
+ /* copy_all is usually in page fault case */
bool copy_all = head_off == 0 && end == pg_end;
+ /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
+ bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
+ srcmap->type == IOMAP_UNWRITTEN;
void *saddr = 0;
int ret = 0;
- ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
- if (ret)
- return ret;
+ if (!zero_edge) {
+ ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
+ if (ret)
+ return ret;
+ }
if (copy_all) {
- ret = copy_mc_to_kernel(daddr, saddr, length);
- return ret ? -EIO : 0;
+ if (zero_edge)
+ memset(daddr, 0, size);
+ else
+ ret = copy_mc_to_kernel(daddr, saddr, length);
+ goto out;
}
/* Copy the head part of the range */
if (head_off) {
- ret = copy_mc_to_kernel(daddr, saddr, head_off);
- if (ret)
- return -EIO;
+ if (zero_edge)
+ memset(daddr, 0, head_off);
+ else {
+ ret = copy_mc_to_kernel(daddr, saddr, head_off);
+ if (ret)
+ return -EIO;
+ }
}
/* Copy the tail part of the range */
@@ -1131,12 +1147,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
loff_t tail_off = head_off + length;
loff_t tail_len = pg_end - end;
- ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
- tail_len);
- if (ret)
- return -EIO;
+ if (zero_edge)
+ memset(daddr + tail_off, 0, tail_len);
+ else {
+ ret = copy_mc_to_kernel(daddr + tail_off,
+ saddr + tail_off, tail_len);
+ if (ret)
+ return -EIO;
+ }
}
- return 0;
+out:
+ if (zero_edge)
+ dax_flush(srcmap->dax_dev, daddr, size);
+ return ret ? -EIO : 0;
}
/*
@@ -1221,6 +1244,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */
+static s64 dax_unshare_iter(struct iomap_iter *iter)
+{
+ struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
+ loff_t length = iomap_length(iter);
+ int id = 0;
+ s64 ret = 0;
+ void *daddr = NULL, *saddr = NULL;
+
+ /* don't bother with blocks that are not shared to start with */
+ if (!(iomap->flags & IOMAP_F_SHARED))
+ return length;
+ /* don't bother with holes or unwritten extents */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ return length;
+
+ id = dax_read_lock();
+ ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = copy_mc_to_kernel(daddr, saddr, length);
+ if (ret)
+ ret = -EIO;
+
+out_unlock:
+ dax_read_unlock(id);
+ return ret;
+}
+
+int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
+ const struct iomap_ops *ops)
+{
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = pos,
+ .len = len,
+ .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
+ };
+ int ret;
+
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = dax_unshare_iter(&iter);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dax_file_unshare);
+
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
const struct iomap *iomap = &iter->iomap;
@@ -1235,13 +1310,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
if (ret < 0)
return ret;
memset(kaddr + offset, 0, size);
- if (srcmap->addr != iomap->addr) {
- ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
- kaddr);
- if (ret < 0)
- return ret;
- dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
- } else
+ if (iomap->flags & IOMAP_F_SHARED)
+ ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
+ kaddr);
+ else
dax_flush(iomap->dax_dev, kaddr + offset, size);
return ret;
}
@@ -1258,6 +1330,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
return length;
+ /*
+ * invalidate the pages whose sharing state is to be changed
+ * because of CoW.
+ */
+ if (iomap->flags & IOMAP_F_SHARED)
+ invalidate_inode_pages2_range(iter->inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (pos + length - 1) >> PAGE_SHIFT);
+
do {
unsigned offset = offset_in_page(pos);
unsigned size = min_t(u64, PAGE_SIZE - offset, length);
@@ -1318,12 +1399,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
- const struct iomap *srcmap = &iomi->srcmap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iomi);
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
struct dax_device *dax_dev = iomap->dax_dev;
loff_t end = pos + length, done = 0;
bool write = iov_iter_rw(iter) == WRITE;
+ bool cow = write && iomap->flags & IOMAP_F_SHARED;
ssize_t ret = 0;
size_t xfer;
int id;
@@ -1350,7 +1432,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
* into page tables. We have to tear down these mappings so that data
* written by write(2) is visible in mmap.
*/
- if (iomap->flags & IOMAP_F_NEW) {
+ if (iomap->flags & IOMAP_F_NEW || cow) {
invalidate_inode_pages2_range(iomi->inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
@@ -1384,10 +1466,9 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
break;
}
- if (write &&
- srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
- ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
- kaddr);
+ if (cow) {
+ ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
+ srcmap, kaddr);
if (ret)
break;
}
@@ -1532,7 +1613,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
struct xa_state *xas, void **entry, bool pmd)
{
const struct iomap *iomap = &iter->iomap;
- const struct iomap *srcmap = &iter->srcmap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = iter->flags & IOMAP_WRITE;
@@ -1563,9 +1644,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
- if (write &&
- srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
- err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
+ if (write && iomap->flags & IOMAP_F_SHARED) {
+ err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
if (err)
return dax_fault_return(err);
}
@@ -1936,15 +2016,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
.len = len,
.flags = IOMAP_DAX,
};
- int ret;
+ int ret, compared = 0;
- while ((ret = iomap_iter(&src_iter, ops)) > 0) {
- while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
- dst_iter.processed = dax_range_compare_iter(&src_iter,
- &dst_iter, len, same);
- }
- if (ret <= 0)
- src_iter.processed = ret;
+ while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
+ (ret = iomap_iter(&dst_iter, ops)) > 0) {
+ compared = dax_range_compare_iter(&src_iter, &dst_iter, len,
+ same);
+ if (compared < 0)
+ return ret;
+ src_iter.processed = dst_iter.processed = compared;
}
return ret;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c77eea4f636b..9f349f834977 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -766,11 +766,11 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
return ncpy;
}
-static int fuse_check_page(struct page *page)
+static int fuse_check_folio(struct folio *folio)
{
- if (page_mapcount(page) ||
- page->mapping != NULL ||
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+ if (folio_mapped(folio) ||
+ folio->mapping != NULL ||
+ (folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
1 << PG_uptodate |
@@ -780,7 +780,7 @@ static int fuse_check_page(struct page *page)
1 << PG_reclaim |
1 << PG_waiters |
LRU_GEN_MASK | LRU_REFS_MASK))) {
- dump_page(page, "fuse: trying to steal weird page");
+ dump_page(&folio->page, "fuse: trying to steal weird page");
return 1;
}
return 0;
@@ -789,11 +789,11 @@ static int fuse_check_page(struct page *page)
static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
{
int err;
- struct page *oldpage = *pagep;
- struct page *newpage;
+ struct folio *oldfolio = page_folio(*pagep);
+ struct folio *newfolio;
struct pipe_buffer *buf = cs->pipebufs;
- get_page(oldpage);
+ folio_get(oldfolio);
err = unlock_request(cs->req);
if (err)
goto out_put_old;
@@ -816,35 +816,36 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (!pipe_buf_try_steal(cs->pipe, buf))
goto out_fallback;
- newpage = buf->page;
+ newfolio = page_folio(buf->page);
- if (!PageUptodate(newpage))
- SetPageUptodate(newpage);
+ if (!folio_test_uptodate(newfolio))
+ folio_mark_uptodate(newfolio);
- ClearPageMappedToDisk(newpage);
+ folio_clear_mappedtodisk(newfolio);
- if (fuse_check_page(newpage) != 0)
+ if (fuse_check_folio(newfolio) != 0)
goto out_fallback_unlock;
/*
* This is a new and locked page, it shouldn't be mapped or
* have any special flags on it
*/
- if (WARN_ON(page_mapped(oldpage)))
+ if (WARN_ON(folio_mapped(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(page_has_private(oldpage)))
+ if (WARN_ON(folio_has_private(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+ if (WARN_ON(folio_test_dirty(oldfolio) ||
+ folio_test_writeback(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(PageMlocked(oldpage)))
+ if (WARN_ON(folio_test_mlocked(oldfolio)))
goto out_fallback_unlock;
- replace_page_cache_page(oldpage, newpage);
+ replace_page_cache_folio(oldfolio, newfolio);
- get_page(newpage);
+ folio_get(newfolio);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
- lru_cache_add(newpage);
+ folio_add_lru(newfolio);
/*
* Release while we have extra ref on stolen page. Otherwise
@@ -857,28 +858,28 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (test_bit(FR_ABORTED, &cs->req->flags))
err = -ENOENT;
else
- *pagep = newpage;
+ *pagep = &newfolio->page;
spin_unlock(&cs->req->waitq.lock);
if (err) {
- unlock_page(newpage);
- put_page(newpage);
+ folio_unlock(newfolio);
+ folio_put(newfolio);
goto out_put_old;
}
- unlock_page(oldpage);
+ folio_unlock(oldfolio);
/* Drop ref for ap->pages[] array */
- put_page(oldpage);
+ folio_put(oldfolio);
cs->len = 0;
err = 0;
out_put_old:
/* Drop ref obtained in this function */
- put_page(oldpage);
+ folio_put(oldfolio);
return err;
out_fallback_unlock:
- unlock_page(newpage);
+ folio_unlock(newfolio);
out_fallback:
cs->pg = buf->page;
cs->offset = buf->offset;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index c4526f16355d..a0746be3c1de 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -458,6 +458,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
/* panic? */
return -EIO;
+ if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN)
+ return -EIO;
fd.search_key->cat = HFS_I(main_inode)->cat_key;
if (hfs_brec_find(&fd))
/* panic? */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 790d2727141a..48f1a8ad2243 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -388,9 +388,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
{
pte_t *ptep, pte;
- ptep = huge_pte_offset(vma->vm_mm, addr,
- huge_page_size(hstate_vma(vma)));
-
+ ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
if (!ptep)
return false;
@@ -412,10 +410,12 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
*/
static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
{
+ unsigned long offset = 0;
+
if (vma->vm_pgoff < start)
- return (start - vma->vm_pgoff) << PAGE_SHIFT;
- else
- return 0;
+ offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+
+ return vma->vm_start + offset;
}
static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
@@ -457,7 +457,7 @@ retry:
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
- if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+ if (!hugetlb_vma_maps_page(vma, v_start, page))
continue;
if (!hugetlb_vma_trylock_write(vma)) {
@@ -473,8 +473,8 @@ retry:
break;
}
- unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
- NULL, ZAP_FLAG_DROP_MARKER);
+ unmap_hugepage_range(vma, v_start, v_end, NULL,
+ ZAP_FLAG_DROP_MARKER);
hugetlb_vma_unlock_write(vma);
}
@@ -507,10 +507,9 @@ retry:
*/
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
- if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
- unmap_hugepage_range(vma, vma->vm_start + v_start,
- v_end, NULL,
- ZAP_FLAG_DROP_MARKER);
+ if (hugetlb_vma_maps_page(vma, v_start, page))
+ unmap_hugepage_range(vma, v_start, v_end, NULL,
+ ZAP_FLAG_DROP_MARKER);
kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
hugetlb_vma_unlock_write(vma);
@@ -540,8 +539,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
- unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
- NULL, zap_flags);
+ unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags);
/*
* Note that vma lock only exists for shared/non-private
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 740b64238312..a503c553bab2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -560,8 +560,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
u32 nlink = le16_to_cpu(di->i_links_count);
u32 hi = le16_to_cpu(di->i_links_count_hi);
- if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
- nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
+ nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
return nlink;
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 98ac37e34e3d..9ea8555a19bc 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -237,14 +237,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
unsigned long flags,
unsigned long reason)
{
- struct mm_struct *mm = ctx->mm;
pte_t *ptep, pte;
bool ret = true;
- mmap_assert_locked(mm);
-
- ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
+ mmap_assert_locked(ctx->mm);
+ ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
if (!ptep)
goto out;
@@ -376,7 +374,8 @@ static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
*/
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
- struct mm_struct *mm = vmf->vma->vm_mm;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *mm = vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
vm_fault_t ret = VM_FAULT_SIGBUS;
@@ -403,7 +402,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
*/
mmap_assert_locked(mm);
- ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
+ ctx = vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
@@ -493,6 +492,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
blocking_state = userfaultfd_get_blocking_state(vmf->flags);
+ /*
+ * This stablizes pgtable for hugetlb on e.g. pmd unsharing. Need
+ * to be before setting current state.
+ */
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_vma_lock_read(vma);
+
spin_lock_irq(&ctx->fault_pending_wqh.lock);
/*
* After the __add_wait_queue the uwq is visible to userland
@@ -507,13 +513,15 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
set_current_state(blocking_state);
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
- if (!is_vm_hugetlb_page(vmf->vma))
+ if (!is_vm_hugetlb_page(vma))
must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
reason);
else
- must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
+ must_wait = userfaultfd_huge_must_wait(ctx, vma,
vmf->address,
vmf->flags, reason);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_vma_unlock_read(vma);
mmap_read_unlock(mm);
if (likely(must_wait && !READ_ONCE(ctx->released))) {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 1f783e979629..13f1b2add390 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1138,10 +1138,6 @@ xfs_ioctl_setattr_xflags(
if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
- /* Don't allow us to set DAX mode for a reflinked file for now. */
- if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
- return -EINVAL;
-
/* diflags2 only valid for v3 inodes. */
i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
if (i_flags2 && !xfs_has_v3inodes(mp))
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 68436370927d..8ac27bbda882 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1253,7 +1253,7 @@ xfs_read_iomap_begin(
return error;
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, 0);
- if (!error && (flags & IOMAP_REPORT))
+ if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode)))
error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0);
xfs_iunlock(ip, lockmode);
@@ -1415,7 +1415,7 @@ xfs_zero_range(
if (IS_DAX(inode))
return dax_zero_range(inode, pos, len, did_zero,
- &xfs_direct_write_iomap_ops);
+ &xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
&xfs_buffered_write_iomap_ops);
}
@@ -1430,7 +1430,7 @@ xfs_truncate_page(
if (IS_DAX(inode))
return dax_truncate_page(inode, pos, did_zero,
- &xfs_direct_write_iomap_ops);
+ &xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
&xfs_buffered_write_iomap_ops);
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 712238305bc3..515318dfbc38 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1187,10 +1187,6 @@ xfs_inode_supports_dax(
if (!S_ISREG(VFS_I(ip)->i_mode))
return false;
- /* Only supported on non-reflinked files. */
- if (xfs_is_reflink_inode(ip))
- return false;
-
/* Block size must match page size */
if (mp->m_sb.sb_blocksize != PAGE_SIZE)
return false;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 93bdd25680bc..fe46bce8cae6 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1693,8 +1693,12 @@ xfs_reflink_unshare(
inode_dio_wait(inode);
- error = iomap_file_unshare(inode, offset, len,
- &xfs_buffered_write_iomap_ops);
+ if (IS_DAX(inode))
+ error = dax_file_unshare(inode, offset, len,
+ &xfs_dax_write_iomap_ops);
+ else
+ error = iomap_file_unshare(inode, offset, len,
+ &xfs_buffered_write_iomap_ops);
if (error)
goto out;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index ba985333e26b..2b5ecb591059 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -205,6 +205,8 @@ static inline void dax_unlock_mapping_entry(struct address_space *mapping,
}
#endif
+int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
+ const struct iomap_ops *ops);
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
const struct iomap_ops *ops);
int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 786824f58d3d..36a486505b08 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -62,7 +62,7 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
return ERR_PTR(-ENOSYS);
}
-static inline int eventfd_signal(struct eventfd_ctx *ctx, int n)
+static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
return -ENOSYS;
}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 551834cd5299..1c20cbbf3d22 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_HUGETLB_H
#define _LINUX_HUGETLB_H
+#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/fs.h>
@@ -192,6 +193,43 @@ extern struct list_head huge_boot_pages;
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz);
+/*
+ * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
+ * Returns the pte_t* if found, or NULL if the address is not mapped.
+ *
+ * IMPORTANT: we should normally not directly call this function, instead
+ * this is only a common interface to implement arch-specific walker.
+ * Please consider using the hugetlb_walk() helper to make sure of the
+ * correct locking is satisfied.
+ *
+ * Since this function will walk all the pgtable pages (including not only
+ * high-level pgtable page, but also PUD entry that can be unshared
+ * concurrently for VM_SHARED), the caller of this function should be
+ * responsible of its thread safety. One can follow this rule:
+ *
+ * (1) For private mappings: pmd unsharing is not possible, so it'll
+ * always be safe if we're with the mmap sem for either read or write.
+ * This is normally always the case, IOW we don't need to do anything
+ * special.
+ *
+ * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged
+ * pgtable page can go away from under us! It can be done by a pmd
+ * unshare with a follow up munmap() on the other process), then we
+ * need either:
+ *
+ * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare
+ * won't happen upon the range (it also makes sure the pte_t we
+ * read is the right and stable one), or,
+ *
+ * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make
+ * sure even if unshare happened the racy unmap() will wait until
+ * i_mmap_rwsem is released.
+ *
+ * Option (2.1) is the safest, which guarantees pte stability from pmd
+ * sharing pov, until the vma lock released. Option (2.2) doesn't protect
+ * a concurrent pmd unshare, but it makes sure the pgtable page is safe to
+ * access.
+ */
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
@@ -1197,4 +1235,37 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
#endif
+static inline bool
+__vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+ vma->vm_private_data;
+}
+
+/*
+ * Safe version of huge_pte_offset() to check the locks. See comments
+ * above huge_pte_offset().
+ */
+static inline pte_t *
+hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
+{
+#if defined(CONFIG_HUGETLB_PAGE) && \
+ defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ /*
+ * If pmd sharing possible, locking needed to safely walk the
+ * hugetlb pgtables. More information can be found at the comment
+ * above huge_pte_offset() in the same file.
+ *
+ * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
+ */
+ if (__vma_shareable_flags_pmd(vma))
+ WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) &&
+ !lockdep_is_held(
+ &vma->vm_file->f_mapping->i_mmap_rwsem));
+#endif
+ return huge_pte_offset(vma->vm_mm, addr, sz);
+}
+
#endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 66a774d2710e..09d4f17c8d3b 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -213,7 +213,7 @@ io_mapping_free(struct io_mapping *iomap)
kfree(iomap);
}
-#endif /* _LINUX_IO_MAPPING_H */
-
int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
unsigned long addr, unsigned long pfn, unsigned long size);
+
+#endif /* _LINUX_IO_MAPPING_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9651c39d1287..517225fce10f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -986,6 +986,13 @@ static inline void set_compound_page_dtor(struct page *page,
page[1].compound_dtor = compound_dtor;
}
+static inline void folio_set_compound_dtor(struct folio *folio,
+ enum compound_dtor_id compound_dtor)
+{
+ VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
+ folio->_folio_dtor = compound_dtor;
+}
+
void destroy_large_folio(struct folio *folio);
static inline int head_compound_pincount(struct page *head)
@@ -1001,6 +1008,15 @@ static inline void set_compound_order(struct page *page, unsigned int order)
#endif
}
+static inline void folio_set_compound_order(struct folio *folio,
+ unsigned int order)
+{
+ folio->_folio_order = order;
+#ifdef CONFIG_64BIT
+ folio->_folio_nr_pages = order ? 1U << order : 0;
+#endif
+}
+
/* Returns the number of pages in this potentially compound page. */
static inline unsigned long compound_nr(struct page *page)
{
@@ -3046,7 +3062,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
* and return without waiting upon it */
#define FOLL_NOFAULT 0x80 /* do not fault in pages */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
-#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
#define FOLL_ANON 0x8000 /* don't do file mappings */
@@ -3352,6 +3367,8 @@ void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
+void pmd_init(void *addr);
+void pud_init(void *addr);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
@@ -3363,8 +3380,14 @@ struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
+void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+ unsigned long addr, unsigned long next);
+int vmemmap_check_pmd(pmd_t *pmd, int node,
+ unsigned long addr, unsigned long next);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
int node, struct vmem_altmap *altmap);
+int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap);
void vmemmap_populate_print_last(void);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 018b1c098173..3b8475007734 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -104,7 +104,10 @@ struct page {
};
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
- pgoff_t index; /* Our offset within mapping. */
+ union {
+ pgoff_t index; /* Our offset within mapping. */
+ unsigned long share; /* share count for fsdax */
+ };
/**
* @private: Mapping-private opaque data.
* Usually used for buffer_heads if PagePrivate.
@@ -929,7 +932,6 @@ typedef __bitwise unsigned int vm_fault_t;
* @VM_FAULT_OOM: Out Of Memory
* @VM_FAULT_SIGBUS: Bad access
* @VM_FAULT_MAJOR: Page read from storage
- * @VM_FAULT_WRITE: Special case for get_user_pages
* @VM_FAULT_HWPOISON: Hit poisoned small page
* @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded
* in upper bits
@@ -950,7 +952,6 @@ enum vm_fault_reason {
VM_FAULT_OOM = (__force vm_fault_t)0x000001,
VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002,
VM_FAULT_MAJOR = (__force vm_fault_t)0x000004,
- VM_FAULT_WRITE = (__force vm_fault_t)0x000008,
VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010,
VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040,
@@ -976,7 +977,6 @@ enum vm_fault_reason {
{ VM_FAULT_OOM, "OOM" }, \
{ VM_FAULT_SIGBUS, "SIGBUS" }, \
{ VM_FAULT_MAJOR, "MAJOR" }, \
- { VM_FAULT_WRITE, "WRITE" }, \
{ VM_FAULT_HWPOISON, "HWPOISON" }, \
{ VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \
{ VM_FAULT_SIGSEGV, "SIGSEGV" }, \
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f235ec442b61..480626efd786 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -639,7 +639,7 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
* Different with flags above, this flag is used only for fsdax mode. It
* indicates that this page->mapping is now under reflink case.
*/
-#define PAGE_MAPPING_DAX_COW 0x1
+#define PAGE_MAPPING_DAX_SHARED 0x1
static __always_inline bool folio_mapping_flags(struct folio *folio)
{
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2ec0ca1f3d38..29e1f9e76eb6 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1102,7 +1102,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
pgoff_t index, gfp_t gfp);
void filemap_remove_folio(struct folio *folio);
void __filemap_remove_folio(struct folio *folio, void *shadow);
-void replace_page_cache_page(struct page *old, struct page *new);
+void replace_page_cache_folio(struct folio *old, struct folio *new);
void delete_from_page_cache_batch(struct address_space *mapping,
struct folio_batch *fbatch);
bool filemap_release_folio(struct folio *folio, gfp_t gfp);
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index f3fafb731ffd..959f52e5867d 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -27,6 +27,8 @@ struct mm_walk;
* "do page table walk over the current vma", returning
* a negative value means "abort current page table walk
* right now" and returning 1 means "skip the current vma"
+ * Note that this callback is not called when the caller
+ * passes in a single VMA as for walk_page_vma().
* @pre_vma: if set, called before starting walk on a non-null vma.
* @post_vma: if set, called after a walk on a non-null vma, provided
* that @pre_vma and the vma walk succeeded.
@@ -99,6 +101,9 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
pgd_t *pgd,
void *private);
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private);
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private);
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 576093de8b1c..1159b25b0542 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -456,9 +456,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
unsigned long address, pte_t *ptep,
int full)
{
- pte_t pte;
- pte = ptep_get_and_clear(mm, address, ptep);
- return pte;
+ return ptep_get_and_clear(mm, address, ptep);
}
#endif
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bd3504d11b15..a50d18bb86aa 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -13,6 +13,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
+#include <linux/hugetlb.h>
/*
* The anon_vma heads a list of private "related" vmas, to scan if
@@ -408,6 +409,9 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
pte_unmap(pvmw->pte);
if (pvmw->ptl)
spin_unlock(pvmw->ptl);
+ /* This needs to be after unlock of the spinlock */
+ if (is_vm_hugetlb_page(pvmw->vma))
+ hugetlb_vma_unlock_read(pvmw->vma);
}
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b61e2007d156..0ceed49516ad 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -381,7 +381,6 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
-void lru_cache_add(struct page *);
void mark_page_accessed(struct page *);
void folio_mark_accessed(struct folio *);
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 27ade4f22abb..09b22b169a71 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -335,7 +335,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address);
#ifdef CONFIG_HUGETLB_PAGE
-extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
+extern void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
#endif /* CONFIG_HUGETLB_PAGE */
#else /* CONFIG_MIGRATION */
@@ -364,7 +365,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { }
#ifdef CONFIG_HUGETLB_PAGE
-static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
+static inline void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
#endif /* CONFIG_HUGETLB_PAGE */
static inline int is_writable_migration_entry(swp_entry_t entry)
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 760455dfa860..35d759d3b010 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -203,5 +203,43 @@ TRACE_EVENT(mm_khugepaged_scan_file,
__print_symbolic(__entry->result, SCAN_STATUS))
);
+TRACE_EVENT(mm_khugepaged_collapse_file,
+ TP_PROTO(struct mm_struct *mm, struct page *hpage, pgoff_t index,
+ bool is_shmem, unsigned long addr, struct file *file,
+ int nr, int result),
+ TP_ARGS(mm, hpage, index, is_shmem, addr, file, nr, result),
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(unsigned long, hpfn)
+ __field(pgoff_t, index)
+ __field(bool, is_shmem)
+ __field(unsigned long, addr)
+ __string(filename, file->f_path.dentry->d_iname)
+ __field(int, nr)
+ __field(int, result)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->hpfn = hpage ? page_to_pfn(hpage) : -1;
+ __entry->index = index;
+ __entry->is_shmem = is_shmem;
+ __entry->addr = addr;
+ __assign_str(filename, file->f_path.dentry->d_iname);
+ __entry->nr = nr;
+ __entry->result = result;
+ ),
+
+ TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, is_shmem=%d, addr=%ld, filename=%s, nr=%d, result=%s",
+ __entry->mm,
+ __entry->hpfn,
+ __entry->index,
+ __entry->is_shmem,
+ __entry->addr,
+ __get_str(filename),
+ __entry->nr,
+ __print_symbolic(__entry->result, SCAN_STATUS))
+);
+
#endif /* __HUGE_MEMORY_H */
#include <trace/define_trace.h>
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 4f35d1bced6a..8cf70f068d92 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -17,4 +17,5 @@ KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index ea925731fa40..0db4093d17b8 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -7,7 +7,6 @@ obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
KCSAN_SANITIZE_lockdep.o := n
-KMSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/relay.c b/kernel/relay.c
index d7edc934c56d..ef12532168d9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -148,13 +148,13 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan)
{
struct rchan_buf *buf;
- if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *))
+ if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t))
return NULL;
buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
- buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *),
+ buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t),
GFP_KERNEL);
if (!buf->padding)
goto free_buf;
@@ -507,7 +507,7 @@ struct rchan *relay_open(const char *base_filename,
chan->private_data = private_data;
if (base_filename) {
chan->has_base_filename = 1;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
}
chan->cb = cb;
kref_init(&chan->kref);
@@ -578,7 +578,7 @@ int relay_late_setup_files(struct rchan *chan,
if (!chan || !base_filename)
return -EINVAL;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
mutex_lock(&relay_channels_mutex);
/* Is chan already set up? */
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 3fe1491d2bf9..fe3947b80069 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -6062,7 +6062,7 @@ void *mas_find_rev(struct ma_state *mas, unsigned long min)
if (mas->index < min)
return NULL;
- /* Retries on dead nodes handled by mas_next_entry */
+ /* Retries on dead nodes handled by mas_prev_entry */
return mas_prev_entry(mas, min);
}
EXPORT_SYMBOL_GPL(mas_find_rev);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index c76ee665355a..bf04fec87f35 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -72,7 +72,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
*/
endbyte = (u64)offset + (u64)len;
if (!len || endbyte < len)
- endbyte = -1;
+ endbyte = LLONG_MAX;
else
endbyte--; /* inclusive */
diff --git a/mm/filemap.c b/mm/filemap.c
index 65eee6ec1066..c4d4ace9cc70 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -506,9 +506,6 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
struct pagevec pvec;
int nr_pages;
- if (end_byte < start_byte)
- return;
-
pagevec_init(&pvec);
while (index <= end) {
unsigned i;
@@ -670,6 +667,9 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
int err = 0, err2;
+ if (lend < lstart)
+ return 0;
+
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
@@ -770,6 +770,9 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
int err = 0, err2;
struct address_space *mapping = file->f_mapping;
+ if (lend < lstart)
+ return 0;
+
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
@@ -785,56 +788,54 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
EXPORT_SYMBOL(file_write_and_wait_range);
/**
- * replace_page_cache_page - replace a pagecache page with a new one
- * @old: page to be replaced
- * @new: page to replace with
- *
- * This function replaces a page in the pagecache with a new one. On
- * success it acquires the pagecache reference for the new page and
- * drops it for the old page. Both the old and new pages must be
- * locked. This function does not add the new page to the LRU, the
+ * replace_page_cache_folio - replace a pagecache folio with a new one
+ * @old: folio to be replaced
+ * @new: folio to replace with
+ *
+ * This function replaces a folio in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new folio and
+ * drops it for the old folio. Both the old and new folios must be
+ * locked. This function does not add the new folio to the LRU, the
* caller must do that.
*
* The remove + add is atomic. This function cannot fail.
*/
-void replace_page_cache_page(struct page *old, struct page *new)
+void replace_page_cache_folio(struct folio *old, struct folio *new)
{
- struct folio *fold = page_folio(old);
- struct folio *fnew = page_folio(new);
struct address_space *mapping = old->mapping;
void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
pgoff_t offset = old->index;
XA_STATE(xas, &mapping->i_pages, offset);
- VM_BUG_ON_PAGE(!PageLocked(old), old);
- VM_BUG_ON_PAGE(!PageLocked(new), new);
- VM_BUG_ON_PAGE(new->mapping, new);
+ VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+ VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+ VM_BUG_ON_FOLIO(new->mapping, new);
- get_page(new);
+ folio_get(new);
new->mapping = mapping;
new->index = offset;
- mem_cgroup_migrate(fold, fnew);
+ mem_cgroup_migrate(old, new);
xas_lock_irq(&xas);
xas_store(&xas, new);
old->mapping = NULL;
/* hugetlb pages do not participate in page cache accounting. */
- if (!PageHuge(old))
- __dec_lruvec_page_state(old, NR_FILE_PAGES);
- if (!PageHuge(new))
- __inc_lruvec_page_state(new, NR_FILE_PAGES);
- if (PageSwapBacked(old))
- __dec_lruvec_page_state(old, NR_SHMEM);
- if (PageSwapBacked(new))
- __inc_lruvec_page_state(new, NR_SHMEM);
+ if (!folio_test_hugetlb(old))
+ __lruvec_stat_sub_folio(old, NR_FILE_PAGES);
+ if (!folio_test_hugetlb(new))
+ __lruvec_stat_add_folio(new, NR_FILE_PAGES);
+ if (folio_test_swapbacked(old))
+ __lruvec_stat_sub_folio(old, NR_SHMEM);
+ if (folio_test_swapbacked(new))
+ __lruvec_stat_add_folio(new, NR_SHMEM);
xas_unlock_irq(&xas);
if (free_folio)
- free_folio(fold);
- folio_put(fold);
+ free_folio(old);
+ folio_put(old);
}
-EXPORT_SYMBOL_GPL(replace_page_cache_page);
+EXPORT_SYMBOL_GPL(replace_page_cache_folio);
noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 86933fa8f3e1..69ed25790c68 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -76,12 +76,6 @@ bool redirty_page_for_writepage(struct writeback_control *wbc,
}
EXPORT_SYMBOL(redirty_page_for_writepage);
-void lru_cache_add(struct page *page)
-{
- folio_add_lru(page_folio(page));
-}
-EXPORT_SYMBOL(lru_cache_add);
-
void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
{
diff --git a/mm/gup.c b/mm/gup.c
index 72e58deda7a4..c2c2c6dbcfcf 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -545,30 +545,13 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
-retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
pte = *ptep;
- if (!pte_present(pte)) {
- swp_entry_t entry;
- /*
- * KSM's break_ksm() relies upon recognizing a ksm page
- * even while it is being migrated, so for that case we
- * need migration_entry_wait().
- */
- if (likely(!(flags & FOLL_MIGRATION)))
- goto no_page;
- if (pte_none(pte))
- goto no_page;
- entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry))
- goto no_page;
- pte_unmap_unlock(ptep, ptl);
- migration_entry_wait(mm, pmd, address);
- goto retry;
- }
+ if (!pte_present(pte))
+ goto no_page;
if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
goto no_page;
@@ -678,28 +661,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
pmdval = READ_ONCE(*pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
-retry:
- if (!pmd_present(pmdval)) {
- /*
- * Should never reach here, if thp migration is not supported;
- * Otherwise, it must be a thp migration entry.
- */
- VM_BUG_ON(!thp_migration_supported() ||
- !is_pmd_migration_entry(pmdval));
-
- if (likely(!(flags & FOLL_MIGRATION)))
- return no_page_table(vma, flags);
-
- pmd_migration_entry_wait(mm, pmd);
- pmdval = READ_ONCE(*pmd);
- /*
- * MADV_DONTNEED may convert the pmd to null because
- * mmap_lock is held in read mode
- */
- if (pmd_none(pmdval))
- return no_page_table(vma, flags);
- goto retry;
- }
+ if (!pmd_present(pmdval))
+ return no_page_table(vma, flags);
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
@@ -713,18 +676,10 @@ retry:
if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
return no_page_table(vma, flags);
-retry_locked:
ptl = pmd_lock(mm, pmd);
- if (unlikely(pmd_none(*pmd))) {
- spin_unlock(ptl);
- return no_page_table(vma, flags);
- }
if (unlikely(!pmd_present(*pmd))) {
spin_unlock(ptl);
- if (likely(!(flags & FOLL_MIGRATION)))
- return no_page_table(vma, flags);
- pmd_migration_entry_wait(mm, pmd);
- goto retry_locked;
+ return no_page_table(vma, flags);
}
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7651420e2d8f..abe6cfd92ffa 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1381,7 +1381,7 @@ reuse:
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
- return VM_FAULT_WRITE;
+ return 0;
}
unlock_fallback:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index be0b2953de42..084efa3cf5e8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -54,13 +54,13 @@ struct hstate hstates[HUGE_MAX_HSTATE];
#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
-static bool hugetlb_cma_page(struct page *page, unsigned int order)
+static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
{
- return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
+ return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page,
1 << order);
}
#else
-static bool hugetlb_cma_page(struct page *page, unsigned int order)
+static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
{
return false;
}
@@ -1127,17 +1127,17 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
return false;
}
-static void enqueue_huge_page(struct hstate *h, struct page *page)
+static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
lockdep_assert_held(&hugetlb_lock);
- VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
- list_move(&page->lru, &h->hugepage_freelists[nid]);
+ list_move(&folio->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
- SetHPageFreed(page);
+ folio_set_hugetlb_freed(folio);
}
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -1325,77 +1325,76 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
nr_nodes--)
/* used to demote non-gigantic_huge pages as well */
-static void __destroy_compound_gigantic_page(struct page *page,
+static void __destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order, bool demote)
{
int i;
int nr_pages = 1 << order;
struct page *p;
- atomic_set(compound_mapcount_ptr(page), 0);
- atomic_set(subpages_mapcount_ptr(page), 0);
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(folio_mapcount_ptr(folio), 0);
+ atomic_set(folio_subpages_mapcount_ptr(folio), 0);
+ atomic_set(folio_pincount_ptr(folio), 0);
for (i = 1; i < nr_pages; i++) {
- p = nth_page(page, i);
+ p = folio_page(folio, i);
p->mapping = NULL;
clear_compound_head(p);
if (!demote)
set_page_refcounted(p);
}
- set_compound_order(page, 0);
-#ifdef CONFIG_64BIT
- page[1].compound_nr = 0;
-#endif
- __ClearPageHead(page);
+ folio_set_compound_order(folio, 0);
+ __folio_clear_head(folio);
}
-static void destroy_compound_hugetlb_page_for_demote(struct page *page,
+static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio,
unsigned int order)
{
- __destroy_compound_gigantic_page(page, order, true);
+ __destroy_compound_gigantic_folio(folio, order, true);
}
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static void destroy_compound_gigantic_page(struct page *page,
+static void destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order)
{
- __destroy_compound_gigantic_page(page, order, false);
+ __destroy_compound_gigantic_folio(folio, order, false);
}
-static void free_gigantic_page(struct page *page, unsigned int order)
+static void free_gigantic_folio(struct folio *folio, unsigned int order)
{
/*
* If the page isn't allocated using the cma allocator,
* cma_release() returns false.
*/
#ifdef CONFIG_CMA
- if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+ int nid = folio_nid(folio);
+
+ if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order))
return;
#endif
- free_contig_range(page_to_pfn(page), 1 << order);
+ free_contig_range(folio_pfn(folio), 1 << order);
}
#ifdef CONFIG_CONTIG_ALLOC
-static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
+ struct page *page;
unsigned long nr_pages = pages_per_huge_page(h);
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
#ifdef CONFIG_CMA
{
- struct page *page;
int node;
if (hugetlb_cma[nid]) {
page = cma_alloc(hugetlb_cma[nid], nr_pages,
huge_page_order(h), true);
if (page)
- return page;
+ return page_folio(page);
}
if (!(gfp_mask & __GFP_THISNODE)) {
@@ -1406,17 +1405,18 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
page = cma_alloc(hugetlb_cma[node], nr_pages,
huge_page_order(h), true);
if (page)
- return page;
+ return page_folio(page);
}
}
}
#endif
- return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
+ page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
+ return page ? page_folio(page) : NULL;
}
#else /* !CONFIG_CONTIG_ALLOC */
-static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
@@ -1424,30 +1424,30 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
}
-static inline void free_gigantic_page(struct page *page, unsigned int order) { }
-static inline void destroy_compound_gigantic_page(struct page *page,
+static inline void free_gigantic_folio(struct folio *folio,
+ unsigned int order) { }
+static inline void destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order) { }
#endif
/*
- * Remove hugetlb page from lists, and update dtor so that page appears
+ * Remove hugetlb folio from lists, and update dtor so that the folio appears
* as just a compound page.
*
- * A reference is held on the page, except in the case of demote.
+ * A reference is held on the folio, except in the case of demote.
*
* Must be called with hugetlb lock held.
*/
-static void __remove_hugetlb_page(struct hstate *h, struct page *page,
+static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
bool adjust_surplus,
bool demote)
{
- int nid = page_to_nid(page);
- struct folio *folio = page_folio(page);
+ int nid = folio_nid(folio);
VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
@@ -1456,9 +1456,9 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page,
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
- list_del(&page->lru);
+ list_del(&folio->lru);
- if (HPageFreed(page)) {
+ if (folio_test_hugetlb_freed(folio)) {
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
}
@@ -1477,50 +1477,50 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page,
*
* For gigantic pages set the destructor to the null dtor. This
* destructor will never be called. Before freeing the gigantic
- * page destroy_compound_gigantic_page will turn the compound page
- * into a simple group of pages. After this the destructor does not
+ * page destroy_compound_gigantic_folio will turn the folio into a
+ * simple group of pages. After this the destructor does not
* apply.
*
* This handles the case where more than one ref is held when and
- * after update_and_free_page is called.
+ * after update_and_free_hugetlb_folio is called.
*
* In the case of demote we do not ref count the page as it will soon
* be turned into a page of smaller size.
*/
if (!demote)
- set_page_refcounted(page);
+ folio_ref_unfreeze(folio, 1);
if (hstate_is_gigantic(h))
- set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+ folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
else
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
+ folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
}
-static void remove_hugetlb_page(struct hstate *h, struct page *page,
+static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
bool adjust_surplus)
{
- __remove_hugetlb_page(h, page, adjust_surplus, false);
+ __remove_hugetlb_folio(h, folio, adjust_surplus, false);
}
-static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
+static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio,
bool adjust_surplus)
{
- __remove_hugetlb_page(h, page, adjust_surplus, true);
+ __remove_hugetlb_folio(h, folio, adjust_surplus, true);
}
-static void add_hugetlb_page(struct hstate *h, struct page *page,
+static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
bool adjust_surplus)
{
int zeroed;
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
- VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
lockdep_assert_held(&hugetlb_lock);
- INIT_LIST_HEAD(&page->lru);
+ INIT_LIST_HEAD(&folio->lru);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
@@ -1529,21 +1529,21 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
h->surplus_huge_pages_node[nid]++;
}
- set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- set_page_private(page, 0);
+ folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
+ folio_change_private(folio, NULL);
/*
- * We have to set HPageVmemmapOptimized again as above
- * set_page_private(page, 0) cleared it.
+ * We have to set hugetlb_vmemmap_optimized again as above
+ * folio_change_private(folio, NULL) cleared it.
*/
- SetHPageVmemmapOptimized(page);
+ folio_set_hugetlb_vmemmap_optimized(folio);
/*
- * This page is about to be managed by the hugetlb allocator and
+ * This folio is about to be managed by the hugetlb allocator and
* should have no users. Drop our reference, and check for others
* just in case.
*/
- zeroed = put_page_testzero(page);
- if (!zeroed)
+ zeroed = folio_put_testzero(folio);
+ if (unlikely(!zeroed))
/*
* It is VERY unlikely soneone else has taken a ref on
* the page. In this case, we simply return as the
@@ -1552,13 +1552,14 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
*/
return;
- arch_clear_hugepage_flags(page);
- enqueue_huge_page(h, page);
+ arch_clear_hugepage_flags(&folio->page);
+ enqueue_hugetlb_folio(h, folio);
}
static void __update_and_free_page(struct hstate *h, struct page *page)
{
int i;
+ struct folio *folio = page_folio(page);
struct page *subpage;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
@@ -1568,7 +1569,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
* If we don't know which subpages are hwpoisoned, we can't free
* the hugepage, so it's leaked intentionally.
*/
- if (HPageRawHwpUnreliable(page))
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return;
if (hugetlb_vmemmap_restore(h, page)) {
@@ -1578,7 +1579,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
* page and put the page back on the hugetlb free list and treat
* as a surplus page.
*/
- add_hugetlb_page(h, page, true);
+ add_hugetlb_folio(h, folio, true);
spin_unlock_irq(&hugetlb_lock);
return;
}
@@ -1587,11 +1588,11 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
* Move PageHWPoison flag from head page to the raw error pages,
* which makes any healthy subpages reusable.
*/
- if (unlikely(PageHWPoison(page)))
- hugetlb_clear_page_hwpoison(page);
+ if (unlikely(folio_test_hwpoison(folio)))
+ hugetlb_clear_page_hwpoison(&folio->page);
for (i = 0; i < pages_per_huge_page(h); i++) {
- subpage = nth_page(page, i);
+ subpage = folio_page(folio, i);
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
@@ -1600,19 +1601,19 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
/*
* Non-gigantic pages demoted from CMA allocated gigantic pages
- * need to be given back to CMA in free_gigantic_page.
+ * need to be given back to CMA in free_gigantic_folio.
*/
if (hstate_is_gigantic(h) ||
- hugetlb_cma_page(page, huge_page_order(h))) {
- destroy_compound_gigantic_page(page, huge_page_order(h));
- free_gigantic_page(page, huge_page_order(h));
+ hugetlb_cma_folio(folio, huge_page_order(h))) {
+ destroy_compound_gigantic_folio(folio, huge_page_order(h));
+ free_gigantic_folio(folio, huge_page_order(h));
} else {
__free_pages(page, huge_page_order(h));
}
}
/*
- * As update_and_free_page() can be called under any context, so we cannot
+ * As update_and_free_hugetlb_folio() can be called under any context, so we cannot
* use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
* actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
* the vmemmap pages.
@@ -1641,8 +1642,9 @@ static void free_hpage_workfn(struct work_struct *work)
/*
* The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
* is going to trigger because a previous call to
- * remove_hugetlb_page() will set_compound_page_dtor(page,
- * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
+ * remove_hugetlb_folio() will call folio_set_compound_dtor
+ * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate()
+ * directly.
*/
h = size_to_hstate(page_size(page));
@@ -1659,11 +1661,11 @@ static inline void flush_free_hpage_work(struct hstate *h)
flush_work(&free_hpage_work);
}
-static void update_and_free_page(struct hstate *h, struct page *page,
+static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
bool atomic)
{
- if (!HPageVmemmapOptimized(page) || !atomic) {
- __update_and_free_page(h, page);
+ if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
+ __update_and_free_page(h, &folio->page);
return;
}
@@ -1674,16 +1676,18 @@ static void update_and_free_page(struct hstate *h, struct page *page,
* empty. Otherwise, schedule_work() had been called but the workfn
* hasn't retrieved the list yet.
*/
- if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
+ if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
schedule_work(&free_hpage_work);
}
static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
{
struct page *page, *t_page;
+ struct folio *folio;
list_for_each_entry_safe(page, t_page, list, lru) {
- update_and_free_page(h, page, false);
+ folio = page_folio(page);
+ update_and_free_hugetlb_folio(h, folio, false);
cond_resched();
}
}
@@ -1751,17 +1755,17 @@ void free_huge_page(struct page *page)
h->resv_huge_pages++;
if (folio_test_hugetlb_temporary(folio)) {
- remove_hugetlb_page(h, page, false);
+ remove_hugetlb_folio(h, folio, false);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page, true);
+ update_and_free_hugetlb_folio(h, folio, true);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
- remove_hugetlb_page(h, page, true);
+ remove_hugetlb_folio(h, folio, true);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page, true);
+ update_and_free_hugetlb_folio(h, folio, true);
} else {
arch_clear_hugepage_flags(page);
- enqueue_huge_page(h, page);
+ enqueue_hugetlb_folio(h, folio);
spin_unlock_irqrestore(&hugetlb_lock, flags);
}
}
@@ -1780,35 +1784,33 @@ static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
hugetlb_vmemmap_optimize(h, &folio->page);
INIT_LIST_HEAD(&folio->lru);
- folio->_folio_dtor = HUGETLB_PAGE_DTOR;
+ folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
hugetlb_set_folio_subpool(folio, NULL);
set_hugetlb_cgroup(folio, NULL);
set_hugetlb_cgroup_rsvd(folio, NULL);
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
{
- struct folio *folio = page_folio(page);
-
__prep_new_hugetlb_folio(h, folio);
spin_lock_irq(&hugetlb_lock);
__prep_account_new_huge_page(h, nid);
spin_unlock_irq(&hugetlb_lock);
}
-static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
- bool demote)
+static bool __prep_compound_gigantic_folio(struct folio *folio,
+ unsigned int order, bool demote)
{
int i, j;
int nr_pages = 1 << order;
struct page *p;
- /* we rely on prep_new_huge_page to set the destructor */
- set_compound_order(page, order);
- __ClearPageReserved(page);
- __SetPageHead(page);
+ /* we rely on prep_new_hugetlb_folio to set the destructor */
+ folio_set_compound_order(folio, order);
+ __folio_clear_reserved(folio);
+ __folio_set_head(folio);
for (i = 0; i < nr_pages; i++) {
- p = nth_page(page, i);
+ p = folio_page(folio, i);
/*
* For gigantic hugepages allocated through bootmem at
@@ -1850,43 +1852,41 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
VM_BUG_ON_PAGE(page_count(p), p);
}
if (i != 0)
- set_compound_head(p, page);
+ set_compound_head(p, &folio->page);
}
- atomic_set(compound_mapcount_ptr(page), -1);
- atomic_set(subpages_mapcount_ptr(page), 0);
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(folio_mapcount_ptr(folio), -1);
+ atomic_set(folio_subpages_mapcount_ptr(folio), 0);
+ atomic_set(folio_pincount_ptr(folio), 0);
return true;
out_error:
/* undo page modifications made above */
for (j = 0; j < i; j++) {
- p = nth_page(page, j);
+ p = folio_page(folio, j);
if (j != 0)
clear_compound_head(p);
set_page_refcounted(p);
}
/* need to clear PG_reserved on remaining tail pages */
for (; j < nr_pages; j++) {
- p = nth_page(page, j);
+ p = folio_page(folio, j);
__ClearPageReserved(p);
}
- set_compound_order(page, 0);
-#ifdef CONFIG_64BIT
- page[1].compound_nr = 0;
-#endif
- __ClearPageHead(page);
+ folio_set_compound_order(folio, 0);
+ __folio_clear_head(folio);
return false;
}
-static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool prep_compound_gigantic_folio(struct folio *folio,
+ unsigned int order)
{
- return __prep_compound_gigantic_page(page, order, false);
+ return __prep_compound_gigantic_folio(folio, order, false);
}
-static bool prep_compound_gigantic_page_for_demote(struct page *page,
+static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
unsigned int order)
{
- return __prep_compound_gigantic_page(page, order, true);
+ return __prep_compound_gigantic_folio(folio, order, true);
}
/*
@@ -1951,7 +1951,7 @@ pgoff_t hugetlb_basepage_index(struct page *page)
return (index << compound_order(page_head)) + compound_idx;
}
-static struct page *alloc_buddy_huge_page(struct hstate *h,
+static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
@@ -1989,11 +1989,6 @@ retry:
page = NULL;
}
- if (page)
- __count_vm_event(HTLB_BUDDY_PGALLOC);
- else
- __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
-
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
* indicates an overall state change. Clear bit so that we resume
@@ -2010,7 +2005,13 @@ retry:
if (node_alloc_noretry && !page && alloc_try_hard)
node_set(nid, *node_alloc_noretry);
- return page;
+ if (!page) {
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ return NULL;
+ }
+
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
+ return page_folio(page);
}
/*
@@ -2020,29 +2021,28 @@ retry:
* Note that returned page is 'frozen': ref count of head page and all tail
* pages is zero.
*/
-static struct page *alloc_fresh_huge_page(struct hstate *h,
+static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
- struct page *page;
+ struct folio *folio;
bool retry = false;
retry:
if (hstate_is_gigantic(h))
- page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+ folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
else
- page = alloc_buddy_huge_page(h, gfp_mask,
+ folio = alloc_buddy_hugetlb_folio(h, gfp_mask,
nid, nmask, node_alloc_noretry);
- if (!page)
+ if (!folio)
return NULL;
-
if (hstate_is_gigantic(h)) {
- if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
+ if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
/*
* Rare failure to convert pages to compound page.
* Free pages and try again - ONCE!
*/
- free_gigantic_page(page, huge_page_order(h));
+ free_gigantic_folio(folio, huge_page_order(h));
if (!retry) {
retry = true;
goto retry;
@@ -2050,9 +2050,9 @@ retry:
return NULL;
}
}
- prep_new_huge_page(h, page, page_to_nid(page));
+ prep_new_hugetlb_folio(h, folio, folio_nid(folio));
- return page;
+ return folio;
}
/*
@@ -2062,23 +2062,20 @@ retry:
static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
nodemask_t *node_alloc_noretry)
{
- struct page *page;
+ struct folio *folio;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
- node_alloc_noretry);
- if (page)
- break;
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
+ nodes_allowed, node_alloc_noretry);
+ if (folio) {
+ free_huge_page(&folio->page); /* free it into the hugepage allocator */
+ return 1;
+ }
}
- if (!page)
- return 0;
-
- free_huge_page(page); /* free it into the hugepage allocator */
-
- return 1;
+ return 0;
}
/*
@@ -2094,6 +2091,7 @@ static struct page *remove_pool_huge_page(struct hstate *h,
{
int nr_nodes, node;
struct page *page = NULL;
+ struct folio *folio;
lockdep_assert_held(&hugetlb_lock);
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
@@ -2105,7 +2103,8 @@ static struct page *remove_pool_huge_page(struct hstate *h,
!list_empty(&h->hugepage_freelists[node])) {
page = list_entry(h->hugepage_freelists[node].next,
struct page, lru);
- remove_hugetlb_page(h, page, acct_surplus);
+ folio = page_folio(page);
+ remove_hugetlb_folio(h, folio, acct_surplus);
break;
}
}
@@ -2130,21 +2129,21 @@ static struct page *remove_pool_huge_page(struct hstate *h,
int dissolve_free_huge_page(struct page *page)
{
int rc = -EBUSY;
+ struct folio *folio = page_folio(page);
retry:
/* Not to disrupt normal path by vainly holding hugetlb_lock */
- if (!PageHuge(page))
+ if (!folio_test_hugetlb(folio))
return 0;
spin_lock_irq(&hugetlb_lock);
- if (!PageHuge(page)) {
+ if (!folio_test_hugetlb(folio)) {
rc = 0;
goto out;
}
- if (!page_count(page)) {
- struct page *head = compound_head(page);
- struct hstate *h = page_hstate(head);
+ if (!folio_ref_count(folio)) {
+ struct hstate *h = folio_hstate(folio);
if (!available_huge_pages(h))
goto out;
@@ -2152,7 +2151,7 @@ retry:
* We should make sure that the page is already on the free list
* when it is dissolved.
*/
- if (unlikely(!HPageFreed(head))) {
+ if (unlikely(!folio_test_hugetlb_freed(folio))) {
spin_unlock_irq(&hugetlb_lock);
cond_resched();
@@ -2167,24 +2166,24 @@ retry:
goto retry;
}
- remove_hugetlb_page(h, head, false);
+ remove_hugetlb_folio(h, folio, false);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
/*
- * Normally update_and_free_page will allocate required vmemmmap
- * before freeing the page. update_and_free_page will fail to
+ * Normally update_and_free_hugtlb_folio will allocate required vmemmmap
+ * before freeing the page. update_and_free_hugtlb_folio will fail to
* free the page if it can not allocate required vmemmap. We
* need to adjust max_huge_pages if the page is not freed.
* Attempt to allocate vmemmmap here so that we can take
* appropriate action on failure.
*/
- rc = hugetlb_vmemmap_restore(h, head);
+ rc = hugetlb_vmemmap_restore(h, &folio->page);
if (!rc) {
- update_and_free_page(h, head, false);
+ update_and_free_hugetlb_folio(h, folio, false);
} else {
spin_lock_irq(&hugetlb_lock);
- add_hugetlb_page(h, head, false);
+ add_hugetlb_folio(h, folio, false);
h->max_huge_pages++;
spin_unlock_irq(&hugetlb_lock);
}
@@ -2235,7 +2234,7 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
if (hstate_is_gigantic(h))
return NULL;
@@ -2245,8 +2244,8 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
- if (!page)
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (!folio)
return NULL;
spin_lock_irq(&hugetlb_lock);
@@ -2258,43 +2257,42 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
* codeflow
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
- SetHPageTemporary(page);
+ folio_set_hugetlb_temporary(folio);
spin_unlock_irq(&hugetlb_lock);
- free_huge_page(page);
+ free_huge_page(&folio->page);
return NULL;
}
h->surplus_huge_pages++;
- h->surplus_huge_pages_node[page_to_nid(page)]++;
+ h->surplus_huge_pages_node[folio_nid(folio)]++;
out_unlock:
spin_unlock_irq(&hugetlb_lock);
- return page;
+ return &folio->page;
}
static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
- struct page *page;
+ struct folio *folio;
if (hstate_is_gigantic(h))
return NULL;
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
- if (!page)
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (!folio)
return NULL;
/* fresh huge pages are frozen */
- set_page_refcounted(page);
-
+ folio_ref_unfreeze(folio, 1);
/*
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
*/
- SetHPageTemporary(page);
+ folio_set_hugetlb_temporary(folio);
- return page;
+ return &folio->page;
}
/*
@@ -2436,7 +2434,7 @@ retry:
if ((--needed) < 0)
break;
/* Add the page to the hugetlb allocator */
- enqueue_huge_page(h, page);
+ enqueue_hugetlb_folio(h, page_folio(page));
}
free:
spin_unlock_irq(&hugetlb_lock);
@@ -2743,19 +2741,18 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
}
/*
- * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
+ * the old one
* @h: struct hstate old page belongs to
- * @old_page: Old page to dissolve
+ * @old_folio: Old folio to dissolve
* @list: List to isolate the page in case we need to
* Returns 0 on success, otherwise negated error.
*/
-static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
- struct list_head *list)
+static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
+ struct folio *old_folio, struct list_head *list)
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
- struct folio *old_folio = page_folio(old_page);
int nid = folio_nid(old_folio);
- struct page *new_page;
struct folio *new_folio;
int ret = 0;
@@ -2766,26 +2763,25 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
* the pool. This simplifies and let us do most of the processing
* under the lock.
*/
- new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
- if (!new_page)
+ new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
+ if (!new_folio)
return -ENOMEM;
- new_folio = page_folio(new_page);
__prep_new_hugetlb_folio(h, new_folio);
retry:
spin_lock_irq(&hugetlb_lock);
if (!folio_test_hugetlb(old_folio)) {
/*
- * Freed from under us. Drop new_page too.
+ * Freed from under us. Drop new_folio too.
*/
goto free_new;
} else if (folio_ref_count(old_folio)) {
/*
- * Someone has grabbed the page, try to isolate it here.
+ * Someone has grabbed the folio, try to isolate it here.
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- ret = isolate_hugetlb(old_page, list);
+ ret = isolate_hugetlb(&old_folio->page, list);
spin_lock_irq(&hugetlb_lock);
goto free_new;
} else if (!folio_test_hugetlb_freed(old_folio)) {
@@ -2802,23 +2798,23 @@ retry:
* Ok, old_page is still a genuine free hugepage. Remove it from
* the freelist and decrease the counters. These will be
* incremented again when calling __prep_account_new_huge_page()
- * and enqueue_huge_page() for new_page. The counters will remain
- * stable since this happens under the lock.
+ * and enqueue_hugetlb_folio() for new_folio. The counters will
+ * remain stable since this happens under the lock.
*/
- remove_hugetlb_page(h, old_page, false);
+ remove_hugetlb_folio(h, old_folio, false);
/*
* Ref count on new page is already zero as it was dropped
* earlier. It can be directly added to the pool free list.
*/
__prep_account_new_huge_page(h, nid);
- enqueue_huge_page(h, new_page);
+ enqueue_hugetlb_folio(h, new_folio);
/*
* Pages have been replaced, we can safely free the old one.
*/
spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, old_page, false);
+ update_and_free_hugetlb_folio(h, old_folio, false);
}
return ret;
@@ -2827,7 +2823,7 @@ free_new:
spin_unlock_irq(&hugetlb_lock);
/* Page has a zero ref count, but needs a ref to be freed */
folio_ref_unfreeze(new_folio, 1);
- update_and_free_page(h, new_page, false);
+ update_and_free_hugetlb_folio(h, new_folio, false);
return ret;
}
@@ -2863,7 +2859,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list))
ret = 0;
else if (!folio_ref_count(folio))
- ret = alloc_and_dissolve_huge_page(h, &folio->page, list);
+ ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
return ret;
}
@@ -2938,7 +2934,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
* a reservation exists for the allocation.
*/
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
-
if (!page) {
spin_unlock_irq(&hugetlb_lock);
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
@@ -3049,17 +3044,18 @@ static void __init gather_bootmem_prealloc(void)
list_for_each_entry(m, &huge_boot_pages, list) {
struct page *page = virt_to_page(m);
+ struct folio *folio = page_folio(page);
struct hstate *h = m->hstate;
VM_BUG_ON(!hstate_is_gigantic(h));
- WARN_ON(page_count(page) != 1);
- if (prep_compound_gigantic_page(page, huge_page_order(h))) {
- WARN_ON(PageReserved(page));
- prep_new_huge_page(h, page, page_to_nid(page));
+ WARN_ON(folio_ref_count(folio) != 1);
+ if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
+ WARN_ON(folio_test_reserved(folio));
+ prep_new_hugetlb_folio(h, folio, folio_nid(folio));
free_huge_page(page); /* add to the hugepage allocator */
} else {
/* VERY unlikely inflated ref count on a tail page */
- free_gigantic_page(page, huge_page_order(h));
+ free_gigantic_folio(folio, huge_page_order(h));
}
/*
@@ -3081,14 +3077,14 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
if (!alloc_bootmem_huge_page(h, nid))
break;
} else {
- struct page *page;
+ struct folio *folio;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
- page = alloc_fresh_huge_page(h, gfp_mask, nid,
+ folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
&node_states[N_MEMORY], NULL);
- if (!page)
+ if (!folio)
break;
- free_huge_page(page); /* free it into the hugepage allocator */
+ free_huge_page(&folio->page); /* free it into the hugepage allocator */
}
cond_resched();
}
@@ -3233,7 +3229,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
goto out;
if (PageHighMem(page))
continue;
- remove_hugetlb_page(h, page, false);
+ remove_hugetlb_folio(h, page_folio(page), false);
list_add(&page->lru, &page_list);
}
}
@@ -3438,12 +3434,13 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
{
int i, nid = page_to_nid(page);
struct hstate *target_hstate;
+ struct folio *folio = page_folio(page);
struct page *subpage;
int rc = 0;
target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
- remove_hugetlb_page_for_demote(h, page, false);
+ remove_hugetlb_folio_for_demote(h, folio, false);
spin_unlock_irq(&hugetlb_lock);
rc = hugetlb_vmemmap_restore(h, page);
@@ -3451,15 +3448,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
/* Allocation of vmemmmap failed, we can not demote page */
spin_lock_irq(&hugetlb_lock);
set_page_refcounted(page);
- add_hugetlb_page(h, page, false);
+ add_hugetlb_folio(h, page_folio(page), false);
return rc;
}
/*
- * Use destroy_compound_hugetlb_page_for_demote for all huge page
+ * Use destroy_compound_hugetlb_folio_for_demote for all huge page
* sizes as it will not ref count pages.
*/
- destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
+ destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
/*
* Taking target hstate mutex synchronizes with set_max_huge_pages.
@@ -3473,13 +3470,14 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
for (i = 0; i < pages_per_huge_page(h);
i += pages_per_huge_page(target_hstate)) {
subpage = nth_page(page, i);
+ folio = page_folio(subpage);
if (hstate_is_gigantic(target_hstate))
- prep_compound_gigantic_page_for_demote(subpage,
+ prep_compound_gigantic_folio_for_demote(folio,
target_hstate->order);
else
prep_compound_page(subpage, target_hstate->order);
set_page_private(subpage, 0);
- prep_new_huge_page(target_hstate, subpage, nid);
+ prep_new_hugetlb_folio(target_hstate, folio, nid);
free_huge_page(subpage);
}
mutex_unlock(&target_hstate->resize_lock);
@@ -4816,7 +4814,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
} else {
/*
* For shared mappings the vma lock must be held before
- * calling huge_pte_offset in the src vma. Otherwise, the
+ * calling hugetlb_walk() in the src vma. Otherwise, the
* returned ptep could go away if part of a shared pmd and
* another thread calls huge_pmd_unshare.
*/
@@ -4826,7 +4824,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
last_addr_mask = hugetlb_mask_last_page(h);
for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
- src_pte = huge_pte_offset(src, addr, sz);
+ src_pte = hugetlb_walk(src_vma, addr, sz);
if (!src_pte) {
addr |= last_addr_mask;
continue;
@@ -5030,7 +5028,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
- src_pte = huge_pte_offset(mm, old_addr, sz);
+ src_pte = hugetlb_walk(vma, old_addr, sz);
if (!src_pte) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
@@ -5093,7 +5091,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
- ptep = huge_pte_offset(mm, address, sz);
+ ptep = hugetlb_walk(vma, address, sz);
if (!ptep) {
address |= last_addr_mask;
continue;
@@ -5406,7 +5404,7 @@ retry_avoidcopy:
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -5444,7 +5442,7 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
@@ -5826,22 +5824,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
- if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, ptep);
- return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- }
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
@@ -5857,7 +5839,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* until finished with ptep. This prevents huge_pmd_unshare from
* being called elsewhere and making the ptep no longer valid.
*
- * ptep could have already be assigned via huge_pte_offset. That
+ * ptep could have already be assigned via hugetlb_walk(). That
* is OK, as huge_pte_alloc will return the same value unless
* something has changed.
*/
@@ -5888,8 +5870,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
- if (!pte_present(entry))
+ if (!pte_present(entry)) {
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ /*
+ * Release fault lock first because the vma lock is
+ * needed to guard the huge_pte_lockptr() later in
+ * migration_entry_wait_huge(). The vma lock will
+ * be released there.
+ */
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ migration_entry_wait_huge(vma, ptep);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
+ }
/*
* If we are going to COW/unshare the mapping later, we examine the
@@ -6234,10 +6230,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
if (WARN_ON_ONCE(flags & FOLL_PIN))
return NULL;
-retry:
- pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+ hugetlb_vma_lock_read(vma);
+ pte = hugetlb_walk(vma, haddr, huge_page_size(h));
if (!pte)
- return NULL;
+ goto out_unlock;
ptl = huge_pte_lock(h, mm, pte);
entry = huge_ptep_get(pte);
@@ -6257,19 +6253,11 @@ retry:
page = NULL;
goto out;
}
- } else {
- if (is_hugetlb_entry_migration(entry)) {
- spin_unlock(ptl);
- __migration_entry_wait_huge(pte, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
}
out:
spin_unlock(ptl);
+out_unlock:
+ hugetlb_vma_unlock_read(vma);
return page;
}
@@ -6300,6 +6288,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
+ hugetlb_vma_lock_read(vma);
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
@@ -6307,8 +6296,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*
* Note that page table lock is not held when pte is null.
*/
- pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
- huge_page_size(h));
+ pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
+ huge_page_size(h));
if (pte)
ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
@@ -6324,6 +6313,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
break;
}
@@ -6345,6 +6335,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
+
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
else if (unshare)
@@ -6407,6 +6399,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
remainder -= pages_per_huge_page(h);
i += pages_per_huge_page(h);
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
continue;
}
@@ -6436,6 +6429,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
flags))) {
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
err = -ENOMEM;
break;
@@ -6447,6 +6441,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
i += refs;
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
}
*nr_pages = remainder;
/*
@@ -6493,7 +6488,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
spinlock_t *ptl;
- ptep = huge_pte_offset(mm, address, psize);
+ ptep = hugetlb_walk(vma, address, psize);
if (!ptep) {
address |= last_addr_mask;
continue;
@@ -6871,12 +6866,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
*end = ALIGN(*end, PUD_SIZE);
}
-static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
-{
- return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
- vma->vm_private_data;
-}
-
void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
if (__vma_shareable_flags_pmd(vma)) {
@@ -7042,8 +7031,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr,
- vma_mmu_pagesize(svma));
+ spte = hugetlb_walk(svma, saddr,
+ vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
break;
@@ -7348,7 +7337,6 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
int old_nid = folio_nid(old_folio);
int new_nid = folio_nid(new_folio);
-
folio_set_hugetlb_temporary(old_folio);
folio_clear_hugetlb_temporary(new_folio);
@@ -7402,7 +7390,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
- ptep = huge_pte_offset(mm, address, sz);
+ ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index a84491bc4867..ea8cf1310b1e 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -541,6 +541,18 @@ static inline bool kasan_arch_is_ready(void) { return true; }
#error kasan_arch_is_ready only works in KASAN generic outline mode!
#endif
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+void kasan_kunit_test_suite_start(void);
+void kasan_kunit_test_suite_end(void);
+
+#else /* CONFIG_KASAN_KUNIT_TEST */
+
+static inline void kasan_kunit_test_suite_start(void) { }
+static inline void kasan_kunit_test_suite_end(void) { }
+
+#endif /* CONFIG_KASAN_KUNIT_TEST */
+
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
bool kasan_save_enable_multi_shot(void);
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index b8ff39324ed7..74cd80c12b25 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -76,6 +76,9 @@ static int kasan_suite_init(struct kunit_suite *suite)
return -1;
}
+ /* Stop failing KUnit tests on KASAN reports. */
+ kasan_kunit_test_suite_start();
+
/*
* Temporarily enable multi-shot mode. Otherwise, KASAN would only
* report the first detected bug and panic the kernel if panic_on_warn
@@ -94,6 +97,7 @@ static int kasan_suite_init(struct kunit_suite *suite)
static void kasan_suite_exit(struct kunit_suite *suite)
{
+ kasan_kunit_test_suite_end();
kasan_restore_multi_shot(multishot);
for_each_kernel_tracepoint(unregister_tracepoints, NULL);
tracepoint_synchronize_unregister();
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 6522619a9762..1d02757e90a3 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -9,6 +9,7 @@
* Andrey Konovalov <andreyknvl@gmail.com>
*/
+#include <kunit/test.h>
#include <linux/bitops.h>
#include <linux/ftrace.h>
#include <linux/init.h>
@@ -112,10 +113,62 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
#endif
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+/*
+ * Whether the KASAN KUnit test suite is currently being executed.
+ * Updated in kasan_test.c.
+ */
+bool kasan_kunit_executing;
+
+void kasan_kunit_test_suite_start(void)
+{
+ WRITE_ONCE(kasan_kunit_executing, true);
+}
+EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_start);
+
+void kasan_kunit_test_suite_end(void)
+{
+ WRITE_ONCE(kasan_kunit_executing, false);
+}
+EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_end);
+
+static bool kasan_kunit_test_suite_executing(void)
+{
+ return READ_ONCE(kasan_kunit_executing);
+}
+
+#else /* CONFIG_KASAN_KUNIT_TEST */
+
+static inline bool kasan_kunit_test_suite_executing(void) { return false; }
+
+#endif /* CONFIG_KASAN_KUNIT_TEST */
+
+#if IS_ENABLED(CONFIG_KUNIT)
+
+static void fail_non_kasan_kunit_test(void)
+{
+ struct kunit *test;
+
+ if (kasan_kunit_test_suite_executing())
+ return;
+
+ test = current->kunit_test;
+ if (test)
+ kunit_set_failure(test);
+}
+
+#else /* CONFIG_KUNIT */
+
+static inline void fail_non_kasan_kunit_test(void) { }
+
+#endif /* CONFIG_KUNIT */
+
static DEFINE_SPINLOCK(report_lock);
static void start_report(unsigned long *flags, bool sync)
{
+ fail_non_kasan_kunit_test();
/* Respect the /proc/sys/kernel/traceoff_on_warning interface. */
disable_trace_on_warning();
/* Do not allow LOCKDEP mangling KASAN reports. */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6fc4d80d991a..5cb401aa2b9d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1744,12 +1744,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
{
struct address_space *mapping = file->f_mapping;
struct page *hpage;
- pgoff_t index, end = start + HPAGE_PMD_NR;
+ pgoff_t index = 0, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
- int nr;
+ int nr = 0;
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
@@ -2013,6 +2013,7 @@ xa_unlocked:
if (result == SCAN_SUCCEED) {
struct page *page, *tmp;
+ struct folio *folio;
/*
* Replacing old pages with new one has succeeded, now we
@@ -2040,11 +2041,13 @@ xa_unlocked:
index++;
}
- SetPageUptodate(hpage);
- page_ref_add(hpage, HPAGE_PMD_NR - 1);
+ folio = page_folio(hpage);
+ folio_mark_uptodate(folio);
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
+
if (is_shmem)
- set_page_dirty(hpage);
- lru_cache_add(hpage);
+ folio_mark_dirty(folio);
+ folio_add_lru(folio);
/*
* Remove pte page tables, so we can re-fault the page as huge.
@@ -2102,7 +2105,8 @@ out:
mem_cgroup_uncharge(page_folio(hpage));
put_page(hpage);
}
- /* TODO: tracepoints */
+
+ trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
return result;
}
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index 271f135f97a1..770fe02904f3 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -81,12 +81,16 @@ DECLARE_METADATA_PTR_GETTER(8);
* Handle a memory store performed by inline assembly. KMSAN conservatively
* attempts to unpoison the outputs of asm() directives to prevent false
* positives caused by missed stores.
+ *
+ * __msan_instrument_asm_store() may be called for inline assembly code when
+ * entering or leaving IRQ. We omit the check for kmsan_in_runtime() to ensure
+ * the memory written to in these cases is also marked as initialized.
*/
void __msan_instrument_asm_store(void *addr, uintptr_t size)
{
unsigned long ua_flags;
- if (!kmsan_enabled || kmsan_in_runtime())
+ if (!kmsan_enabled)
return;
ua_flags = user_access_save();
@@ -103,10 +107,8 @@ void __msan_instrument_asm_store(void *addr, uintptr_t size)
user_access_restore(ua_flags);
return;
}
- kmsan_enter_runtime();
/* Unpoisoning the memory on best effort. */
kmsan_internal_unpoison_memory(addr, size, /*checked*/ false);
- kmsan_leave_runtime();
user_access_restore(ua_flags);
}
EXPORT_SYMBOL(__msan_instrument_asm_store);
diff --git a/mm/ksm.c b/mm/ksm.c
index a71245241d22..dd02780c387f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -39,6 +39,7 @@
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/pagewalk.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -419,47 +420,74 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int ret;
+
+ if (pmd_leaf(*pmd) || !pmd_present(*pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (pte_present(*pte)) {
+ page = vm_normal_page(walk->vma, addr, *pte);
+ } else if (!pte_none(*pte)) {
+ swp_entry_t entry = pte_to_swp_entry(*pte);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (is_migration_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
+ }
+ ret = page && PageKsm(page);
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+}
+
+static const struct mm_walk_ops break_ksm_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+};
+
/*
- * We use break_ksm to break COW on a ksm page: it's a stripped down
- *
- * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
- * put_page(page);
+ * We use break_ksm to break COW on a ksm page by triggering unsharing,
+ * such that the ksm page will get replaced by an exclusive anonymous page.
*
- * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
* mmap of /dev/mem, where we would not want to touch it.
*
- * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
+ * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page;
vm_fault_t ret = 0;
do {
+ int ksm_page;
+
cond_resched();
- page = follow_page(vma, addr,
- FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
- if (IS_ERR_OR_NULL(page))
- break;
- if (PageKsm(page))
- ret = handle_mm_fault(vma, addr,
- FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
- NULL);
- else
- ret = VM_FAULT_WRITE;
- put_page(page);
- } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
+ ksm_page = walk_page_range_vma(vma, addr, addr + 1,
+ &break_ksm_ops, NULL);
+ if (WARN_ON_ONCE(ksm_page < 0))
+ return ksm_page;
+ if (!ksm_page)
+ return 0;
+ ret = handle_mm_fault(vma, addr,
+ FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+ NULL);
+ } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
/*
- * We must loop because handle_mm_fault() may back out if there's
- * any difficulty e.g. if pte accessed bit gets updated concurrently.
- *
- * VM_FAULT_WRITE is what we have been hoping for: it indicates that
- * COW has been broken, even if the vma does not permit VM_WRITE;
- * but note that a concurrent fault might break PageKsm for us.
+ * We must loop until we no longer find a KSM page because
+ * handle_mm_fault() may back out if there's any difficulty e.g. if
+ * pte accessed bit gets updated concurrently.
*
* VM_FAULT_SIGBUS could occur if we race with truncation of the
* backing file, which also invalidates anonymous pages: that's
diff --git a/mm/madvise.c b/mm/madvise.c
index 71155fec5df6..90217646c0c0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -40,6 +40,7 @@
struct madvise_walk_private {
struct mmu_gather *tlb;
bool pageout;
+ bool can_pageout_file;
};
/*
@@ -325,6 +326,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
struct madvise_walk_private *private = walk->private;
struct mmu_gather *tlb = private->tlb;
bool pageout = private->pageout;
+ bool pageout_anon_only = pageout && !private->can_pageout_file;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
pte_t *orig_pte, *pte, ptent;
@@ -361,6 +363,9 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (page_mapcount(page) != 1)
goto huge_unlock;
+ if (pageout_anon_only && !PageAnon(page))
+ goto huge_unlock;
+
if (next - addr != HPAGE_PMD_SIZE) {
int err;
@@ -429,6 +434,8 @@ regular_page:
if (PageTransCompound(page)) {
if (page_mapcount(page) != 1)
break;
+ if (pageout_anon_only && !PageAnon(page))
+ break;
get_page(page);
if (!trylock_page(page)) {
put_page(page);
@@ -456,6 +463,9 @@ regular_page:
if (!PageLRU(page) || page_mapcount(page) != 1)
continue;
+ if (pageout_anon_only && !PageAnon(page))
+ continue;
+
VM_BUG_ON_PAGE(PageTransCompound(page), page);
if (pte_young(ptent)) {
@@ -538,11 +548,13 @@ static long madvise_cold(struct vm_area_struct *vma,
static void madvise_pageout_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ unsigned long addr, unsigned long end,
+ bool can_pageout_file)
{
struct madvise_walk_private walk_private = {
.pageout = true,
.tlb = tlb,
+ .can_pageout_file = can_pageout_file,
};
tlb_start_vma(tlb, vma);
@@ -550,10 +562,8 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
tlb_end_vma(tlb, vma);
}
-static inline bool can_do_pageout(struct vm_area_struct *vma)
+static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
- if (vma_is_anonymous(vma))
- return true;
if (!vma->vm_file)
return false;
/*
@@ -573,17 +583,23 @@ static long madvise_pageout(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
+ bool can_pageout_file;
*prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
- if (!can_do_pageout(vma))
- return 0;
+ /*
+ * If the VMA belongs to a private file mapping, there can be private
+ * dirty pages which can be paged out if even this process is neither
+ * owner nor write capable of the file. Cache the file access check
+ * here and use it later during page walk.
+ */
+ can_pageout_file = can_do_file_pageout(vma);
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
- madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ madvise_pageout_page_range(&tlb, vma, start_addr, end_addr, can_pageout_file);
tlb_finish_mmu(&tlb);
return 0;
diff --git a/mm/memory.c b/mm/memory.c
index 815d2ff05c62..aad226daf41b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3213,7 +3213,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
delayacct_wpcopy_end();
- return (page_copied && !unshare) ? VM_FAULT_WRITE : 0;
+ return 0;
oom_free_new:
put_page(new_page);
oom:
@@ -3277,14 +3277,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
return finish_mkwrite_fault(vmf);
}
wp_page_reuse(vmf);
- return VM_FAULT_WRITE;
+ return 0;
}
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
- vm_fault_t ret = VM_FAULT_WRITE;
+ vm_fault_t ret = 0;
get_page(vmf->page);
@@ -3430,7 +3430,7 @@ reuse:
return 0;
}
wp_page_reuse(vmf);
- return VM_FAULT_WRITE;
+ return 0;
}
copy:
/*
@@ -3944,7 +3944,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (vmf->flags & FAULT_FLAG_WRITE) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
- ret |= VM_FAULT_WRITE;
}
rmap_flags |= RMAP_EXCLUSIVE;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index e50b3c42e6ad..36be77470cc8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -223,7 +223,14 @@ static bool remove_migration_pte(struct folio *folio,
if (is_writable_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
else
- /* NOTE: mk_pte can have write bit set */
+ /*
+ * NOTE: mk_pte() can have write bit set per memory
+ * type (e.g. shmem), or pte_mkdirty() per archs
+ * (e.g., sparc64). If this is a read migration
+ * entry, we need to make sure when we recover the
+ * pte from migration entry to present entry the
+ * write bit is cleared.
+ */
pte = pte_wrprotect(pte);
if (pte_swp_uffd_wp(*pvmw.pte)) {
@@ -335,24 +342,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
}
#ifdef CONFIG_HUGETLB_PAGE
-void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
+void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl)
{
pte_t pte;
+ /*
+ * The vma read lock must be taken, which will be released before
+ * the function returns. It makes sure the pgtable page (along
+ * with its spin lock) not be freed in parallel.
+ */
+ hugetlb_vma_assert_locked(vma);
+
spin_lock(ptl);
pte = huge_ptep_get(ptep);
- if (unlikely(!is_hugetlb_entry_migration(pte)))
+ if (unlikely(!is_hugetlb_entry_migration(pte))) {
spin_unlock(ptl);
- else
+ hugetlb_vma_unlock_read(vma);
+ } else {
+ /*
+ * If migration entry existed, safe to release vma lock
+ * here because the pgtable page won't be freed without the
+ * pgtable lock released. See comment right above pgtable
+ * lock release in migration_entry_wait_on_locked().
+ */
+ hugetlb_vma_unlock_read(vma);
migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
+ }
}
void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
{
spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
- __migration_entry_wait_huge(pte, ptl);
+ __migration_entry_wait_huge(vma, pte, ptl);
}
#endif
diff --git a/mm/mmap.c b/mm/mmap.c
index 872059752f66..3158f37bbfd8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1779,9 +1779,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- /* Ensures that larger anonymous mappings are THP aligned. */
- get_area = thp_get_unmapped_area;
}
addr = get_area(file, addr, len, pgoff, flags);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5ab9dd29ef7e..0745aedebb37 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1702,6 +1702,11 @@ static void __free_pages_ok(struct page *page, unsigned int order,
if (!free_pages_prepare(page, order, true, fpi_flags))
return;
+ /*
+ * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here
+ * is used to avoid calling get_pfnblock_migratetype() under the lock.
+ * This will reduce the lock holding time.
+ */
migratetype = get_pfnblock_migratetype(page, pfn);
spin_lock_irqsave(&zone->lock, flags);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 93e13fc17d3c..bb782dea4b42 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -169,10 +169,13 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (pvmw->pte)
return not_found(pvmw);
+ hugetlb_vma_lock_read(vma);
/* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
- if (!pvmw->pte)
+ pvmw->pte = hugetlb_walk(vma, pvmw->address, size);
+ if (!pvmw->pte) {
+ hugetlb_vma_unlock_read(vma);
return false;
+ }
pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte);
if (!check_pte(pvmw))
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2ff3a5bebceb..cb23f8a15c13 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -302,18 +302,18 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ hugetlb_vma_lock_read(vma);
do {
next = hugetlb_entry_end(h, addr, end);
- pte = huge_pte_offset(walk->mm, addr & hmask, sz);
-
+ pte = hugetlb_walk(vma, addr & hmask, sz);
if (pte)
err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
else if (ops->pte_hole)
err = ops->pte_hole(addr, next, -1, walk);
-
if (err)
break;
} while (addr = next, addr != end);
+ hugetlb_vma_unlock_read(vma);
return err;
}
@@ -517,6 +517,26 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
return walk_pgd_range(start, end, &walk);
}
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = vma->vm_mm,
+ .vma = vma,
+ .private = private,
+ };
+
+ if (start >= end || !walk.mm)
+ return -EINVAL;
+ if (start < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ mmap_assert_locked(walk.mm);
+ return __walk_page_range(start, end, &walk);
+}
+
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private)
{
@@ -526,18 +546,11 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
.vma = vma,
.private = private,
};
- int err;
if (!walk.mm)
return -EINVAL;
mmap_assert_locked(walk.mm);
-
- err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
- if (err > 0)
- return 0;
- if (err < 0)
- return err;
return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 46ae542118c0..c5398a5960d0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -196,6 +196,10 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
return pmd;
}
+void __weak __meminit pmd_init(void *addr)
+{
+}
+
pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
pud_t *pud = pud_offset(p4d, addr);
@@ -203,11 +207,16 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
+ pmd_init(p);
pud_populate(&init_mm, pud, p);
}
return pud;
}
+void __weak __meminit pud_init(void *addr)
+{
+}
+
p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
p4d_t *p4d = p4d_offset(pgd, addr);
@@ -215,6 +224,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
+ pud_init(p);
p4d_populate(&init_mm, p4d, p);
}
return p4d;
@@ -285,6 +295,69 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
return vmemmap_populate_range(start, end, node, altmap, NULL);
}
+void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+ unsigned long addr, unsigned long next)
+{
+}
+
+int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
+ unsigned long addr, unsigned long next)
+{
+ return 0;
+}
+
+int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = vmemmap_pgd_populate(addr, node);
+ if (!pgd)
+ return -ENOMEM;
+
+ p4d = vmemmap_p4d_populate(pgd, addr, node);
+ if (!p4d)
+ return -ENOMEM;
+
+ pud = vmemmap_pud_populate(p4d, addr, node);
+ if (!pud)
+ return -ENOMEM;
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(READ_ONCE(*pmd))) {
+ void *p;
+
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+ if (p) {
+ vmemmap_set_pmd(pmd, p, node, addr, next);
+ continue;
+ } else if (altmap) {
+ /*
+ * No fallback: In any case we care about, the
+ * altmap should be reasonably sized and aligned
+ * such that vmemmap_alloc_block_buf() will always
+ * succeed. For consistency with the PTE case,
+ * return an error here as failure could indicate
+ * a configuration issue with the size of the altmap.
+ */
+ return -ENOMEM;
+ }
+ } else if (vmemmap_check_pmd(pmd, node, addr, next))
+ continue;
+ if (vmemmap_populate_basepages(addr, next, node, altmap))
+ return -ENOMEM;
+ }
+ return 0;
+}
+
/*
* For compound pages bigger than section size (e.g. x86 1G compound
* pages with 2M subsection size) fill the rest of sections as tail
diff --git a/mm/truncate.c b/mm/truncate.c
index c7bfd247a651..7b4ea4c4a46b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -565,7 +565,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
* refcount. We do this because invalidate_inode_pages2() needs stronger
* invalidation guarantees, and cannot afford to leave pages behind because
* shrink_page_list() has a temp ref on them, or because they're transiently
- * sitting in the lru_cache_add() pagevecs.
+ * sitting in the folio_add_lru() pagevecs.
*/
static int invalidate_complete_folio2(struct address_space *mapping,
struct folio *folio)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3a8ff47943d5..0499907b6f1a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -66,6 +66,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
bool vm_shared = dst_vma->vm_flags & VM_SHARED;
bool page_in_cache = page_mapping(page);
spinlock_t *ptl;
+ struct folio *folio;
struct inode *inode;
pgoff_t offset, max_off;
@@ -113,14 +114,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
if (!pte_none_mostly(*dst_pte))
goto out_unlock;
+ folio = page_folio(page);
if (page_in_cache) {
/* Usually, cache pages are already added to LRU */
if (newly_allocated)
- lru_cache_add(page);
+ folio_add_lru(folio);
page_add_file_rmap(page, dst_vma, false);
} else {
page_add_new_anon_rmap(page, dst_vma, dst_addr);
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
+ folio_add_lru_vma(folio, dst_vma);
}
/*
diff --git a/mm/workingset.c b/mm/workingset.c
index d2d02978588c..1a86645b7b3c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -492,7 +492,10 @@ void workingset_refault(struct folio *folio, void *shadow)
/* Folio was active prior to eviction */
if (workingset) {
folio_set_workingset(folio);
- /* XXX: Move to lru_cache_add() when it supports new vs putback */
+ /*
+ * XXX: Move to folio_add_lru() when it supports new vs
+ * putback
+ */
lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
diff --git a/mm/z3fold.c b/mm/z3fold.c
index cf71da10d04e..a4de0c317ac7 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -68,9 +68,6 @@
* Structures
*****************/
struct z3fold_pool;
-struct z3fold_ops {
- int (*evict)(struct z3fold_pool *pool, unsigned long handle);
-};
enum buddy {
HEADLESS = 0,
@@ -138,8 +135,6 @@ struct z3fold_header {
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
* @c_handle: cache for z3fold_buddy_slots allocation
- * @ops: pointer to a structure of user defined operations specified at
- * pool creation time.
* @zpool: zpool driver
* @zpool_ops: zpool operations structure with an evict callback
* @compact_wq: workqueue for page layout background optimization
@@ -158,7 +153,6 @@ struct z3fold_pool {
struct list_head stale;
atomic64_t pages_nr;
struct kmem_cache *c_handle;
- const struct z3fold_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
@@ -907,13 +901,11 @@ out_fail:
* z3fold_create_pool() - create a new z3fold pool
* @name: pool name
* @gfp: gfp flags when allocating the z3fold pool structure
- * @ops: user-defined operations for the z3fold pool
*
* Return: pointer to the new z3fold pool or NULL if the metadata allocation
* failed.
*/
-static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
- const struct z3fold_ops *ops)
+static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp)
{
struct z3fold_pool *pool = NULL;
int i, cpu;
@@ -949,7 +941,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
if (!pool->release_wq)
goto out_wq;
INIT_WORK(&pool->work, free_pages_work);
- pool->ops = ops;
return pool;
out_wq:
@@ -1230,10 +1221,6 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
spin_lock(&pool->lock);
- if (!pool->ops || !pool->ops->evict || retries == 0) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
for (i = 0; i < retries; i++) {
if (list_empty(&pool->lru)) {
spin_unlock(&pool->lock);
@@ -1319,17 +1306,17 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
}
/* Issue the eviction callback(s) */
if (middle_handle) {
- ret = pool->ops->evict(pool, middle_handle);
+ ret = pool->zpool_ops->evict(pool->zpool, middle_handle);
if (ret)
goto next;
}
if (first_handle) {
- ret = pool->ops->evict(pool, first_handle);
+ ret = pool->zpool_ops->evict(pool->zpool, first_handle);
if (ret)
goto next;
}
if (last_handle) {
- ret = pool->ops->evict(pool, last_handle);
+ ret = pool->zpool_ops->evict(pool->zpool, last_handle);
if (ret)
goto next;
}
@@ -1593,26 +1580,13 @@ static const struct movable_operations z3fold_mops = {
* zpool
****************/
-static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
-{
- if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
- return pool->zpool_ops->evict(pool->zpool, handle);
- else
- return -ENOENT;
-}
-
-static const struct z3fold_ops z3fold_zpool_ops = {
- .evict = z3fold_zpool_evict
-};
-
static void *z3fold_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
struct z3fold_pool *pool;
- pool = z3fold_create_pool(name, gfp,
- zpool_ops ? &z3fold_zpool_ops : NULL);
+ pool = z3fold_create_pool(name, gfp);
if (pool) {
pool->zpool = zpool;
pool->zpool_ops = zpool_ops;
diff --git a/mm/zbud.c b/mm/zbud.c
index 6348932430b8..3acd26193920 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -74,10 +74,6 @@
struct zbud_pool;
-struct zbud_ops {
- int (*evict)(struct zbud_pool *pool, unsigned long handle);
-};
-
/**
* struct zbud_pool - stores metadata for each zbud pool
* @lock: protects all pool fields and first|last_chunk fields of any
@@ -90,8 +86,6 @@ struct zbud_ops {
* @lru: list tracking the zbud pages in LRU order by most recently
* added buddy.
* @pages_nr: number of zbud pages in the pool.
- * @ops: pointer to a structure of user defined operations specified at
- * pool creation time.
* @zpool: zpool driver
* @zpool_ops: zpool operations structure with an evict callback
*
@@ -110,7 +104,6 @@ struct zbud_pool {
};
struct list_head lru;
u64 pages_nr;
- const struct zbud_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
};
@@ -212,12 +205,11 @@ static int num_free_chunks(struct zbud_header *zhdr)
/**
* zbud_create_pool() - create a new zbud pool
* @gfp: gfp flags when allocating the zbud pool structure
- * @ops: user-defined operations for the zbud pool
*
* Return: pointer to the new zbud pool or NULL if the metadata allocation
* failed.
*/
-static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
+static struct zbud_pool *zbud_create_pool(gfp_t gfp)
{
struct zbud_pool *pool;
int i;
@@ -231,7 +223,6 @@ static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
INIT_LIST_HEAD(&pool->buddied);
INIT_LIST_HEAD(&pool->lru);
pool->pages_nr = 0;
- pool->ops = ops;
return pool;
}
@@ -419,8 +410,7 @@ static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
unsigned long first_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
- if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
- retries == 0) {
+ if (list_empty(&pool->lru)) {
spin_unlock(&pool->lock);
return -EINVAL;
}
@@ -444,12 +434,12 @@ static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
/* Issue the eviction callback(s) */
if (first_handle) {
- ret = pool->ops->evict(pool, first_handle);
+ ret = pool->zpool_ops->evict(pool->zpool, first_handle);
if (ret)
goto next;
}
if (last_handle) {
- ret = pool->ops->evict(pool, last_handle);
+ ret = pool->zpool_ops->evict(pool->zpool, last_handle);
if (ret)
goto next;
}
@@ -524,25 +514,13 @@ static u64 zbud_get_pool_size(struct zbud_pool *pool)
* zpool
****************/
-static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
-{
- if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
- return pool->zpool_ops->evict(pool->zpool, handle);
- else
- return -ENOENT;
-}
-
-static const struct zbud_ops zbud_zpool_ops = {
- .evict = zbud_zpool_evict
-};
-
static void *zbud_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
struct zbud_pool *pool;
- pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+ pool = zbud_create_pool(gfp);
if (pool) {
pool->zpool = zpool;
pool->zpool_ops = zpool_ops;
diff --git a/mm/zpool.c b/mm/zpool.c
index f46c0d5e766c..571f5c5031dd 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -21,9 +21,6 @@
struct zpool {
struct zpool_driver *driver;
void *pool;
- const struct zpool_ops *ops;
- bool evictable;
- bool can_sleep_mapped;
};
static LIST_HEAD(drivers_head);
@@ -177,9 +174,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
zpool->driver = driver;
zpool->pool = driver->create(name, gfp, ops, zpool);
- zpool->ops = ops;
- zpool->evictable = driver->shrink && ops && ops->evict;
- zpool->can_sleep_mapped = driver->sleep_mapped;
if (!zpool->pool) {
pr_err("couldn't create %s pool\n", type);
@@ -380,7 +374,7 @@ u64 zpool_get_total_size(struct zpool *zpool)
*/
bool zpool_evictable(struct zpool *zpool)
{
- return zpool->evictable;
+ return zpool->driver->shrink;
}
/**
@@ -398,7 +392,7 @@ bool zpool_evictable(struct zpool *zpool)
*/
bool zpool_can_sleep_mapped(struct zpool *zpool)
{
- return zpool->can_sleep_mapped;
+ return zpool->driver->sleep_mapped;
}
MODULE_LICENSE("GPL");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 78feda34ad9a..9445bee6b014 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -33,8 +33,7 @@
/*
* lock ordering:
* page_lock
- * pool->migrate_lock
- * class->lock
+ * pool->lock
* zspage->lock
*/
@@ -192,7 +191,6 @@ static const int fullness_threshold_frac = 4;
static size_t huge_class_size;
struct size_class {
- spinlock_t lock;
struct list_head fullness_list[NR_ZS_FULLNESS];
/*
* Size of objects stored in this class. Must be multiple
@@ -241,14 +239,20 @@ struct zs_pool {
/* Compact classes */
struct shrinker shrinker;
+#ifdef CONFIG_ZPOOL
+ /* List tracking the zspages in LRU order by most recently added object */
+ struct list_head lru;
+ struct zpool *zpool;
+ const struct zpool_ops *zpool_ops;
+#endif
+
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
#ifdef CONFIG_COMPACTION
struct work_struct free_work;
#endif
- /* protect page/zspage migration */
- rwlock_t migrate_lock;
+ spinlock_t lock;
};
struct zspage {
@@ -263,10 +267,17 @@ struct zspage {
unsigned int freeobj;
struct page *first_page;
struct list_head list; /* fullness list */
+
+#ifdef CONFIG_ZPOOL
+ /* links the zspage to the lru list in the pool */
+ struct list_head lru;
+ bool under_reclaim;
+ /* list of unfreed handles whose objects have been reclaimed */
+ unsigned long *deferred_handles;
+#endif
+
struct zs_pool *pool;
-#ifdef CONFIG_COMPACTION
rwlock_t lock;
-#endif
};
struct mapping_area {
@@ -287,10 +298,11 @@ static bool ZsHugePage(struct zspage *zspage)
return zspage->huge;
}
-#ifdef CONFIG_COMPACTION
static void migrate_lock_init(struct zspage *zspage);
static void migrate_read_lock(struct zspage *zspage);
static void migrate_read_unlock(struct zspage *zspage);
+
+#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage);
static void migrate_write_lock_nested(struct zspage *zspage);
static void migrate_write_unlock(struct zspage *zspage);
@@ -298,9 +310,6 @@ static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
#else
-static void migrate_lock_init(struct zspage *zspage) {}
-static void migrate_read_lock(struct zspage *zspage) {}
-static void migrate_read_unlock(struct zspage *zspage) {}
static void migrate_write_lock(struct zspage *zspage) {}
static void migrate_write_lock_nested(struct zspage *zspage) {}
static void migrate_write_unlock(struct zspage *zspage) {}
@@ -355,7 +364,7 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
kmem_cache_free(pool->zspage_cachep, zspage);
}
-/* class->lock(which owns the handle) synchronizes races */
+/* pool->lock(which owns the handle) synchronizes races */
static void record_obj(unsigned long handle, unsigned long obj)
{
*(unsigned long *)handle = obj;
@@ -374,7 +383,14 @@ static void *zs_zpool_create(const char *name, gfp_t gfp,
* different contexts and its caller must provide a valid
* gfp mask.
*/
- return zs_create_pool(name);
+ struct zs_pool *pool = zs_create_pool(name);
+
+ if (pool) {
+ pool->zpool = zpool;
+ pool->zpool_ops = zpool_ops;
+ }
+
+ return pool;
}
static void zs_zpool_destroy(void *pool)
@@ -396,6 +412,27 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
+static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries);
+
+static int zs_zpool_shrink(void *pool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ unsigned int total = 0;
+ int ret = -EINVAL;
+
+ while (total < pages) {
+ ret = zs_reclaim_page(pool, 8);
+ if (ret < 0)
+ break;
+ total++;
+ }
+
+ if (reclaimed)
+ *reclaimed = total;
+
+ return ret;
+}
+
static void *zs_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -434,6 +471,7 @@ static struct zpool_driver zs_zpool_driver = {
.malloc_support_movable = true,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
+ .shrink = zs_zpool_shrink,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
.total_size = zs_zpool_total_size,
@@ -452,7 +490,7 @@ static __maybe_unused int is_first_page(struct page *page)
return PagePrivate(page);
}
-/* Protected by class->lock */
+/* Protected by pool->lock */
static inline int get_zspage_inuse(struct zspage *zspage)
{
return zspage->inuse;
@@ -597,13 +635,13 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
if (class->index != i)
continue;
- spin_lock(&class->lock);
+ spin_lock(&pool->lock);
class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
obj_used = zs_stat_get(class, OBJ_USED);
freeable = zs_can_compact(class);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
objs_per_zspage = class->objs_per_zspage;
pages_used = obj_allocated / objs_per_zspage *
@@ -907,6 +945,25 @@ unlock:
return 0;
}
+#ifdef CONFIG_ZPOOL
+/*
+ * Free all the deferred handles whose objects are freed in zs_free.
+ */
+static void free_handles(struct zs_pool *pool, struct zspage *zspage)
+{
+ unsigned long handle = (unsigned long)zspage->deferred_handles;
+
+ while (handle) {
+ unsigned long nxt_handle = handle_to_obj(handle);
+
+ cache_free_handle(pool, handle);
+ handle = nxt_handle;
+ }
+}
+#else
+static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {}
+#endif
+
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
@@ -916,11 +973,14 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
get_zspage_mapping(zspage, &class_idx, &fg);
- assert_spin_locked(&class->lock);
+ assert_spin_locked(&pool->lock);
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(fg != ZS_EMPTY);
+ /* Free all deferred handles from zs_free */
+ free_handles(pool, zspage);
+
next = page = get_first_page(zspage);
do {
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -956,6 +1016,9 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
}
remove_zspage(class, zspage, ZS_EMPTY);
+#ifdef CONFIG_ZPOOL
+ list_del(&zspage->lru);
+#endif
__free_zspage(pool, class, zspage);
}
@@ -1001,6 +1064,12 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
off %= PAGE_SIZE;
}
+#ifdef CONFIG_ZPOOL
+ INIT_LIST_HEAD(&zspage->lru);
+ zspage->under_reclaim = false;
+ zspage->deferred_handles = NULL;
+#endif
+
set_freeobj(zspage, 0);
}
@@ -1268,19 +1337,44 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
BUG_ON(in_interrupt());
/* It guarantees it can get zspage from handle safely */
- read_lock(&pool->migrate_lock);
+ spin_lock(&pool->lock);
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
+#ifdef CONFIG_ZPOOL
+ /*
+ * Move the zspage to front of pool's LRU.
+ *
+ * Note that this is swap-specific, so by definition there are no ongoing
+ * accesses to the memory while the page is swapped out that would make
+ * it "hot". A new entry is hot, then ages to the tail until it gets either
+ * written back or swaps back in.
+ *
+ * Furthermore, map is also called during writeback. We must not put an
+ * isolated page on the LRU mid-reclaim.
+ *
+ * As a result, only update the LRU when the page is mapped for write
+ * when it's first instantiated.
+ *
+ * This is a deviation from the other backends, which perform this update
+ * in the allocation function (zbud_alloc, z3fold_alloc).
+ */
+ if (mm == ZS_MM_WO) {
+ if (!list_empty(&zspage->lru))
+ list_del(&zspage->lru);
+ list_add(&zspage->lru, &pool->lru);
+ }
+#endif
+
/*
- * migration cannot move any zpages in this zspage. Here, class->lock
+ * migration cannot move any zpages in this zspage. Here, pool->lock
* is too heavy since callers would take some time until they calls
* zs_unmap_object API so delegate the locking from class to zspage
* which is smaller granularity.
*/
migrate_read_lock(zspage);
- read_unlock(&pool->migrate_lock);
+ spin_unlock(&pool->lock);
class = zspage_class(pool, zspage);
off = (class->size * obj_idx) & ~PAGE_MASK;
@@ -1433,8 +1527,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
size += ZS_HANDLE_SIZE;
class = pool->size_class[get_size_class_index(size)];
- /* class->lock effectively protects the zpage migration */
- spin_lock(&class->lock);
+ /* pool->lock effectively protects the zpage migration */
+ spin_lock(&pool->lock);
zspage = find_get_zspage(class);
if (likely(zspage)) {
obj = obj_malloc(pool, zspage, handle);
@@ -1442,12 +1536,12 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
fix_fullness_group(class, zspage);
record_obj(handle, obj);
class_stat_inc(class, OBJ_USED, 1);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
return handle;
}
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
zspage = alloc_zspage(pool, class, gfp);
if (!zspage) {
@@ -1455,7 +1549,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
return (unsigned long)ERR_PTR(-ENOMEM);
}
- spin_lock(&class->lock);
+ spin_lock(&pool->lock);
obj = obj_malloc(pool, zspage, handle);
newfg = get_fullness_group(class, zspage);
insert_zspage(class, zspage, newfg);
@@ -1468,7 +1562,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
return handle;
}
@@ -1512,26 +1606,38 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
return;
/*
- * The pool->migrate_lock protects the race with zpage's migration
+ * The pool->lock protects the race with zpage's migration
* so it's safe to get the page from handle.
*/
- read_lock(&pool->migrate_lock);
+ spin_lock(&pool->lock);
obj = handle_to_obj(handle);
obj_to_page(obj, &f_page);
zspage = get_zspage(f_page);
class = zspage_class(pool, zspage);
- spin_lock(&class->lock);
- read_unlock(&pool->migrate_lock);
obj_free(class->size, obj);
class_stat_dec(class, OBJ_USED, 1);
+
+#ifdef CONFIG_ZPOOL
+ if (zspage->under_reclaim) {
+ /*
+ * Reclaim needs the handles during writeback. It'll free
+ * them along with the zspage when it's done with them.
+ *
+ * Record current deferred handle at the memory location
+ * whose address is given by handle.
+ */
+ record_obj(handle, (unsigned long)zspage->deferred_handles);
+ zspage->deferred_handles = (unsigned long *)handle;
+ spin_unlock(&pool->lock);
+ return;
+ }
+#endif
fullness = fix_fullness_group(class, zspage);
- if (fullness != ZS_EMPTY)
- goto out;
+ if (fullness == ZS_EMPTY)
+ free_zspage(pool, class, zspage);
- free_zspage(pool, class, zspage);
-out:
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
cache_free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -1730,7 +1836,7 @@ static enum fullness_group putback_zspage(struct size_class *class,
return fullness;
}
-#ifdef CONFIG_COMPACTION
+#if defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION)
/*
* To prevent zspage destroy during migration, zspage freeing should
* hold locks of all pages in the zspage.
@@ -1772,6 +1878,24 @@ static void lock_zspage(struct zspage *zspage)
}
migrate_read_unlock(zspage);
}
+#endif /* defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) */
+
+#ifdef CONFIG_ZPOOL
+/*
+ * Unlocks all the pages of the zspage.
+ *
+ * pool->lock must be held before this function is called
+ * to prevent the underlying pages from migrating.
+ */
+static void unlock_zspage(struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
+
+ do {
+ unlock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
+}
+#endif /* CONFIG_ZPOOL */
static void migrate_lock_init(struct zspage *zspage)
{
@@ -1788,6 +1912,7 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
read_unlock(&zspage->lock);
}
+#ifdef CONFIG_COMPACTION
static void migrate_write_lock(struct zspage *zspage)
{
write_lock(&zspage->lock);
@@ -1888,16 +2013,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
pool = zspage->pool;
/*
- * The pool migrate_lock protects the race between zpage migration
+ * The pool's lock protects the race between zpage migration
* and zs_free.
*/
- write_lock(&pool->migrate_lock);
+ spin_lock(&pool->lock);
class = zspage_class(pool, zspage);
- /*
- * the class lock protects zpage alloc/free in the zspage.
- */
- spin_lock(&class->lock);
/* the migrate_write_lock protects zpage access via zs_map_object */
migrate_write_lock(zspage);
@@ -1927,10 +2048,9 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
replace_sub_page(class, zspage, newpage, page);
/*
* Since we complete the data copy and set up new zspage structure,
- * it's okay to release migration_lock.
+ * it's okay to release the pool's lock.
*/
- write_unlock(&pool->migrate_lock);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
dec_zspage_isolation(zspage);
migrate_write_unlock(zspage);
@@ -1985,9 +2105,9 @@ static void async_free_zspage(struct work_struct *work)
if (class->index != i)
continue;
- spin_lock(&class->lock);
+ spin_lock(&pool->lock);
list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
}
list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
@@ -1997,9 +2117,12 @@ static void async_free_zspage(struct work_struct *work)
get_zspage_mapping(zspage, &class_idx, &fullness);
VM_BUG_ON(fullness != ZS_EMPTY);
class = pool->size_class[class_idx];
- spin_lock(&class->lock);
+ spin_lock(&pool->lock);
+#ifdef CONFIG_ZPOOL
+ list_del(&zspage->lru);
+#endif
__free_zspage(pool, class, zspage);
- spin_unlock(&class->lock);
+ spin_unlock(&pool->lock);
}
};
@@ -2060,10 +2183,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
struct zspage *dst_zspage = NULL;
unsigned long pages_freed = 0;
- /* protect the race between zpage migration and zs_free */
- write_lock(&pool->migrate_lock);
- /* protect zpage allocation/free */
- spin_lock(&class->lock);
+ /*
+ * protect the race between zpage migration and zs_free
+ * as well as zpage allocation/free
+ */
+ spin_lock(&pool->lock);
while ((src_zspage = isolate_zspage(class, true))) {
/* protect someone accessing the zspage(i.e., zs_map_object) */
migrate_write_lock(src_zspage);
@@ -2088,7 +2212,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
putback_zspage(class, dst_zspage);
migrate_write_unlock(dst_zspage);
dst_zspage = NULL;
- if (rwlock_is_contended(&pool->migrate_lock))
+ if (spin_is_contended(&pool->lock))
break;
}
@@ -2105,11 +2229,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
pages_freed += class->pages_per_zspage;
} else
migrate_write_unlock(src_zspage);
- spin_unlock(&class->lock);
- write_unlock(&pool->migrate_lock);
+ spin_unlock(&pool->lock);
cond_resched();
- write_lock(&pool->migrate_lock);
- spin_lock(&class->lock);
+ spin_lock(&pool->lock);
}
if (src_zspage) {
@@ -2117,8 +2239,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
migrate_write_unlock(src_zspage);
}
- spin_unlock(&class->lock);
- write_unlock(&pool->migrate_lock);
+ spin_unlock(&pool->lock);
return pages_freed;
}
@@ -2221,7 +2342,7 @@ struct zs_pool *zs_create_pool(const char *name)
return NULL;
init_deferred_free(pool);
- rwlock_init(&pool->migrate_lock);
+ spin_lock_init(&pool->lock);
pool->name = kstrdup(name, GFP_KERNEL);
if (!pool->name)
@@ -2292,7 +2413,6 @@ struct zs_pool *zs_create_pool(const char *name)
class->index = i;
class->pages_per_zspage = pages_per_zspage;
class->objs_per_zspage = objs_per_zspage;
- spin_lock_init(&class->lock);
pool->size_class[i] = class;
for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
fullness++)
@@ -2312,6 +2432,10 @@ struct zs_pool *zs_create_pool(const char *name)
*/
zs_register_shrinker(pool);
+#ifdef CONFIG_ZPOOL
+ INIT_LIST_HEAD(&pool->lru);
+#endif
+
return pool;
err:
@@ -2353,6 +2477,100 @@ void zs_destroy_pool(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
+#ifdef CONFIG_ZPOOL
+static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
+{
+ int i, obj_idx, ret = 0;
+ unsigned long handle;
+ struct zspage *zspage;
+ struct page *page;
+ enum fullness_group fullness;
+
+ /* Lock LRU and fullness list */
+ spin_lock(&pool->lock);
+ if (list_empty(&pool->lru)) {
+ spin_unlock(&pool->lock);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < retries; i++) {
+ struct size_class *class;
+
+ zspage = list_last_entry(&pool->lru, struct zspage, lru);
+ list_del(&zspage->lru);
+
+ /* zs_free may free objects, but not the zspage and handles */
+ zspage->under_reclaim = true;
+
+ class = zspage_class(pool, zspage);
+ fullness = get_fullness_group(class, zspage);
+
+ /* Lock out object allocations and object compaction */
+ remove_zspage(class, zspage, fullness);
+
+ spin_unlock(&pool->lock);
+ cond_resched();
+
+ /* Lock backing pages into place */
+ lock_zspage(zspage);
+
+ obj_idx = 0;
+ page = get_first_page(zspage);
+ while (1) {
+ handle = find_alloced_obj(class, page, &obj_idx);
+ if (!handle) {
+ page = get_next_page(page);
+ if (!page)
+ break;
+ obj_idx = 0;
+ continue;
+ }
+
+ /*
+ * This will write the object and call zs_free.
+ *
+ * zs_free will free the object, but the
+ * under_reclaim flag prevents it from freeing
+ * the zspage altogether. This is necessary so
+ * that we can continue working with the
+ * zspage potentially after the last object
+ * has been freed.
+ */
+ ret = pool->zpool_ops->evict(pool->zpool, handle);
+ if (ret)
+ goto next;
+
+ obj_idx++;
+ }
+
+next:
+ /* For freeing the zspage, or putting it back in the pool and LRU list. */
+ spin_lock(&pool->lock);
+ zspage->under_reclaim = false;
+
+ if (!get_zspage_inuse(zspage)) {
+ /*
+ * Fullness went stale as zs_free() won't touch it
+ * while the page is removed from the pool. Fix it
+ * up for the check in __free_zspage().
+ */
+ zspage->fullness = ZS_EMPTY;
+
+ __free_zspage(pool, class, zspage);
+ spin_unlock(&pool->lock);
+ return 0;
+ }
+
+ putback_zspage(class, zspage);
+ list_add(&zspage->lru, &pool->lru);
+ unlock_zspage(zspage);
+ }
+
+ spin_unlock(&pool->lock);
+ return -EAGAIN;
+}
+#endif /* CONFIG_ZPOOL */
+
static int __init zs_init(void)
{
int ret;
diff --git a/mm/zswap.c b/mm/zswap.c
index 3019f0bde194..f6c89049cf70 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -968,6 +968,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
swpentry = zhdr->swpentry; /* here */
tree = zswap_trees[swp_type(swpentry)];
offset = swp_offset(swpentry);
+ zpool_unmap_handle(pool, handle);
/* find and ref zswap entry */
spin_lock(&tree->lock);
@@ -975,20 +976,12 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
if (!entry) {
/* entry was invalidated */
spin_unlock(&tree->lock);
- zpool_unmap_handle(pool, handle);
kfree(tmp);
return 0;
}
spin_unlock(&tree->lock);
BUG_ON(offset != entry->offset);
- src = (u8 *)zhdr + sizeof(struct zswap_header);
- if (!zpool_can_sleep_mapped(pool)) {
- memcpy(tmp, src, entry->length);
- src = tmp;
- zpool_unmap_handle(pool, handle);
- }
-
/* try to allocate swap cache page */
switch (zswap_get_swap_cache_page(swpentry, &page)) {
case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
@@ -1006,6 +999,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
dlen = PAGE_SIZE;
+ zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
+ src = (u8 *)zhdr + sizeof(struct zswap_header);
+ if (!zpool_can_sleep_mapped(pool)) {
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+ zpool_unmap_handle(pool, handle);
+ }
+
mutex_lock(acomp_ctx->mutex);
sg_init_one(&input, src, entry->length);
sg_init_table(&output, 1);
@@ -1015,6 +1016,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
dlen = acomp_ctx->req->dlen;
mutex_unlock(acomp_ctx->mutex);
+ if (!zpool_can_sleep_mapped(pool))
+ kfree(tmp);
+ else
+ zpool_unmap_handle(pool, handle);
+
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -1045,7 +1051,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
- goto end;
+ return ret;
+
+fail:
+ if (!zpool_can_sleep_mapped(pool))
+ kfree(tmp);
/*
* if we get here due to ZSWAP_SWAPCACHE_EXIST
@@ -1054,17 +1064,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
* if we free the entry in the following put
* it is also okay to return !0
*/
-fail:
spin_lock(&tree->lock);
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
-end:
- if (zpool_can_sleep_mapped(pool))
- zpool_unmap_handle(pool, handle);
- else
- kfree(tmp);
-
return ret;
}
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 2e91973fbaa6..81fa7ec2e66a 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -1,7 +1,8 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * maple_tree.c: Userspace shim for maple tree test-suite
- * Copyright (c) 2018 Liam R. Howlett <Liam.Howlett@Oracle.com>
+ * maple_tree.c: Userspace testing for maple tree test-suite
+ * Copyright (c) 2018-2022 Oracle Corporation
+ * Author: Liam R. Howlett <Liam.Howlett@Oracle.com>
*
* Any tests that require internal knowledge of the tree or threads and other
* difficult to handle in kernel tests.
diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index 838a8e49f77b..b71247ba7196 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -8,7 +8,7 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh
TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh
TEST_PROGS += debugfs_duplicate_context_creation.sh
TEST_PROGS += debugfs_rm_non_contexts.sh
-TEST_PROGS += sysfs.sh
+TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh
TEST_PROGS += reclaim.sh lru_sort.sh
include ../lib.mk
diff --git a/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh b/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh
new file mode 100644
index 000000000000..ade35576e748
--- /dev/null
+++ b/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+if [ $EUID -ne 0 ]
+then
+ echo "Run as root"
+ exit $ksft_skip
+fi
+
+damon_sysfs="/sys/kernel/mm/damon/admin"
+if [ ! -d "$damon_sysfs" ]
+then
+ echo "damon sysfs not found"
+ exit $ksft_skip
+fi
+
+# clear log
+dmesg -C
+
+# start DAMON with a scheme
+echo 1 > "$damon_sysfs/kdamonds/nr_kdamonds"
+echo 1 > "$damon_sysfs/kdamonds/0/contexts/nr_contexts"
+echo "vaddr" > "$damon_sysfs/kdamonds/0/contexts/0/operations"
+echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/targets/nr_targets"
+echo $$ > "$damon_sysfs/kdamonds/0/contexts/0/targets/0/pid_target"
+echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/nr_schemes"
+scheme_dir="$damon_sysfs/kdamonds/0/contexts/0/schemes/0"
+echo 4096000 > "$scheme_dir/access_pattern/sz/max"
+echo 20 > "$scheme_dir/access_pattern/nr_accesses/max"
+echo 1024 > "$scheme_dir/access_pattern/age/max"
+echo "on" > "$damon_sysfs/kdamonds/0/state"
+sleep 0.3
+
+# remove scheme sysfs dir
+echo 0 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/nr_schemes"
+
+# try to update stat of already removed scheme sysfs dir
+echo "update_schemes_stats" > "$damon_sysfs/kdamonds/0/state"
+if dmesg | grep -q BUG
+then
+ echo "update_schemes_stats triggers a kernel bug"
+ dmesg
+ exit 1
+fi
+
+# try to update tried regions of already removed scheme sysfs dir
+echo "update_schemes_tried_regions" > "$damon_sysfs/kdamonds/0/state"
+if dmesg | grep -q BUG
+then
+ echo "update_schemes_tried_regions triggers a kernel bug"
+ dmesg
+ exit 1
+fi
+
+echo "off" > "$damon_sysfs/kdamonds/0/state"
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index a4d764efd6e3..89c14e41bd43 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -55,6 +55,7 @@ TEST_GEN_FILES += userfaultfd
TEST_GEN_PROGS += soft-dirty
TEST_GEN_PROGS += split_huge_page_test
TEST_GEN_FILES += ksm_tests
+TEST_GEN_PROGS += ksm_functional_tests
ifeq ($(MACHINE),x86_64)
CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
@@ -100,6 +101,7 @@ include ../lib.mk
$(OUTPUT)/cow: vm_util.c
$(OUTPUT)/khugepaged: vm_util.c
+$(OUTPUT)/ksm_functional_tests: vm_util.c
$(OUTPUT)/madv_populate: vm_util.c
$(OUTPUT)/soft-dirty: vm_util.c
$(OUTPUT)/split_huge_page_test: vm_util.c
diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c
new file mode 100644
index 000000000000..96644be68962
--- /dev/null
+++ b/tools/testing/selftests/vm/ksm_functional_tests.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KSM functional tests
+ *
+ * Copyright 2022, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define KiB 1024u
+#define MiB (1024 * KiB)
+
+static int ksm_fd;
+static int ksm_full_scans_fd;
+static int pagemap_fd;
+static size_t pagesize;
+
+static bool range_maps_duplicates(char *addr, unsigned long size)
+{
+ unsigned long offs_a, offs_b, pfn_a, pfn_b;
+
+ /*
+ * There is no easy way to check if there are KSM pages mapped into
+ * this range. We only check that the range does not map the same PFN
+ * twice by comaring each pair of mapped pages.
+ */
+ for (offs_a = 0; offs_a < size; offs_a += pagesize) {
+ pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
+ /* Page not present or PFN not exposed by the kernel. */
+ if (pfn_a == -1ull || !pfn_a)
+ continue;
+
+ for (offs_b = offs_a + pagesize; offs_b < size;
+ offs_b += pagesize) {
+ pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b);
+ if (pfn_b == -1ull || !pfn_b)
+ continue;
+ if (pfn_a == pfn_b)
+ return true;
+ }
+ }
+ return false;
+}
+
+static long ksm_get_full_scans(void)
+{
+ char buf[10];
+ ssize_t ret;
+
+ ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
+ if (ret <= 0)
+ return -errno;
+ buf[ret] = 0;
+
+ return strtol(buf, NULL, 10);
+}
+
+static int ksm_merge(void)
+{
+ long start_scans, end_scans;
+
+ /* Wait for two full scans such that any possible merging happened. */
+ start_scans = ksm_get_full_scans();
+ if (start_scans < 0)
+ return start_scans;
+ if (write(ksm_fd, "1", 1) != 1)
+ return -errno;
+ do {
+ end_scans = ksm_get_full_scans();
+ if (end_scans < 0)
+ return end_scans;
+ } while (end_scans < start_scans + 2);
+
+ return 0;
+}
+
+static char *mmap_and_merge_range(char val, unsigned long size)
+{
+ char *map;
+
+ map = mmap(NULL, size, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANON, -1, 0);
+ if (map == MAP_FAILED) {
+ ksft_test_result_fail("mmap() failed\n");
+ return MAP_FAILED;
+ }
+
+ /* Don't use THP. Ignore if THP are not around on a kernel. */
+ if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) {
+ ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
+ goto unmap;
+ }
+
+ /* Make sure each page contains the same values to merge them. */
+ memset(map, val, size);
+ if (madvise(map, size, MADV_MERGEABLE)) {
+ ksft_test_result_fail("MADV_MERGEABLE failed\n");
+ goto unmap;
+ }
+
+ /* Run KSM to trigger merging and wait. */
+ if (ksm_merge()) {
+ ksft_test_result_fail("Running KSM failed\n");
+ goto unmap;
+ }
+ return map;
+unmap:
+ munmap(map, size);
+ return MAP_FAILED;
+}
+
+static void test_unmerge(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size);
+ if (map == MAP_FAILED)
+ return;
+
+ if (madvise(map, size, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto unmap;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
+static void test_unmerge_discarded(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size);
+ if (map == MAP_FAILED)
+ return;
+
+ /* Discard half of all mapped pages so we have pte_none() entries. */
+ if (madvise(map, size / 2, MADV_DONTNEED)) {
+ ksft_test_result_fail("MADV_DONTNEED failed\n");
+ goto unmap;
+ }
+
+ if (madvise(map, size, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto unmap;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
+#ifdef __NR_userfaultfd
+static void test_unmerge_uffd_wp(void)
+{
+ struct uffdio_writeprotect uffd_writeprotect;
+ struct uffdio_register uffdio_register;
+ const unsigned int size = 2 * MiB;
+ struct uffdio_api uffdio_api;
+ char *map;
+ int uffd;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size);
+ if (map == MAP_FAILED)
+ return;
+
+ /* See if UFFD is around. */
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd < 0) {
+ ksft_test_result_skip("__NR_userfaultfd failed\n");
+ goto unmap;
+ }
+
+ /* See if UFFD-WP is around. */
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
+ ksft_test_result_fail("UFFDIO_API failed\n");
+ goto close_uffd;
+ }
+ if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
+ ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n");
+ goto close_uffd;
+ }
+
+ /* Register UFFD-WP, no need for an actual handler. */
+ uffdio_register.range.start = (unsigned long) map;
+ uffdio_register.range.len = size;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
+ ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n");
+ goto close_uffd;
+ }
+
+ /* Write-protect the range using UFFD-WP. */
+ uffd_writeprotect.range.start = (unsigned long) map;
+ uffd_writeprotect.range.len = size;
+ uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
+ if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
+ ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n");
+ goto close_uffd;
+ }
+
+ if (madvise(map, size, MADV_UNMERGEABLE)) {
+ ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+ goto close_uffd;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+close_uffd:
+ close(uffd);
+unmap:
+ munmap(map, size);
+}
+#endif
+
+int main(int argc, char **argv)
+{
+ unsigned int tests = 2;
+ int err;
+
+#ifdef __NR_userfaultfd
+ tests++;
+#endif
+
+ ksft_print_header();
+ ksft_set_plan(tests);
+
+ pagesize = getpagesize();
+
+ ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
+ if (ksm_fd < 0)
+ ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n");
+ ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
+ if (ksm_full_scans_fd < 0)
+ ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n");
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
+
+ test_unmerge();
+ test_unmerge_discarded();
+#ifdef __NR_userfaultfd
+ test_unmerge_uffd_wp();
+#endif
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d tests failed\n",
+ err, ksft_test_num());
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c
index 0d85be2350fa..f9eb4d67e0dd 100644
--- a/tools/testing/selftests/vm/ksm_tests.c
+++ b/tools/testing/selftests/vm/ksm_tests.c
@@ -40,6 +40,7 @@ enum ksm_test_name {
CHECK_KSM_NUMA_MERGE,
KSM_MERGE_TIME,
KSM_MERGE_TIME_HUGE_PAGES,
+ KSM_UNMERGE_TIME,
KSM_COW_TIME
};
@@ -108,7 +109,10 @@ static void print_help(void)
" -P evaluate merging time and speed.\n"
" For this test, the size of duplicated memory area (in MiB)\n"
" must be provided using -s option\n"
- " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
+ " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
+ " For this test, the size of duplicated memory area (in MiB)\n"
+ " must be provided using -s option\n"
+ " -D evaluate unmerging time and speed when disabling KSM.\n"
" For this test, the size of duplicated memory area (in MiB)\n"
" must be provided using -s option\n"
" -C evaluate the time required to break COW of merged pages.\n\n");
@@ -188,6 +192,16 @@ static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time,
return 0;
}
+static int ksm_unmerge_pages(void *addr, size_t size,
+ struct timespec start_time, int timeout)
+{
+ if (madvise(addr, size, MADV_UNMERGEABLE)) {
+ perror("madvise");
+ return 1;
+ }
+ return 0;
+}
+
static bool assert_ksm_pages_count(long dupl_page_count)
{
unsigned long max_page_sharing, pages_sharing, pages_shared;
@@ -560,6 +574,53 @@ err_out:
return KSFT_FAIL;
}
+static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size)
+{
+ void *map_ptr;
+ struct timespec start_time, end_time;
+ unsigned long scan_time_ns;
+
+ map_size *= MB;
+
+ map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+ if (!map_ptr)
+ return KSFT_FAIL;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+ goto err_out;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+ if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout))
+ goto err_out;
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+ perror("clock_gettime");
+ goto err_out;
+ }
+
+ scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ printf("Total size: %lu MiB\n", map_size / MB);
+ printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+ scan_time_ns % NSEC_PER_SEC);
+ printf("Average speed: %.3f MiB/s\n", (map_size / MB) /
+ ((double)scan_time_ns / NSEC_PER_SEC));
+
+ munmap(map_ptr, map_size);
+ return KSFT_PASS;
+
+err_out:
+ printf("Not OK\n");
+ munmap(map_ptr, map_size);
+ return KSFT_FAIL;
+}
+
static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
{
void *map_ptr;
@@ -644,7 +705,7 @@ int main(int argc, char *argv[])
bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
long size_MB = 0;
- while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCH")) != -1) {
+ while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) {
switch (opt) {
case 'a':
prot = str_to_prot(optarg);
@@ -701,6 +762,9 @@ int main(int argc, char *argv[])
case 'H':
test_name = KSM_MERGE_TIME_HUGE_PAGES;
break;
+ case 'D':
+ test_name = KSM_UNMERGE_TIME;
+ break;
case 'C':
test_name = KSM_COW_TIME;
break;
@@ -762,6 +826,14 @@ int main(int argc, char *argv[])
ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
ksm_scan_limit_sec, size_MB);
break;
+ case KSM_UNMERGE_TIME:
+ if (size_MB == 0) {
+ printf("Option '-s' is required.\n");
+ return KSFT_FAIL;
+ }
+ ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
+ ksm_scan_limit_sec, size_MB);
+ break;
case KSM_COW_TIME:
ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
page_size);
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
index 54d7a822c2ce..8984e0bb58c7 100755
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -1,13 +1,88 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
-#please run as root
+# Please run as root
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
exitcode=0
-#get huge pagesize and freepages from /proc/meminfo
+usage() {
+ cat <<EOF
+usage: ${BASH_SOURCE[0]:-$0} [ -h | -t "<categories>"]
+ -t: specify specific categories to tests to run
+ -h: display this message
+
+The default behavior is to run all tests.
+
+Alternatively, specific groups tests can be run by passing a string
+to the -t argument containing one or more of the following categories
+separated by spaces:
+- mmap
+ tests for mmap(2)
+- gup_test
+ tests for gup using gup_test interface
+- userfaultfd
+ tests for userfaultfd(2)
+- compaction
+ a test for the patch "Allow compaction of unevictable pages"
+- mlock
+ tests for mlock(2)
+- mremap
+ tests for mremap(2)
+- hugevm
+ tests for very large virtual address space
+- vmalloc
+ vmalloc smoke tests
+- hmm
+ hmm smoke tests
+- madv_populate
+ test memadvise(2) MADV_POPULATE_{READ,WRITE} options
+- memfd_secret
+ test memfd_secret(2)
+- process_mrelease
+ test process_mrelease(2)
+- ksm
+ ksm tests that do not require >=2 NUMA nodes
+- ksm_numa
+ ksm tests that require >=2 NUMA nodes
+- pkey
+ memory protection key tests
+- soft_dirty
+ test soft dirty page bit semantics
+- cow
+ test copy-on-write semantics
+example: ./run_vmtests.sh -t "hmm mmap ksm"
+EOF
+ exit 0
+}
+
+
+while getopts "ht:" OPT; do
+ case ${OPT} in
+ "h") usage ;;
+ "t") VM_SELFTEST_ITEMS=${OPTARG} ;;
+ esac
+done
+shift $((OPTIND -1))
+
+# default behavior: run all tests
+VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default}
+
+test_selected() {
+ if [ "$VM_SELFTEST_ITEMS" == "default" ]; then
+ # If no VM_SELFTEST_ITEMS are specified, run all tests
+ return 0
+ fi
+ # If test selected argument is one of the test items
+ if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then
+ return 0
+ else
+ return 1
+ fi
+}
+
+# get huge pagesize and freepages from /proc/meminfo
while read -r name size unit; do
if [ "$name" = "HugePages_Free:" ]; then
freepgs="$size"
@@ -27,7 +102,7 @@ hpgsize_MB=$((hpgsize_KB / 1024))
half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128))
needmem_KB=$((half_ufd_size_MB * 2 * 1024))
-#set proper nr_hugepages
+# set proper nr_hugepages
if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
needpgs=$((needmem_KB / hpgsize_KB))
@@ -56,136 +131,144 @@ else
exit 1
fi
-#filter 64bit architectures
+# filter 64bit architectures
ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64"
if [ -z "$ARCH" ]; then
ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/')
fi
VADDR64=0
-echo "$ARCH64STR" | grep "$ARCH" && VADDR64=1
+echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1
# Usage: run_test [test binary] [arbitrary test arguments...]
run_test() {
- local title="running $*"
- local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
- printf "%s\n%s\n%s\n" "$sep" "$title" "$sep"
-
- "$@"
- local ret=$?
- if [ $ret -eq 0 ]; then
- echo "[PASS]"
- elif [ $ret -eq $ksft_skip ]; then
- echo "[SKIP]"
- exitcode=$ksft_skip
- else
- echo "[FAIL]"
- exitcode=1
- fi
+ if test_selected ${CATEGORY}; then
+ local title="running $*"
+ local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
+ printf "%s\n%s\n%s\n" "$sep" "$title" "$sep"
+
+ "$@"
+ local ret=$?
+ if [ $ret -eq 0 ]; then
+ echo "[PASS]"
+ elif [ $ret -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+ else
+ echo "[FAIL]"
+ exitcode=1
+ fi
+ fi # test_selected
}
-run_test ./hugepage-mmap
+CATEGORY="hugetlb" run_test ./hugepage-mmap
shmmax=$(cat /proc/sys/kernel/shmmax)
shmall=$(cat /proc/sys/kernel/shmall)
echo 268435456 > /proc/sys/kernel/shmmax
echo 4194304 > /proc/sys/kernel/shmall
-run_test ./hugepage-shm
+CATEGORY="hugetlb" run_test ./hugepage-shm
echo "$shmmax" > /proc/sys/kernel/shmmax
echo "$shmall" > /proc/sys/kernel/shmall
-run_test ./map_hugetlb
-run_test ./hugepage-mremap
-run_test ./hugepage-vmemmap
-run_test ./hugetlb-madvise
+CATEGORY="hugetlb" run_test ./map_hugetlb
+CATEGORY="hugetlb" run_test ./hugepage-mremap
+CATEGORY="hugetlb" run_test ./hugepage-vmemmap
+CATEGORY="hugetlb" run_test ./hugetlb-madvise
-echo "NOTE: The above hugetlb tests provide minimal coverage. Use"
-echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
-echo " hugetlb regression testing."
+if test_selected "hugetlb"; then
+ echo "NOTE: These hugetlb tests provide minimal coverage. Use"
+ echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
+ echo " hugetlb regression testing."
+fi
-run_test ./map_fixed_noreplace
+CATEGORY="mmap" run_test ./map_fixed_noreplace
# get_user_pages_fast() benchmark
-run_test ./gup_test -u
+CATEGORY="gup_test" run_test ./gup_test -u
# pin_user_pages_fast() benchmark
-run_test ./gup_test -a
+CATEGORY="gup_test" run_test ./gup_test -a
# Dump pages 0, 19, and 4096, using pin_user_pages:
-run_test ./gup_test -ct -F 0x1 0 19 0x1000
+CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000
uffd_mods=("" ":dev")
for mod in "${uffd_mods[@]}"; do
- run_test ./userfaultfd anon${mod} 20 16
+ CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16
# Hugetlb tests require source and destination huge pages. Pass in half
# the size ($half_ufd_size_MB), which is used for *each*.
- run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
- run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32
- run_test ./userfaultfd shmem${mod} 20 16
+ CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
+ CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32
+ CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16
done
#cleanup
echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
-run_test ./compaction_test
+CATEGORY="compaction" run_test ./compaction_test
-run_test sudo -u nobody ./on-fault-limit
+CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
-run_test ./map_populate
+CATEGORY="mmap" run_test ./map_populate
-run_test ./mlock-random-test
+CATEGORY="mlock" run_test ./mlock-random-test
-run_test ./mlock2-tests
+CATEGORY="mlock" run_test ./mlock2-tests
-run_test ./mrelease_test
+CATEGORY="process_mrelease" run_test ./mrelease_test
-run_test ./mremap_test
+CATEGORY="mremap" run_test ./mremap_test
-run_test ./thuge-gen
+CATEGORY="hugetlb" run_test ./thuge-gen
if [ $VADDR64 -ne 0 ]; then
- run_test ./virtual_address_range
+ CATEGORY="hugevm" run_test ./virtual_address_range
# virtual address 128TB switch test
- run_test ./va_128TBswitch.sh
+ CATEGORY="hugevm" run_test ./va_128TBswitch.sh
fi # VADDR64
# vmalloc stability smoke test
-run_test ./test_vmalloc.sh smoke
+CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke
-run_test ./mremap_dontunmap
+CATEGORY="mremap" run_test ./mremap_dontunmap
-run_test ./test_hmm.sh smoke
+CATEGORY="hmm" run_test ./test_hmm.sh smoke
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
-run_test ./madv_populate
+CATEGORY="madv_populate" run_test ./madv_populate
-run_test ./memfd_secret
+CATEGORY="memfd_secret" run_test ./memfd_secret
# KSM MADV_MERGEABLE test with 10 identical pages
-run_test ./ksm_tests -M -p 10
+CATEGORY="ksm" run_test ./ksm_tests -M -p 10
# KSM unmerge test
-run_test ./ksm_tests -U
+CATEGORY="ksm" run_test ./ksm_tests -U
# KSM test with 10 zero pages and use_zero_pages = 0
-run_test ./ksm_tests -Z -p 10 -z 0
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0
# KSM test with 10 zero pages and use_zero_pages = 1
-run_test ./ksm_tests -Z -p 10 -z 1
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1
# KSM test with 2 NUMA nodes and merge_across_nodes = 1
-run_test ./ksm_tests -N -m 1
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1
# KSM test with 2 NUMA nodes and merge_across_nodes = 0
-run_test ./ksm_tests -N -m 0
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0
+
+CATEGORY="ksm" run_test ./ksm_functional_tests
+
+run_test ./ksm_functional_tests
# protection_keys tests
if [ -x ./protection_keys_32 ]
then
- run_test ./protection_keys_32
+ CATEGORY="pkey" run_test ./protection_keys_32
fi
if [ -x ./protection_keys_64 ]
then
- run_test ./protection_keys_64
+ CATEGORY="pkey" run_test ./protection_keys_64
fi
-run_test ./soft-dirty
+CATEGORY="soft_dirty" run_test ./soft-dirty
-# COW tests for anonymous memory
-run_test ./cow
+# COW tests
+CATEGORY="cow" run_test ./cow
exit $exitcode
diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c
index 5bbf7641a0f0..710571902743 100644
--- a/tools/testing/selftests/vm/vm_util.c
+++ b/tools/testing/selftests/vm/vm_util.c
@@ -43,6 +43,16 @@ bool pagemap_is_populated(int fd, char *start)
return entry & 0xc000000000000000ull;
}
+unsigned long pagemap_get_pfn(int fd, char *start)
+{
+ uint64_t entry = pagemap_get_entry(fd, start);
+
+ /* If present (63th bit), PFN is at bit 0 -- 54. */
+ if (entry & 0x8000000000000000ull)
+ return entry & 0x007fffffffffffffull;
+ return -1ull;
+}
+
void clear_softdirty(void)
{
int ret;
diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h
index 80d5a6ad413b..1995ee911ef2 100644
--- a/tools/testing/selftests/vm/vm_util.h
+++ b/tools/testing/selftests/vm/vm_util.h
@@ -6,6 +6,7 @@ uint64_t pagemap_get_entry(int fd, char *start);
bool pagemap_is_softdirty(int fd, char *start);
bool pagemap_is_swapped(int fd, char *start);
bool pagemap_is_populated(int fd, char *start);
+unsigned long pagemap_get_pfn(int fd, char *start);
void clear_softdirty(void);
bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
uint64_t read_pmd_pagesize(void);