summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2020-08-18 14:16:22 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2020-08-18 14:16:22 +1000
commitc9467412a10938a685af9561566d709af6a9f18c (patch)
tree5ed0af2a00499e75a41fc33ad86dfe18f385d1f9
parent804bd0188eb3284f2a873a0f3e45eeccb361f06f (diff)
parentf0b863ac0f643ae15e8463e773fdcb33c9dd6c3c (diff)
downloadlinux-c9467412a10938a685af9561566d709af6a9f18c.tar.gz
linux-c9467412a10938a685af9561566d709af6a9f18c.tar.xz
Merge branch 'akpm/master' into master
-rw-r--r--arch/alpha/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/arm64/include/asm/unistd.h2
-rw-r--r--arch/arm64/include/asm/unistd32.h2
-rw-r--r--arch/ia64/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/m68k/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n32.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n64.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_o32.tbl1
-rw-r--r--arch/parisc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sh/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sparc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl1
-rw-r--r--fs/io_uring.c2
-rw-r--r--include/linux/compat.h4
-rw-r--r--include/linux/mm.h3
-rw-r--r--include/linux/pid.h1
-rw-r--r--include/linux/syscalls.h2
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--kernel/exit.c17
-rw-r--r--kernel/pid.c17
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--mm/madvise.c190
-rw-r--r--mm/memory-failure.c18
-rw-r--r--mm/memory_hotplug.c46
30 files changed, 245 insertions, 82 deletions
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 4affd5b6150c..24c4a4db5c2e 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -481,3 +481,4 @@
549 common faccessat2 sys_faccessat2
550 common watch_mount sys_watch_mount
551 common fsinfo sys_fsinfo
+552 common process_madvise sys_process_madvise
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index a68443d0a8ee..d345ab40f747 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -455,3 +455,4 @@
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo
+442 common process_madvise sys_process_madvise
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 86a9d7b3eabe..949788f5ba40 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 442
+#define __NR_compat_syscalls 443
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index c10990736770..34a614cc0130 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -891,6 +891,8 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
__SYSCALL(__NR_watch_mount, sys_watch_mount)
#define __NR_fsinfo 441
__SYSCALL(__NR_fsinfo, sys_fsinfo)
+#define __NR_process_madvise 442
+__SYSCALL(__NR_process_madvise, compat_sys_process_madvise)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 8d1611dbffe4..437532a411b5 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -362,3 +362,4 @@
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo
+442 common process_madvise sys_process_madvise
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 079b85781168..04fbee8f5e17 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -441,3 +441,4 @@
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo
+442 common process_madvise sys_process_madvise
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index e596b24209b3..1e05fce04016 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -447,3 +447,4 @@
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo
+442 common process_madvise sys_process_madvise
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 84ebfee47acb..a204f01c1c24 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -380,3 +380,4 @@
439 n32 faccessat2 sys_faccessat2
440 n32 watch_mount sys_watch_mount
441 n32 fsinfo sys_fsinfo
+442 n32 process_madvise compat_sys_process_madvise
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index ca88651df43a..0db6e3d540f3 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -356,3 +356,4 @@
439 n64 faccessat2 sys_faccessat2
440 n64 watch_mount sys_watch_mount
441 n64 fsinfo sys_fsinfo
+442 n64 process_madvise sys_process_madvise
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 02dddb858562..c531231a2383 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -429,3 +429,4 @@
439 o32 faccessat2 sys_faccessat2
440 o32 watch_mount sys_watch_mount
441 o32 fsinfo sys_fsinfo
+442 o32 process_madvise sys_process_madvise compat_sys_process_madvise
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index b7646de8ca8f..48090b11f1f2 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo
+442 common process_madvise sys_process_madvise compat_sys_process_madvise
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 3fa45ad7666b..56c9df0f539d 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -531,3 +531,4 @@
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo
+442 common process_madvise sys_process_madvise compat_sys_process_madvise
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 60169f960745..88a91abc0bca 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -444,3 +444,4 @@
439 common faccessat2 sys_faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo sys_fsinfo
+442 common process_madvise sys_process_madvise compat_sys_process_madvise
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 6ff815773779..5de0035a9967 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -444,3 +444,4 @@
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo
+442 common process_madvise sys_process_madvise
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 93efb7eacad5..42e68b35bdb4 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -487,3 +487,4 @@
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo
+442 common process_madvise sys_process_madvise compat_sys_process_madvise
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index cdac678ee140..9e0cdf55ca56 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -446,3 +446,4 @@
439 i386 faccessat2 sys_faccessat2
440 i386 watch_mount sys_watch_mount
441 i386 fsinfo sys_fsinfo
+442 i386 process_madvise sys_process_madvise compat_sys_process_madvise
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 06001131634b..6ee78ddd03d7 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -363,6 +363,7 @@
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo
+442 64 process_madvise sys_process_madvise
#
# x32-specific system call numbers start at 512 to avoid cache impact
@@ -406,3 +407,4 @@
545 x32 execveat compat_sys_execveat
546 x32 preadv2 compat_sys_preadv64v2
547 x32 pwritev2 compat_sys_pwritev64v2
+548 x32 process_madvise compat_sys_process_madvise
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 709e8e4e2b58..f74cbc2fa7c8 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -412,3 +412,4 @@
439 common faccessat2 sys_faccessat2
440 common watch_mount sys_watch_mount
441 common fsinfo sys_fsinfo
+442 common process_madvise sys_process_madvise
diff --git a/fs/io_uring.c b/fs/io_uring.c
index dc506b75659c..93ad3e2a1e1f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3855,7 +3855,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock)
if (force_nonblock)
return -EAGAIN;
- ret = do_madvise(ma->addr, ma->len, ma->advice);
+ ret = do_madvise(NULL, current->mm, ma->addr, ma->len, ma->advice);
if (ret < 0)
req_set_fail_links(req);
io_req_complete(req, ret);
diff --git a/include/linux/compat.h b/include/linux/compat.h
index d38c4d7e83bd..4f374e6a7ed8 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -823,6 +823,10 @@ asmlinkage long compat_sys_pwritev64v2(unsigned long fd,
unsigned long vlen, loff_t pos, rwf_t flags);
#endif
+asmlinkage ssize_t compat_sys_process_madvise(compat_int_t pidfd,
+ const struct compat_iovec __user *vec,
+ compat_ulong_t vlen, compat_int_t behavior,
+ compat_uint_t flags);
/*
* Deprecated system calls which are still defined in
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a911aa3114ef..8ab941cf73f4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2548,7 +2548,8 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
-extern int do_madvise(unsigned long start, size_t len_in, int behavior);
+extern int do_madvise(struct task_struct *target_task, struct mm_struct *mm,
+ unsigned long start, size_t len_in, int behavior);
#ifdef CONFIG_MMU
extern int __mm_populate(unsigned long addr, unsigned long len,
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 176d6cf80e7c..86e0e7cb7872 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -77,6 +77,7 @@ extern const struct file_operations pidfd_fops;
struct file;
extern struct pid *pidfd_pid(const struct file *file);
+struct pid *pidfd_get_pid(unsigned int fd);
static inline struct pid *get_pid(struct pid *pid)
{
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 8b9d12c7a447..88520eb343b8 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -880,6 +880,8 @@ asmlinkage long sys_munlockall(void);
asmlinkage long sys_mincore(unsigned long start, size_t len,
unsigned char __user * vec);
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
+asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
+ unsigned long vlen, int behavior, unsigned int flags);
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
unsigned long prot, unsigned long pgoff,
unsigned long flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index fa3d6815466a..c0b5f8b609eb 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -863,9 +863,11 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2)
__SYSCALL(__NR_watch_mount, sys_watch_mount)
#define __NR_fsinfo 441
__SYSCALL(__NR_fsinfo, sys_fsinfo)
+#define __NR_process_madvise 442
+__SC_COMP(__NR_process_madvise, sys_process_madvise, compat_sys_process_madvise)
#undef __NR_syscalls
-#define __NR_syscalls 442
+#define __NR_syscalls 443
/*
* 32 bit systems traditionally used different
diff --git a/kernel/exit.c b/kernel/exit.c
index 733e80f334e7..62912406d74a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1474,23 +1474,6 @@ end:
return retval;
}
-static struct pid *pidfd_get_pid(unsigned int fd)
-{
- struct fd f;
- struct pid *pid;
-
- f = fdget(fd);
- if (!f.file)
- return ERR_PTR(-EBADF);
-
- pid = pidfd_pid(f.file);
- if (!IS_ERR(pid))
- get_pid(pid);
-
- fdput(f);
- return pid;
-}
-
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
int options, struct rusage *ru)
{
diff --git a/kernel/pid.c b/kernel/pid.c
index b2562a7ce525..a9cbab0194d9 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -519,6 +519,23 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return idr_get_next(&ns->idr, &nr);
}
+struct pid *pidfd_get_pid(unsigned int fd)
+{
+ struct fd f;
+ struct pid *pid;
+
+ f = fdget(fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ pid = pidfd_pid(f.file);
+ if (!IS_ERR(pid))
+ get_pid(pid);
+
+ fdput(f);
+ return pid;
+}
+
/**
* pidfd_create() - Create a new pid file descriptor.
*
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 8a4ae67bb08b..b5a30f4ea460 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -284,6 +284,8 @@ COND_SYSCALL(mlockall);
COND_SYSCALL(munlockall);
COND_SYSCALL(mincore);
COND_SYSCALL(madvise);
+COND_SYSCALL(process_madvise);
+COND_SYSCALL_COMPAT(process_madvise);
COND_SYSCALL(remap_file_pages);
COND_SYSCALL(mbind);
COND_SYSCALL_COMPAT(mbind);
diff --git a/mm/madvise.c b/mm/madvise.c
index b59bbd6c9982..843f6fad3b89 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,17 +17,20 @@
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/compat.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
#include <linux/sched/mm.h>
+#include <linux/uio.h>
#include <asm/tlb.h>
@@ -36,6 +39,7 @@
struct madvise_walk_private {
struct mmu_gather *tlb;
bool pageout;
+ struct task_struct *target_task;
};
/*
@@ -255,6 +259,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
+ struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
loff_t offset;
@@ -289,12 +294,12 @@ static long madvise_willneed(struct vm_area_struct *vma,
*/
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
get_file(file);
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return 0;
}
@@ -315,6 +320,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (fatal_signal_pending(current))
return -EINTR;
+ if (private->target_task &&
+ fatal_signal_pending(private->target_task))
+ return -EINTR;
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmd)) {
pmd_t orig_pmd;
@@ -476,12 +485,14 @@ static const struct mm_walk_ops cold_walk_ops = {
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
+ struct task_struct *task,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
struct madvise_walk_private walk_private = {
.pageout = false,
.tlb = tlb,
+ .target_task = task,
};
tlb_start_vma(tlb, vma);
@@ -489,7 +500,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
tlb_end_vma(tlb, vma);
}
-static long madvise_cold(struct vm_area_struct *vma,
+static long madvise_cold(struct task_struct *task,
+ struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
{
@@ -502,19 +514,21 @@ static long madvise_cold(struct vm_area_struct *vma,
lru_add_drain();
tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
- madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ madvise_cold_page_range(&tlb, task, vma, start_addr, end_addr);
tlb_finish_mmu(&tlb, start_addr, end_addr);
return 0;
}
static void madvise_pageout_page_range(struct mmu_gather *tlb,
+ struct task_struct *task,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
struct madvise_walk_private walk_private = {
.pageout = true,
.tlb = tlb,
+ .target_task = task,
};
tlb_start_vma(tlb, vma);
@@ -538,7 +552,8 @@ static inline bool can_do_pageout(struct vm_area_struct *vma)
inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}
-static long madvise_pageout(struct vm_area_struct *vma,
+static long madvise_pageout(struct task_struct *task,
+ struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
{
@@ -554,7 +569,7 @@ static long madvise_pageout(struct vm_area_struct *vma,
lru_add_drain();
tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
- madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ madvise_pageout_page_range(&tlb, task, vma, start_addr, end_addr);
tlb_finish_mmu(&tlb, start_addr, end_addr);
return 0;
@@ -683,7 +698,6 @@ out:
if (nr_swap) {
if (current->mm == mm)
sync_mm_rss(mm);
-
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
}
arch_leave_lazy_mmu_mode();
@@ -763,6 +777,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
int behavior)
{
+ struct mm_struct *mm = vma->vm_mm;
+
*prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
@@ -770,8 +786,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
- mmap_read_lock(current->mm);
- vma = find_vma(current->mm, start);
+ mmap_read_lock(mm);
+ vma = find_vma(mm, start);
if (!vma)
return -ENOMEM;
if (start < vma->vm_start) {
@@ -825,6 +841,7 @@ static long madvise_remove(struct vm_area_struct *vma,
loff_t offset;
int error;
struct file *f;
+ struct mm_struct *mm = vma->vm_mm;
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
@@ -852,13 +869,13 @@ static long madvise_remove(struct vm_area_struct *vma,
get_file(f);
if (userfaultfd_remove(vma, start, end)) {
/* mmap_lock was not released by userfaultfd_remove() */
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return error;
}
@@ -925,7 +942,8 @@ static int madvise_inject_error(int behavior,
#endif
static long
-madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+madvise_vma(struct task_struct *task, struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
switch (behavior) {
@@ -934,9 +952,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_COLD:
- return madvise_cold(vma, prev, start, end);
+ return madvise_cold(task, vma, prev, start, end);
case MADV_PAGEOUT:
- return madvise_pageout(vma, prev, start, end);
+ return madvise_pageout(task, vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -983,6 +1001,18 @@ madvise_behavior_valid(int behavior)
}
}
+static bool
+process_madvise_behavior_valid(int behavior)
+{
+ switch (behavior) {
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* The madvise(2) system call.
*
@@ -1030,6 +1060,11 @@ madvise_behavior_valid(int behavior)
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ * MADV_COLD - the application is not expected to use this memory soon,
+ * deactivate pages in this range so that they can be reclaimed
+ * easily if memory pressure hanppens.
+ * MADV_PAGEOUT - the application is not expected to use this memory soon,
+ * page out the pages in this range immediately.
*
* return values:
* zero - success
@@ -1044,7 +1079,8 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-int do_madvise(unsigned long start, size_t len_in, int behavior)
+int do_madvise(struct task_struct *target_task, struct mm_struct *mm,
+ unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@@ -1082,7 +1118,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
write = madvise_need_mmap_write(behavior);
if (write) {
- if (mmap_write_lock_killable(current->mm))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
/*
@@ -1097,12 +1133,12 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
* but for now we have the mmget_still_valid()
* model.
*/
- if (!mmget_still_valid(current->mm)) {
- mmap_write_unlock(current->mm);
+ if (!mmget_still_valid(mm)) {
+ mmap_write_unlock(mm);
return -EINTR;
}
} else {
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
}
/*
@@ -1110,7 +1146,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
- vma = find_vma_prev(current->mm, start, &prev);
+ vma = find_vma_prev(mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
@@ -1135,7 +1171,8 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
tmp = end;
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
- error = madvise_vma(vma, &prev, start, tmp, behavior);
+ error = madvise_vma(target_task, vma, &prev,
+ start, tmp, behavior);
if (error)
goto out;
start = tmp;
@@ -1147,19 +1184,122 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_lock */
- vma = find_vma(current->mm, start);
+ vma = find_vma(mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(mm);
else
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
- return do_madvise(start, len_in, behavior);
+ return do_madvise(current, current->mm, start, len_in, behavior);
+}
+
+static int process_madvise_vec(struct task_struct *target_task,
+ struct mm_struct *mm, struct iov_iter *iter, int behavior)
+{
+ struct iovec iovec;
+ int ret = 0;
+
+ while (iov_iter_count(iter)) {
+ iovec = iov_iter_iovec(iter);
+ ret = do_madvise(target_task, mm, (unsigned long)iovec.iov_base,
+ iovec.iov_len, behavior);
+ if (ret < 0)
+ break;
+ iov_iter_advance(iter, iovec.iov_len);
+ }
+
+ return ret;
+}
+
+static ssize_t do_process_madvise(int pidfd, struct iov_iter *iter,
+ int behavior, unsigned int flags)
+{
+ ssize_t ret;
+ struct pid *pid;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ size_t total_len = iov_iter_count(iter);
+
+ if (flags != 0)
+ return -EINVAL;
+
+ pid = pidfd_get_pid(pidfd);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ ret = -ESRCH;
+ goto put_pid;
+ }
+
+ if (task->mm != current->mm &&
+ !process_madvise_behavior_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_task;
+ }
+
+ mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+ if (IS_ERR_OR_NULL(mm)) {
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ goto release_task;
+ }
+
+ ret = process_madvise_vec(task, mm, iter, behavior);
+ if (ret >= 0)
+ ret = total_len - iov_iter_count(iter);
+
+ mmput(mm);
+release_task:
+ put_task_struct(task);
+put_pid:
+ put_pid(pid);
+ return ret;
+}
+
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+ unsigned long, vlen, int, behavior, unsigned int, flags)
+{
+ ssize_t ret;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+
+ ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret >= 0) {
+ ret = do_process_madvise(pidfd, &iter, behavior, flags);
+ kfree(iov);
+ }
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE5(process_madvise, compat_int_t, pidfd,
+ const struct compat_iovec __user *, vec,
+ compat_ulong_t, vlen,
+ compat_int_t, behavior,
+ compat_uint_t, flags)
+
+{
+ ssize_t ret;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+
+ ret = compat_import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret >= 0) {
+ ret = do_process_madvise(pidfd, &iter, behavior, flags);
+ kfree(iov);
+ }
+ return ret;
}
+#endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d979a837d84b..a229d4694954 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1673,16 +1673,6 @@ int unpoison_memory(unsigned long pfn)
}
EXPORT_SYMBOL(unpoison_memory);
-static struct page *new_page(struct page *p, unsigned long private)
-{
- struct migration_target_control mtc = {
- .nid = page_to_nid(p),
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
- };
-
- return alloc_migration_target(p, (unsigned long)&mtc);
-}
-
/*
* Safely get reference count of an arbitrary page.
* Returns 0 for a free page, -EIO for a zero refcount page
@@ -1794,6 +1784,10 @@ static int __soft_offline_page(struct page *page)
char const *msg_page[] = {"page", "hugepage"};
bool huge = PageHuge(page);
LIST_HEAD(pagelist);
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
/*
* Check PageHWPoison again inside page lock because PageHWPoison
@@ -1829,8 +1823,8 @@ static int __soft_offline_page(struct page *page)
}
if (isolate_page(hpage, &pagelist)) {
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (!ret) {
bool release = !huge;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e9d5ab5d3ca0..7f62d69660e0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1274,27 +1274,6 @@ found:
return 0;
}
-static struct page *new_node_page(struct page *page, unsigned long private)
-{
- nodemask_t nmask = node_states[N_MEMORY];
- struct migration_target_control mtc = {
- .nid = page_to_nid(page),
- .nmask = &nmask,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
- };
-
- /*
- * try to allocate from a different node but reuse this node if there
- * are no other online nodes to be used (e.g. we are offlining a part
- * of the only existing node)
- */
- node_clear(mtc.nid, nmask);
- if (nodes_empty(nmask))
- node_set(mtc.nid, nmask);
-
- return alloc_migration_target(page, (unsigned long)&mtc);
-}
-
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -1354,9 +1333,28 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
put_page(page);
}
if (!list_empty(&source)) {
- /* Allocate a new page from the nearest neighbor node */
- ret = migrate_pages(&source, new_node_page, NULL, 0,
- MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
+
+ /*
+ * We have checked that migration range is on a single zone so
+ * we can use the nid of the first page to all the others.
+ */
+ mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+ /*
+ * try to allocate from a different node but reuse this node
+ * if there are no other online nodes to be used (e.g. we are
+ * offlining a part of the only existing node)
+ */
+ node_clear(mtc.nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(mtc.nid, nmask);
+ ret = migrate_pages(&source, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret) {
list_for_each_entry(page, &source, lru) {
pr_warn("migrating pfn %lx failed ret:%d ",