diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2021-05-21 15:54:39 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2021-05-21 15:54:39 +1000 |
commit | a219796026e98e9a658d4e4fdb89a6b5c91daf42 (patch) | |
tree | 727a483f7c3f750e15d6a30b77c5a266102c34e5 | |
parent | 64aeacd14c46e1718ed728b2adfbc23f3e6dccae (diff) | |
parent | 8b6efe1906ea0f4d55b547c7e55e5489ad8decdc (diff) | |
download | linux-a219796026e98e9a658d4e4fdb89a6b5c91daf42.tar.gz linux-a219796026e98e9a658d4e4fdb89a6b5c91daf42.tar.xz |
Merge branch 'akpm/master'
49 files changed, 1096 insertions, 167 deletions
diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 1dd4ebf0be79..a68730e7cfac 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -126,6 +126,17 @@ used when printing stack backtraces. The specifier takes into consideration the effect of compiler optimisations which may occur when tail-calls are used and marked with the noreturn GCC attribute. +If the pointer is within a module, the module name and optionally build ID is +printed after the symbol name with an extra ``b`` appended to the end of the +specifier. + +:: + %pS versatile_init+0x0/0x110 [module_name] + %pSb versatile_init+0x0/0x110 [module_name ed5019fdf5e53be37cb1ba7899292d7e143b259e] + %pSRb versatile_init+0x9/0x110 [module_name ed5019fdf5e53be37cb1ba7899292d7e143b259e] + (with __builtin_extract_return_addr() translation) + %pBb prev_fn_of_versatile_init+0x88/0x88 [module_name ed5019fdf5e53be37cb1ba7899292d7e143b259e] + Probed Pointers from BPF / tracing ---------------------------------- diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 26889dbfe904..64202010b700 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -3,7 +3,6 @@ generic-y += early_ioremap.h generic-y += mcs_spinlock.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += set_memory.h generic-y += user.h generated-y += cpucaps.h diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 52e5c1623224..4e3c13799735 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -131,12 +131,6 @@ static __always_inline void __flush_icache_all(void) dsb(ish); } -int set_memory_valid(unsigned long addr, int numpages, int enable); - -int set_direct_map_invalid_noflush(struct page *page); -int set_direct_map_default_noflush(struct page *page); -bool kernel_page_present(struct page *page); - #include <asm-generic/cacheflush.h> #endif /* __ASM_CACHEFLUSH_H */ diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h index d061176d57ea..aa855c6a0ae6 100644 --- a/arch/arm64/include/asm/kfence.h +++ b/arch/arm64/include/asm/kfence.h @@ -8,7 +8,7 @@ #ifndef __ASM_KFENCE_H #define __ASM_KFENCE_H -#include <asm/cacheflush.h> +#include <asm/set_memory.h> static inline bool arch_kfence_init_pool(void) { return true; } diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h new file mode 100644 index 000000000000..0f740b781187 --- /dev/null +++ b/arch/arm64/include/asm/set_memory.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _ASM_ARM64_SET_MEMORY_H +#define _ASM_ARM64_SET_MEMORY_H + +#include <asm-generic/set_memory.h> + +bool can_set_direct_map(void); +#define can_set_direct_map can_set_direct_map + +int set_memory_valid(unsigned long addr, int numpages, int enable); + +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); +bool kernel_page_present(struct page *page); + +#endif /* _ASM_ARM64_SET_MEMORY_H */ diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h index f83a70e07df8..ce2ee8f1e361 100644 --- a/arch/arm64/include/uapi/asm/unistd.h +++ b/arch/arm64/include/uapi/asm/unistd.h @@ -20,5 +20,6 @@ #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_TIME32_SYSCALLS #define __ARCH_WANT_SYS_CLONE3 +#define __ARCH_WANT_MEMFD_SECRET #include <asm-generic/unistd.h> diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index 90a335c74442..0ec94e718724 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/kexec.h> #include <linux/page-flags.h> +#include <linux/set_memory.h> #include <linux/smp.h> #include <asm/cacheflush.h> diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index de07147a7926..073eca63488f 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -155,7 +155,7 @@ NOKPROBE_SYMBOL(walk_stackframe); static void dump_backtrace_entry(unsigned long where, const char *loglvl) { - printk("%s %pS\n", loglvl, (void *)where); + printk("%s %pSb\n", loglvl, (void *)where); } void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 550221f3c178..e3b639e2461d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -22,6 +22,7 @@ #include <linux/io.h> #include <linux/mm.h> #include <linux/vmalloc.h> +#include <linux/set_memory.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -515,7 +516,7 @@ static void __init map_mem(pgd_t *pgdp) */ BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end)); - if (rodata_full || crash_mem_map || debug_pagealloc_enabled()) + if (can_set_direct_map() || crash_mem_map) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* @@ -1487,8 +1488,7 @@ int arch_add_memory(int nid, u64 start, u64 size, * KFENCE requires linear map to be mapped at page granularity, so that * it is possible to protect/unprotect single pages in the KFENCE pool. */ - if (rodata_full || debug_pagealloc_enabled() || - IS_ENABLED(CONFIG_KFENCE)) + if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE)) flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 92eccaf595c8..a3bacd79507a 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -19,6 +19,11 @@ struct page_change_data { bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED); +bool can_set_direct_map(void) +{ + return rodata_full || debug_pagealloc_enabled(); +} + static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; @@ -155,7 +160,7 @@ int set_direct_map_invalid_noflush(struct page *page) .clear_mask = __pgprot(PTE_VALID), }; - if (!debug_pagealloc_enabled() && !rodata_full) + if (!can_set_direct_map()) return 0; return apply_to_page_range(&init_mm, @@ -170,7 +175,7 @@ int set_direct_map_default_noflush(struct page *page) .clear_mask = __pgprot(PTE_RDONLY), }; - if (!debug_pagealloc_enabled() && !rodata_full) + if (!can_set_direct_map()) return 0; return apply_to_page_range(&init_mm, @@ -181,7 +186,7 @@ int set_direct_map_default_noflush(struct page *page) #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { - if (!debug_pagealloc_enabled() && !rodata_full) + if (!can_set_direct_map()) return; set_memory_valid((unsigned long)page_address(page), numpages, enable); @@ -206,7 +211,7 @@ bool kernel_page_present(struct page *page) pte_t *ptep; unsigned long addr = (unsigned long)page_address(page); - if (!debug_pagealloc_enabled() && !rodata_full) + if (!can_set_direct_map()) return true; pgdp = pgd_offset_k(addr); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a8ad8eb76120..c426e7d20907 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -26,8 +26,8 @@ config RISCV select ARCH_HAS_KCOV select ARCH_HAS_MMIOWB select ARCH_HAS_PTE_SPECIAL - select ARCH_HAS_SET_DIRECT_MAP - select ARCH_HAS_SET_MEMORY + select ARCH_HAS_SET_DIRECT_MAP if MMU + select ARCH_HAS_SET_MEMORY if MMU select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h index 977ee6181dab..6c316093a1e5 100644 --- a/arch/riscv/include/asm/unistd.h +++ b/arch/riscv/include/asm/unistd.h @@ -9,6 +9,7 @@ */ #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_MEMFD_SECRET #include <uapi/asm/unistd.h> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4bbc267fb36b..811b0fdb4adf 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -451,3 +451,4 @@ 444 i386 landlock_create_ruleset sys_landlock_create_ruleset 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self +447 i386 memfd_secret sys_memfd_secret diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index ce18119ea0d0..2f9296382df8 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -368,6 +368,7 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +447 common memfd_secret sys_memfd_secret # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 299c20f0a38b..ea4fe192189d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -69,7 +69,7 @@ static void printk_stack_address(unsigned long address, int reliable, const char *log_lvl) { touch_nmi_watchdog(); - printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); + printk("%s %s%pBb\n", log_lvl, reliable ? "" : "? ", (void *)address); } static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 40232f90db6e..3b7a0ff4642f 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -8,5 +8,13 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); +int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); + +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE) +extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX]; +void init_vmlinux_build_id(void); +#else +static inline void init_vmlinux_build_id(void) { } +#endif #endif diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 206bde8308b2..de62a722431e 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -38,8 +38,12 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_OSRELEASE(value) \ vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_BUILD_ID(value) \ - vmcoreinfo_append_str("BUILD-ID=%s\n", value) +#define VMCOREINFO_BUILD_ID() \ + ({ \ + static_assert(sizeof(vmlinux_build_id) == 20); \ + vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \ + }) + #define VMCOREINFO_PAGESIZE(value) \ vmcoreinfo_append_str("PAGESIZE=%ld\n", value) #define VMCOREINFO_SYMBOL(name) \ @@ -69,10 +73,6 @@ extern unsigned char *vmcoreinfo_data; extern size_t vmcoreinfo_size; extern u32 *vmcoreinfo_note; -/* raw contents of kernel .notes section */ -extern const void __start_notes __weak; -extern const void __stop_notes __weak; - Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 5cdc6903abca..1a7968cf7679 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -7,6 +7,7 @@ #define _LINUX_KALLSYMS_H #include <linux/errno.h> +#include <linux/buildid.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/mm.h> @@ -15,8 +16,9 @@ #include <asm/sections.h> #define KSYM_NAME_LEN 512 -#define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ - 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) +#define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + (KSYM_NAME_LEN - 1) + \ + 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \ + (BUILD_ID_SIZE_MAX * 2) + 1) struct cred; struct module; @@ -91,8 +93,10 @@ const char *kallsyms_lookup(unsigned long addr, /* Look up a kernel symbol and return it in a text buffer. */ extern int sprint_symbol(char *buffer, unsigned long address); +extern int sprint_symbol_build_id(char *buffer, unsigned long address); extern int sprint_symbol_no_offset(char *buffer, unsigned long address); extern int sprint_backtrace(char *buffer, unsigned long address); +extern int sprint_backtrace_build_id(char *buffer, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); @@ -128,6 +132,12 @@ static inline int sprint_symbol(char *buffer, unsigned long addr) return 0; } +static inline int sprint_symbol_build_id(char *buffer, unsigned long address) +{ + *buffer = '\0'; + return 0; +} + static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr) { *buffer = '\0'; @@ -140,6 +150,12 @@ static inline int sprint_backtrace(char *buffer, unsigned long addr) return 0; } +static inline int sprint_backtrace_build_id(char *buffer, unsigned long addr) +{ + *buffer = '\0'; + return 0; +} + static inline int lookup_symbol_name(unsigned long addr, char *symname) { return -ERANGE; diff --git a/include/linux/module.h b/include/linux/module.h index 8100bb477d86..8a298d820dbc 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -11,6 +11,7 @@ #include <linux/list.h> #include <linux/stat.h> +#include <linux/buildid.h> #include <linux/compiler.h> #include <linux/cache.h> #include <linux/kmod.h> @@ -369,6 +370,11 @@ struct module { /* Unique handle for this module */ char name[MODULE_NAME_LEN]; +#ifdef CONFIG_STACKTRACE_BUILD_ID + /* Module build ID */ + unsigned char build_id[BUILD_ID_SIZE_MAX]; +#endif + /* Sysfs stuff. */ struct module_kobject mkobj; struct module_attribute *modinfo_attrs; @@ -636,7 +642,7 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr); const char *module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, - char **modname, + char **modname, const unsigned char **modbuildid, char *namebuf); int lookup_module_symbol_name(unsigned long addr, char *symname); int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); @@ -740,6 +746,7 @@ static inline const char *module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, + const unsigned char **modbuildid, char *namebuf) { return NULL; diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h new file mode 100644 index 000000000000..21c3771e6a56 --- /dev/null +++ b/include/linux/secretmem.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_SECRETMEM_H +#define _LINUX_SECRETMEM_H + +#ifdef CONFIG_SECRETMEM + +extern const struct address_space_operations secretmem_aops; + +static inline bool page_is_secretmem(struct page *page) +{ + struct address_space *mapping; + + /* + * Using page_mapping() is quite slow because of the actual call + * instruction and repeated compound_head(page) inside the + * page_mapping() function. + * We know that secretmem pages are not compound and LRU so we can + * save a couple of cycles here. + */ + if (PageCompound(page) || !PageLRU(page)) + return false; + + mapping = (struct address_space *) + ((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); + + if (mapping != page->mapping) + return false; + + return mapping->a_ops == &secretmem_aops; +} + +bool vma_is_secretmem(struct vm_area_struct *vma); +bool secretmem_active(void); + +#else + +static inline bool vma_is_secretmem(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool page_is_secretmem(struct page *page) +{ + return false; +} + +static inline bool secretmem_active(void) +{ + return false; +} + +#endif /* CONFIG_SECRETMEM */ + +#endif /* _LINUX_SECRETMEM_H */ diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index fe1aa4e54680..f36be5166c19 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -28,7 +28,19 @@ static inline bool kernel_page_present(struct page *page) { return true; } +#else /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ +/* + * Some architectures, e.g. ARM64 can disable direct map modifications at + * boot time. Let them overrive this query. + */ +#ifndef can_set_direct_map +static inline bool can_set_direct_map(void) +{ + return true; +} +#define can_set_direct_map can_set_direct_map #endif +#endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ #ifndef set_mce_nospec static inline int set_mce_nospec(unsigned long pfn, bool unmap) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 050511e8f1f8..1a1b5d724497 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1050,6 +1050,7 @@ asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr _ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type, const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); +asmlinkage long sys_memfd_secret(unsigned int flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 6de5a7fc066b..28b388368cf6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -873,8 +873,13 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) #define __NR_landlock_restrict_self 446 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) +#ifdef __ARCH_WANT_MEMFD_SECRET +#define __NR_memfd_secret 447 +__SYSCALL(__NR_memfd_secret, sys_memfd_secret) +#endif + #undef __NR_syscalls -#define __NR_syscalls 447 +#define __NR_syscalls 448 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index f3956fc11de6..35687dcb1a42 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -97,5 +97,6 @@ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define Z3FOLD_MAGIC 0x33 #define PPC_CMM_MAGIC 0xc7571590 +#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/init/Kconfig b/init/Kconfig index b075234001a3..173a474012d7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1853,6 +1853,7 @@ config SLUB_DEBUG default y bool "Enable SLUB debugging support" if EXPERT depends on SLUB && SYSFS + select STACKDEPOT if STACKTRACE_SUPPORT help SLUB has extensive debug support features. Disabling these can result in significant savings in code size. This also disables diff --git a/init/main.c b/init/main.c index 6778066e2034..b2e6a8ac2b2d 100644 --- a/init/main.c +++ b/init/main.c @@ -45,6 +45,7 @@ #include <linux/srcu.h> #include <linux/moduleparam.h> #include <linux/kallsyms.h> +#include <linux/buildid.h> #include <linux/writeback.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -925,6 +926,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) set_task_stack_end_magic(&init_task); smp_setup_processor_id(); debug_objects_early_init(); + init_vmlinux_build_id(); cgroup_init_early(); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 825284baaf46..29cc15398ee4 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -4,6 +4,7 @@ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> */ +#include <linux/buildid.h> #include <linux/crash_core.h> #include <linux/utsname.h> #include <linux/vmalloc.h> @@ -378,53 +379,6 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) } EXPORT_SYMBOL(paddr_vmcoreinfo_note); -#define NOTES_SIZE (&__stop_notes - &__start_notes) -#define BUILD_ID_MAX SHA1_DIGEST_SIZE -#define NT_GNU_BUILD_ID 3 - -struct elf_note_section { - struct elf_note n_hdr; - u8 n_data[]; -}; - -/* - * Add build ID from .notes section as generated by the GNU ld(1) - * or LLVM lld(1) --build-id option. - */ -static void add_build_id_vmcoreinfo(void) -{ - char build_id[BUILD_ID_MAX * 2 + 1]; - int n_remain = NOTES_SIZE; - - while (n_remain >= sizeof(struct elf_note)) { - const struct elf_note_section *note_sec = - &__start_notes + NOTES_SIZE - n_remain; - const u32 n_namesz = note_sec->n_hdr.n_namesz; - - if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID && - n_namesz != 0 && - !strcmp((char *)¬e_sec->n_data[0], "GNU")) { - if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) { - const u32 n_descsz = note_sec->n_hdr.n_descsz; - const u8 *s = ¬e_sec->n_data[n_namesz]; - - s = PTR_ALIGN(s, 4); - bin2hex(build_id, s, n_descsz); - build_id[2 * n_descsz] = '\0'; - VMCOREINFO_BUILD_ID(build_id); - return; - } - pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n", - note_sec->n_hdr.n_descsz, - BUILD_ID_MAX); - return; - } - n_remain -= sizeof(struct elf_note) + - ALIGN(note_sec->n_hdr.n_namesz, 4) + - ALIGN(note_sec->n_hdr.n_descsz, 4); - } -} - static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); @@ -443,7 +397,7 @@ static int __init crash_save_vmcoreinfo_init(void) } VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - add_build_id_vmcoreinfo(); + VMCOREINFO_BUILD_ID(); VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(init_uts_ns); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 9d0c23e1993c..1355b0404fcd 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -25,7 +25,10 @@ #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/kprobes.h> +#include <linux/build_bug.h> #include <linux/compiler.h> +#include <linux/module.h> +#include <linux/kernel.h> /* * These will be re-linked against their real values @@ -304,21 +307,13 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, get_symbol_pos(addr, symbolsize, offset); return 1; } - return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || + return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } -/* - * Lookup an address - * - modname is set to NULL if it's in the kernel. - * - We guarantee that the returned name is valid until we reschedule even if. - * It resides in a module. - * - We also guarantee that modname will be valid until rescheduled. - */ -const char *kallsyms_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, char *namebuf) +const char *kallsyms_lookup_buildid(unsigned long addr, unsigned long *symbolsize, + unsigned long *offset, char **modname, + const unsigned char **modbuildid, char *namebuf) { const char *ret; @@ -334,6 +329,8 @@ const char *kallsyms_lookup(unsigned long addr, namebuf, KSYM_NAME_LEN); if (modname) *modname = NULL; + if (modbuildid) + *modbuildid = NULL; ret = namebuf; goto found; @@ -341,7 +338,7 @@ const char *kallsyms_lookup(unsigned long addr, /* See if it's in a module or a BPF JITed image. */ ret = module_address_lookup(addr, symbolsize, offset, - modname, namebuf); + modname, modbuildid, namebuf); if (!ret) ret = bpf_address_lookup(addr, symbolsize, offset, modname, namebuf); @@ -355,6 +352,22 @@ found: return ret; } +/* + * Lookup an address + * - modname is set to NULL if it's in the kernel. + * - We guarantee that the returned name is valid until we reschedule even if. + * It resides in a module. + * - We also guarantee that modname will be valid until rescheduled. + */ +const char *kallsyms_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, char *namebuf) +{ + return kallsyms_lookup_buildid(addr, symbolsize, offset, modname, + NULL, namebuf); +} + int lookup_symbol_name(unsigned long addr, char *symname) { int res; @@ -411,15 +424,17 @@ found: /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, - int symbol_offset, int add_offset) + int symbol_offset, int add_offset, int add_buildid) { char *modname; + const unsigned char *buildid; const char *name; unsigned long offset, size; int len; address += symbol_offset; - name = kallsyms_lookup(address, &size, &offset, &modname, buffer); + name = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, + buffer); if (!name) return sprintf(buffer, "0x%lx", address - symbol_offset); @@ -431,8 +446,19 @@ static int __sprint_symbol(char *buffer, unsigned long address, if (add_offset) len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); - if (modname) - len += sprintf(buffer + len, " [%s]", modname); + if (modname) { + len += sprintf(buffer + len, " [%s", modname); +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) + if (add_buildid && buildid) { + /* build ID should match length of sprintf */ +#if IS_ENABLED(CONFIG_MODULES) + static_assert(sizeof(typeof_member(struct module, build_id)) == 20); +#endif + len += sprintf(buffer + len, " %20phN", buildid); + } +#endif + len += sprintf(buffer + len, "]"); + } return len; } @@ -450,11 +476,28 @@ static int __sprint_symbol(char *buffer, unsigned long address, */ int sprint_symbol(char *buffer, unsigned long address) { - return __sprint_symbol(buffer, address, 0, 1); + return __sprint_symbol(buffer, address, 0, 1, 0); } EXPORT_SYMBOL_GPL(sprint_symbol); /** + * sprint_symbol_build_id - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name, + * offset, size, module name and module build ID to @buffer if possible. If no + * symbol was found, just saves its @address as is. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_symbol_build_id(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0, 1, 1); +} +EXPORT_SYMBOL_GPL(sprint_symbol_build_id); + +/** * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup @@ -467,7 +510,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol); */ int sprint_symbol_no_offset(char *buffer, unsigned long address) { - return __sprint_symbol(buffer, address, 0, 0); + return __sprint_symbol(buffer, address, 0, 0, 0); } EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); @@ -487,7 +530,27 @@ EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); */ int sprint_backtrace(char *buffer, unsigned long address) { - return __sprint_symbol(buffer, address, -1, 1); + return __sprint_symbol(buffer, address, -1, 1, 0); +} + +/** + * sprint_backtrace_build_id - Look up a backtrace symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function is for stack backtrace and does the same thing as + * sprint_symbol() but with modified/decreased @address. If there is a + * tail-call to the function marked "noreturn", gcc optimized out code after + * the call so that the stack-saved return address could point outside of the + * caller. This function ensures that kallsyms will find the original caller + * by decreasing @address. This function also appends the module build ID to + * the @buffer if @address is within a kernel module. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_backtrace_build_id(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, -1, 1, 1); } /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ diff --git a/kernel/module.c b/kernel/module.c index 1d0e59f95a9a..decf4601e943 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -13,6 +13,7 @@ #include <linux/trace_events.h> #include <linux/init.h> #include <linux/kallsyms.h> +#include <linux/buildid.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/sysfs.h> @@ -2793,6 +2794,26 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) } #endif /* CONFIG_KALLSYMS */ +#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) +static void init_build_id(struct module *mod, const struct load_info *info) +{ + const Elf_Shdr *sechdr; + unsigned int i; + + for (i = 0; i < info->hdr->e_shnum; i++) { + sechdr = &info->sechdrs[i]; + if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE && + !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id, + sechdr->sh_size)) + break; + } +} +#else +static void init_build_id(struct module *mod, const struct load_info *info) +{ +} +#endif + static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num) { if (!debug) @@ -4017,6 +4038,7 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_arch_cleanup; } + init_build_id(mod, info); dynamic_debug_setup(mod, info->debug, info->num_debug); /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ @@ -4250,6 +4272,7 @@ const char *module_address_lookup(unsigned long addr, unsigned long *size, unsigned long *offset, char **modname, + const unsigned char **modbuildid, char *namebuf) { const char *ret = NULL; @@ -4260,6 +4283,13 @@ const char *module_address_lookup(unsigned long addr, if (mod) { if (modname) *modname = mod->name; + if (modbuildid) { +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) + *modbuildid = mod->build_id; +#else + *modbuildid = NULL; +#endif + } ret = find_kallsyms_symbol(mod, addr, size, offset); } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index da0b41914177..559acef3fddb 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -31,6 +31,7 @@ #include <linux/genhd.h> #include <linux/ktime.h> #include <linux/security.h> +#include <linux/secretmem.h> #include <trace/events/power.h> #include "power.h" @@ -81,7 +82,9 @@ void hibernate_release(void) bool hibernation_available(void) { - return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); + return nohibernate == 0 && + !security_locked_down(LOCKDOWN_HIBERNATION) && + !secretmem_active(); } /** diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0ea8128468c3..4d7e377a74f3 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -358,6 +358,8 @@ COND_SYSCALL(pkey_mprotect); COND_SYSCALL(pkey_alloc); COND_SYSCALL(pkey_free); +/* memfd_secret */ +COND_SYSCALL(memfd_secret); /* * Architecture specific weak syscall entries. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4dcbf5ffa69d..24820f1d1f2a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -35,6 +35,17 @@ config PRINTK_CALLER no option to enable/disable at the kernel command line parameter or sysfs interface. +config STACKTRACE_BUILD_ID + bool "Show build ID information in stacktraces" + depends on PRINTK + help + Selecting this option adds build ID information for symbols in + stacktraces printed with the printk format '%p[SR]b'. + + This option is intended for distros where debuginfo is not easily + accessible but can be downloaded given the build ID of the vmlinux or + kernel module where the function is located. + config CONSOLE_LOGLEVEL_DEFAULT int "Default console loglevel (1-15)" range 1 15 diff --git a/lib/buildid.c b/lib/buildid.c index 6156997c3895..dfc62625cae4 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -1,36 +1,31 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/buildid.h> +#include <linux/cache.h> #include <linux/elf.h> +#include <linux/kernel.h> #include <linux/pagemap.h> #define BUILD_ID 3 + /* * Parse build id from the note segment. This logic can be shared between * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are * identical. */ -static inline int parse_build_id(void *page_addr, - unsigned char *build_id, - __u32 *size, - void *note_start, - Elf32_Word note_size) +static int parse_build_id_buf(unsigned char *build_id, + __u32 *size, + const void *note_start, + Elf32_Word note_size) { Elf32_Word note_offs = 0, new_offs; - /* check for overflow */ - if (note_start < page_addr || note_start + note_size < note_start) - return -EINVAL; - - /* only supports note that fits in the first page */ - if (note_start + note_size > page_addr + PAGE_SIZE) - return -EINVAL; - while (note_offs + sizeof(Elf32_Nhdr) < note_size) { Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); if (nhdr->n_type == BUILD_ID && nhdr->n_namesz == sizeof("GNU") && + !strcmp((char *)(nhdr + 1), "GNU") && nhdr->n_descsz > 0 && nhdr->n_descsz <= BUILD_ID_SIZE_MAX) { memcpy(build_id, @@ -49,11 +44,29 @@ static inline int parse_build_id(void *page_addr, break; note_offs = new_offs; } + return -EINVAL; } +static inline int parse_build_id(const void *page_addr, + unsigned char *build_id, + __u32 *size, + const void *note_start, + Elf32_Word note_size) +{ + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + return parse_build_id_buf(build_id, size, note_start, note_size); +} + /* Parse build ID from 32-bit ELF */ -static int get_build_id_32(void *page_addr, unsigned char *build_id, +static int get_build_id_32(const void *page_addr, unsigned char *build_id, __u32 *size) { Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; @@ -78,7 +91,7 @@ static int get_build_id_32(void *page_addr, unsigned char *build_id, } /* Parse build ID from 64-bit ELF */ -static int get_build_id_64(void *page_addr, unsigned char *build_id, +static int get_build_id_64(const void *page_addr, unsigned char *build_id, __u32 *size) { Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; @@ -108,7 +121,7 @@ static int get_build_id_64(void *page_addr, unsigned char *build_id, * @build_id: buffer to store build id, at least BUILD_ID_SIZE long * @size: returns actual build id size in case of success * - * Returns 0 on success, otherwise error (< 0). + * Return: 0 on success, -EINVAL otherwise */ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) @@ -147,3 +160,32 @@ out: put_page(page); return ret; } + +/** + * build_id_parse_buf - Get build ID from a buffer + * @buf: Elf note section(s) to parse + * @buf_size: Size of @buf in bytes + * @build_id: Build ID parsed from @buf, at least BUILD_ID_SIZE_MAX long + * + * Return: 0 on success, -EINVAL otherwise + */ +int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size) +{ + return parse_build_id_buf(build_id, NULL, buf, buf_size); +} + +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE) +unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX] __ro_after_init; + +/** + * init_vmlinux_build_id - Compute and stash the running kernel's build ID + */ +void __init init_vmlinux_build_id(void) +{ + extern const void __start_notes __weak; + extern const void __stop_notes __weak; + unsigned int size = &__stop_notes - &__start_notes; + + build_id_parse_buf(&__start_notes, vmlinux_build_id, size); +} +#endif diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 586e3f2c6a15..6e7ca3d67710 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -5,6 +5,7 @@ */ #include <linux/kernel.h> +#include <linux/buildid.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> @@ -36,6 +37,14 @@ void __init dump_stack_set_arch_desc(const char *fmt, ...) va_end(args); } +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) +#define BUILD_ID_FMT " %20phN" +#define BUILD_ID_VAL vmlinux_build_id +#else +#define BUILD_ID_FMT "%s" +#define BUILD_ID_VAL "" +#endif + /** * dump_stack_print_info - print generic debug info for dump_stack() * @log_lvl: log level @@ -45,13 +54,13 @@ void __init dump_stack_set_arch_desc(const char *fmt, ...) */ void dump_stack_print_info(const char *log_lvl) { - printk("%sCPU: %d PID: %d Comm: %.20s %s%s %s %.*s\n", + printk("%sCPU: %d PID: %d Comm: %.20s %s%s %s %.*s" BUILD_ID_FMT "\n", log_lvl, raw_smp_processor_id(), current->pid, current->comm, kexec_crash_loaded() ? "Kdump: loaded " : "", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, BUILD_ID_VAL); if (dump_stack_arch_desc_str[0] != '\0') printk("%sHardware name: %s\n", diff --git a/lib/vsprintf.c b/lib/vsprintf.c index ea65ec51e63b..b8c92fee4d32 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -992,8 +992,12 @@ char *symbol_string(char *buf, char *end, void *ptr, value = (unsigned long)ptr; #ifdef CONFIG_KALLSYMS - if (*fmt == 'B') + if (*fmt == 'B' && fmt[1] == 'b') + sprint_backtrace_build_id(sym, value); + else if (*fmt == 'B') sprint_backtrace(sym, value); + else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b'))) + sprint_symbol_build_id(sym, value); else if (*fmt != 's') sprint_symbol(sym, value); else @@ -2262,9 +2266,11 @@ early_param("no_hash_pointers", no_hash_pointers_enable); * - 'S' For symbolic direct pointers (or function descriptors) with offset * - 's' For symbolic direct pointers (or function descriptors) without offset * - '[Ss]R' as above with __builtin_extract_return_addr() translation + * - 'S[R]b' as above with module build ID (for use in backtraces) * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of * %ps and %pS. Be careful when re-using these specifiers. * - 'B' For backtraced symbolic direct pointers with offset + * - 'Bb' as above with module build ID (for use in backtraces) * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] * - 'b[l]' For a bitmap, the number of bits is determined by the field diff --git a/mm/Kconfig b/mm/Kconfig index 75f590bd5894..a070bf0bab96 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -904,4 +904,8 @@ config KMAP_LOCAL # struct io_mapping based helper. Selected by drivers that need them config IO_MAPPING bool + +config SECRETMEM + def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + endmenu diff --git a/mm/Makefile b/mm/Makefile index 74b47c354682..e3436741d539 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -113,6 +113,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o @@ -10,6 +10,7 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/secretmem.h> #include <linux/sched/signal.h> #include <linux/rwsem.h> @@ -827,6 +828,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, struct follow_page_context ctx = { NULL }; struct page *page; + if (vma_is_secretmem(vma)) + return NULL; + page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); @@ -960,6 +964,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; + if (vma_is_secretmem(vma)) + return -EFAULT; + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) @@ -2142,6 +2149,11 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, if (!head) goto pte_unmap; + if (unlikely(page_is_secretmem(page))) { + put_compound_head(head, 1, flags); + goto pte_unmap; + } + if (unlikely(pte_val(pte) != pte_val(*ptep))) { put_compound_head(head, 1, flags); goto pte_unmap; diff --git a/mm/internal.h b/mm/internal.h index d0fafb7114dd..c69b4b60dacf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -360,6 +360,9 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) extern void mlock_vma_page(struct page *page); extern unsigned int munlock_vma_page(struct page *page); +extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len); + /* * Clear the page's PageMlocked(). This can be useful in a situation where * we want to unconditionally remove a page from the pagecache -- e.g., diff --git a/mm/mlock.c b/mm/mlock.c index df590fda5688..5e9f4dea4e96 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -23,6 +23,7 @@ #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> +#include <linux/secretmem.h> #include "internal.h" @@ -503,7 +504,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || - vma_is_dax(vma)) + vma_is_dax(vma) || vma_is_secretmem(vma)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; diff --git a/mm/mmap.c b/mm/mmap.c index d8c92ae50565..84999873cfa4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1352,9 +1352,8 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -static inline int mlock_future_check(struct mm_struct *mm, - unsigned long flags, - unsigned long len) +int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len) { unsigned long locked, lock_limit; diff --git a/mm/secretmem.c b/mm/secretmem.c new file mode 100644 index 000000000000..f77d25467a14 --- /dev/null +++ b/mm/secretmem.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2021 + * + * Author: Mike Rapoport <rppt@linux.ibm.com> + */ + +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/swap.h> +#include <linux/mount.h> +#include <linux/memfd.h> +#include <linux/bitops.h> +#include <linux/printk.h> +#include <linux/pagemap.h> +#include <linux/syscalls.h> +#include <linux/pseudo_fs.h> +#include <linux/secretmem.h> +#include <linux/set_memory.h> +#include <linux/sched/signal.h> + +#include <uapi/linux/magic.h> + +#include <asm/tlbflush.h> + +#include "internal.h" + +#undef pr_fmt +#define pr_fmt(fmt) "secretmem: " fmt + +/* + * Define mode and flag masks to allow validation of the system call + * parameters. + */ +#define SECRETMEM_MODE_MASK (0x0) +#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK + +static bool secretmem_enable __ro_after_init; +module_param_named(enable, secretmem_enable, bool, 0400); +MODULE_PARM_DESC(secretmem_enable, + "Enable secretmem and memfd_secret(2) system call"); + +static atomic_t secretmem_users; + +bool secretmem_active(void) +{ + return !!atomic_read(&secretmem_users); +} + +static vm_fault_t secretmem_fault(struct vm_fault *vmf) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct inode *inode = file_inode(vmf->vma->vm_file); + pgoff_t offset = vmf->pgoff; + gfp_t gfp = vmf->gfp_mask; + unsigned long addr; + struct page *page; + int err; + + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) + return vmf_error(-EINVAL); + +retry: + page = find_lock_page(mapping, offset); + if (!page) { + page = alloc_page(gfp | __GFP_ZERO); + if (!page) + return VM_FAULT_OOM; + + err = set_direct_map_invalid_noflush(page); + if (err) { + put_page(page); + return vmf_error(err); + } + + __SetPageUptodate(page); + err = add_to_page_cache_lru(page, mapping, offset, gfp); + if (unlikely(err)) { + put_page(page); + /* + * If a split of large page was required, it + * already happened when we marked the page invalid + * which guarantees that this call won't fail + */ + set_direct_map_default_noflush(page); + if (err == -EEXIST) + goto retry; + + return vmf_error(err); + } + + addr = (unsigned long)page_address(page); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } + + vmf->page = page; + return VM_FAULT_LOCKED; +} + +static const struct vm_operations_struct secretmem_vm_ops = { + .fault = secretmem_fault, +}; + +static int secretmem_release(struct inode *inode, struct file *file) +{ + atomic_dec(&secretmem_users); + return 0; +} + +static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long len = vma->vm_end - vma->vm_start; + + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) + return -EINVAL; + + if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + return -EAGAIN; + + vma->vm_flags |= VM_LOCKED | VM_DONTDUMP; + vma->vm_ops = &secretmem_vm_ops; + + return 0; +} + +bool vma_is_secretmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &secretmem_vm_ops; +} + +static const struct file_operations secretmem_fops = { + .release = secretmem_release, + .mmap = secretmem_mmap, +}; + +static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode) +{ + return false; +} + +static int secretmem_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + return -EBUSY; +} + +static void secretmem_freepage(struct page *page) +{ + set_direct_map_default_noflush(page); + clear_highpage(page); +} + +const struct address_space_operations secretmem_aops = { + .freepage = secretmem_freepage, + .migratepage = secretmem_migratepage, + .isolate_page = secretmem_isolate_page, +}; + +static struct vfsmount *secretmem_mnt; + +static struct file *secretmem_file_create(unsigned long flags) +{ + struct file *file = ERR_PTR(-ENOMEM); + struct inode *inode; + + inode = alloc_anon_inode(secretmem_mnt->mnt_sb); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", + O_RDWR, &secretmem_fops); + if (IS_ERR(file)) + goto err_free_inode; + + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); + + inode->i_mapping->a_ops = &secretmem_aops; + + /* pretend we are a normal file with zero size */ + inode->i_mode |= S_IFREG; + inode->i_size = 0; + + return file; + +err_free_inode: + iput(inode); + return file; +} + +SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) +{ + struct file *file; + int fd, err; + + /* make sure local flags do not confict with global fcntl.h */ + BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); + + if (!secretmem_enable) + return -ENOSYS; + + if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) + return -EINVAL; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + file = secretmem_file_create(flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_put_fd; + } + + file->f_flags |= O_LARGEFILE; + + fd_install(fd, file); + atomic_inc(&secretmem_users); + return fd; + +err_put_fd: + put_unused_fd(fd); + return err; +} + +static int secretmem_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type secretmem_fs = { + .name = "secretmem", + .init_fs_context = secretmem_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int secretmem_init(void) +{ + int ret = 0; + + if (!secretmem_enable) + return ret; + + secretmem_mnt = kern_mount(&secretmem_fs); + if (IS_ERR(secretmem_mnt)) + ret = PTR_ERR(secretmem_mnt); + + /* prevent secretmem mappings from ever getting PROT_EXEC */ + secretmem_mnt->mnt_flags |= MNT_NOEXEC; + + return ret; +} +fs_initcall(secretmem_init); diff --git a/mm/slub.c b/mm/slub.c index e2ef8ae0d062..ee51857d8e9b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -25,6 +25,7 @@ #include <linux/cpuset.h> #include <linux/mempolicy.h> #include <linux/ctype.h> +#include <linux/stackdepot.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/kfence.h> @@ -204,8 +205,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ -#ifdef CONFIG_STACKTRACE - unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#ifdef CONFIG_STACKDEPOT + depot_stack_handle_t handle; #endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ @@ -603,22 +604,27 @@ static struct track *get_track(struct kmem_cache *s, void *object, return kasan_reset_tag(p + alloc); } +#ifdef CONFIG_STACKDEPOT +static depot_stack_handle_t save_stack_depot_trace(gfp_t flags) +{ + unsigned long entries[TRACK_ADDRS_COUNT]; + depot_stack_handle_t handle; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 4); + handle = stack_depot_save(entries, nr_entries, flags); + return handle; +} +#endif + static void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { struct track *p = get_track(s, object, alloc); if (addr) { -#ifdef CONFIG_STACKTRACE - unsigned int nr_entries; - - metadata_access_enable(); - nr_entries = stack_trace_save(kasan_reset_tag(p->addrs), - TRACK_ADDRS_COUNT, 3); - metadata_access_disable(); - - if (nr_entries < TRACK_ADDRS_COUNT) - p->addrs[nr_entries] = 0; +#ifdef CONFIG_STACKDEPOT + p->handle = save_stack_depot_trace(GFP_NOWAIT); #endif p->addr = addr; p->cpu = smp_processor_id(); @@ -645,14 +651,19 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); -#ifdef CONFIG_STACKTRACE +#ifdef CONFIG_STACKDEPOT { - int i; - for (i = 0; i < TRACK_ADDRS_COUNT; i++) - if (t->addrs[i]) - pr_err("\t%pS\n", (void *)t->addrs[i]); - else - break; + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; + + handle = READ_ONCE(t->handle); + if (!handle) { + pr_err("object allocation/free stack trace missing\n"); + } else { + nr_entries = stack_depot_fetch(handle, &entries); + stack_trace_print(entries, nr_entries, 0); + } } #endif } @@ -4028,18 +4039,26 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) objp = fixup_red_left(s, objp); trackp = get_track(s, objp, TRACK_ALLOC); kpp->kp_ret = (void *)trackp->addr; -#ifdef CONFIG_STACKTRACE - for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { - kpp->kp_stack[i] = (void *)trackp->addrs[i]; - if (!kpp->kp_stack[i]) - break; - } +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; - trackp = get_track(s, objp, TRACK_FREE); - for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { - kpp->kp_free_stack[i] = (void *)trackp->addrs[i]; - if (!kpp->kp_free_stack[i]) - break; + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_stack[i] = (void *)entries[i]; + } + + trackp = get_track(s, objp, TRACK_FREE); + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_free_stack[i] = (void *)entries[i]; + } } #endif #endif diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh index a18b47695f55..b7609958ee36 100755 --- a/scripts/checksyscalls.sh +++ b/scripts/checksyscalls.sh @@ -40,6 +40,10 @@ cat << EOF #define __IGNORE_setrlimit /* setrlimit */ #endif +#ifndef __ARCH_WANT_MEMFD_SECRET +#define __IGNORE_memfd_secret +#endif + /* Missing flags argument */ #define __IGNORE_renameat /* renameat2 */ diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 90398347e366..5fbad61fe490 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -3,11 +3,10 @@ # (c) 2014, Sasha Levin <sasha.levin@oracle.com> #set -x -if [[ $# < 1 ]]; then +usage() { echo "Usage:" - echo " $0 -r <release> | <vmlinux> [base path] [modules path]" - exit 1 -fi + echo " $0 -r <release> | <vmlinux> [<base path>|auto] [<modules path>]" +} if [[ $1 == "-r" ]] ; then vmlinux="" @@ -24,6 +23,7 @@ if [[ $1 == "-r" ]] ; then if [[ $vmlinux == "" ]] ; then echo "ERROR! vmlinux image for release $release is not found" >&2 + usage exit 2 fi else @@ -31,12 +31,35 @@ else basepath=${2-auto} modpath=$3 release="" + debuginfod= + + # Can we use debuginfod-find? + if type debuginfod-find >/dev/null 2>&1 ; then + debuginfod=${1-only} + fi + + if [[ $vmlinux == "" && -z $debuginfod ]] ; then + echo "ERROR! vmlinux image must be specified" >&2 + usage + exit 1 + fi fi declare -A cache declare -A modcache find_module() { + if [[ -n $debuginfod ]] ; then + if [[ -n $modbuildid ]] ; then + debuginfod-find debuginfo $modbuildid && return + fi + + # Only using debuginfod so don't try to find vmlinux module path + if [[ $debuginfod == "only" ]] ; then + return + fi + fi + if [[ "$modpath" != "" ]] ; then for fn in $(find "$modpath" -name "${module//_/[-_]}.ko*") ; do if readelf -WS "$fn" | grep -qwF .debug_line ; then @@ -51,7 +74,7 @@ find_module() { find_module && return if [[ $release == "" ]] ; then - release=$(gdb -ex 'print init_uts_ns.name.release' -ex 'quit' -quiet -batch "$vmlinux" | sed -n 's/\$1 = "\(.*\)".*/\1/p') + release=$(gdb -ex 'print init_uts_ns.name.release' -ex 'quit' -quiet -batch "$vmlinux" 2>/dev/null | sed -n 's/\$1 = "\(.*\)".*/\1/p') fi for dn in {/usr/lib/debug,}/lib/modules/$release ; do @@ -105,7 +128,7 @@ parse_symbol() { if [[ "${cache[$module,$name]+isset}" == "isset" ]]; then local base_addr=${cache[$module,$name]} else - local base_addr=$(nm "$objfile" | awk '$3 == "'$name'" && ($2 == "t" || $2 == "T") {print $1; exit}') + local base_addr=$(nm "$objfile" 2>/dev/null | awk '$3 == "'$name'" && ($2 == "t" || $2 == "T") {print $1; exit}') if [[ $base_addr == "" ]] ; then # address not found return @@ -129,7 +152,7 @@ parse_symbol() { if [[ "${cache[$module,$address]+isset}" == "isset" ]]; then local code=${cache[$module,$address]} else - local code=$(${CROSS_COMPILE}addr2line -i -e "$objfile" "$address") + local code=$(${CROSS_COMPILE}addr2line -i -e "$objfile" "$address" 2>/dev/null) cache[$module,$address]=$code fi @@ -150,6 +173,27 @@ parse_symbol() { symbol="$segment$name ($code)" } +debuginfod_get_vmlinux() { + local vmlinux_buildid=${1##* } + + if [[ $vmlinux != "" ]]; then + return + fi + + if [[ $vmlinux_buildid =~ ^[0-9a-f]+ ]]; then + vmlinux=$(debuginfod-find debuginfo $vmlinux_buildid) + if [[ $? -ne 0 ]] ; then + echo "ERROR! vmlinux image not found via debuginfod-find" >&2 + usage + exit 2 + fi + return + fi + echo "ERROR! Build ID for vmlinux not found. Try passing -r or specifying vmlinux" >&2 + usage + exit 2 +} + decode_code() { local scripts=`dirname "${BASH_SOURCE[0]}"` @@ -157,6 +201,14 @@ decode_code() { } handle_line() { + if [[ $basepath == "auto" && $vmlinux != "" ]] ; then + module="" + symbol="kernel_init+0x0/0x0" + parse_symbol + basepath=${symbol#kernel_init (} + basepath=${basepath%/init/main.c:*)} + fi + local words # Tokenize @@ -182,16 +234,28 @@ handle_line() { fi done + if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then + words[$last-1]="${words[$last-1]} ${words[$last]}" + unset words[$last] + last=$(( $last - 1 )) + fi + if [[ ${words[$last]} =~ \[([^]]+)\] ]]; then module=${words[$last]} module=${module#\[} module=${module%\]} + modbuildid=${module#* } + module=${module% *} + if [[ $modbuildid == $module ]]; then + modbuildid= + fi symbol=${words[$last-1]} unset words[$last-1] else # The symbol is the last element, process it symbol=${words[$last]} module= + modbuildid= fi unset words[$last] @@ -201,14 +265,6 @@ handle_line() { echo "${words[@]}" "$symbol $module" } -if [[ $basepath == "auto" ]] ; then - module="" - symbol="kernel_init+0x0/0x0" - parse_symbol - basepath=${symbol#kernel_init (} - basepath=${basepath%/init/main.c:*)} -fi - while read line; do # Let's see if we have an address in the line if [[ $line =~ \[\<([^]]+)\>\] ]] || @@ -218,6 +274,9 @@ while read line; do # Is it a code line? elif [[ $line == *Code:* ]]; then decode_code "$line" + # Is it a version line? + elif [[ -n $debuginfod && $line =~ PID:\ [0-9]+\ Comm: ]]; then + debuginfod_get_vmlinux "$line" else # Nothing special in this line, show it as is echo "$line" diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index d683a49d07d5..f0fd80ef17df 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -24,5 +24,6 @@ va_128TBswitch map_fixed_noreplace write_to_hugetlbfs hmm-tests +memfd_secret local_config.* split_huge_page_test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 8be00319beca..4d61d574c9d6 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -35,6 +35,7 @@ TEST_GEN_FILES += madv_populate TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate +TEST_GEN_FILES += memfd_secret TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += mremap_dontunmap @@ -135,7 +136,7 @@ warn_32bit_failure: endif endif -$(OUTPUT)/mlock-random-test: LDLIBS += -lcap +$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap $(OUTPUT)/gup_test: ../../../../mm/gup_test.h diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c new file mode 100644 index 000000000000..93e7e7ffed33 --- /dev/null +++ b/tools/testing/selftests/vm/memfd_secret.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2021 + * + * Author: Mike Rapoport <rppt@linux.ibm.com> + */ + +#define _GNU_SOURCE +#include <sys/uio.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <sys/types.h> +#include <sys/ptrace.h> +#include <sys/syscall.h> +#include <sys/resource.h> +#include <sys/capability.h> + +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <stdio.h> + +#include "../kselftest.h" + +#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) +#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) +#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__) + +#ifdef __NR_memfd_secret + +#define PATTERN 0x55 + +static const int prot = PROT_READ | PROT_WRITE; +static const int mode = MAP_SHARED; + +static unsigned long page_size; +static unsigned long mlock_limit_cur; +static unsigned long mlock_limit_max; + +static int memfd_secret(unsigned int flags) +{ + return syscall(__NR_memfd_secret, flags); +} + +static void test_file_apis(int fd) +{ + char buf[64]; + + if ((read(fd, buf, sizeof(buf)) >= 0) || + (write(fd, buf, sizeof(buf)) >= 0) || + (pread(fd, buf, sizeof(buf), 0) >= 0) || + (pwrite(fd, buf, sizeof(buf), 0) >= 0)) + fail("unexpected file IO\n"); + else + pass("file IO is blocked as expected\n"); +} + +static void test_mlock_limit(int fd) +{ + size_t len; + char *mem; + + len = mlock_limit_cur; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("unable to mmap secret memory\n"); + return; + } + munmap(mem, len); + + len = mlock_limit_max * 2; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem != MAP_FAILED) { + fail("unexpected mlock limit violation\n"); + munmap(mem, len); + return; + } + + pass("mlock limit is respected\n"); +} + +static void try_process_vm_read(int fd, int pipefd[2]) +{ + struct iovec liov, riov; + char buf[64]; + char *mem; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + exit(KSFT_FAIL); + } + + liov.iov_len = riov.iov_len = sizeof(buf); + liov.iov_base = buf; + riov.iov_base = mem; + + if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) { + if (errno == ENOSYS) + exit(KSFT_SKIP); + exit(KSFT_PASS); + } + + exit(KSFT_FAIL); +} + +static void try_ptrace(int fd, int pipefd[2]) +{ + pid_t ppid = getppid(); + int status; + char *mem; + long ret; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + perror("pipe write"); + exit(KSFT_FAIL); + } + + ret = ptrace(PTRACE_ATTACH, ppid, 0, 0); + if (ret) { + perror("ptrace_attach"); + exit(KSFT_FAIL); + } + + ret = waitpid(ppid, &status, WUNTRACED); + if ((ret != ppid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitppid result %ld stat %x\n", + ret, status); + exit(KSFT_FAIL); + } + + if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0)) + exit(KSFT_PASS); + + exit(KSFT_FAIL); +} + +static void check_child_status(pid_t pid, const char *name) +{ + int status; + + waitpid(pid, &status, 0); + + if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) { + skip("%s is not supported\n", name); + return; + } + + if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) || + WIFSIGNALED(status)) { + pass("%s is blocked as expected\n", name); + return; + } + + fail("%s: unexpected memory access\n", name); +} + +static void test_remote_access(int fd, const char *name, + void (*func)(int fd, int pipefd[2])) +{ + int pipefd[2]; + pid_t pid; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + pid = fork(); + if (pid < 0) { + fail("fork failed: %s\n", strerror(errno)); + return; + } + + if (pid == 0) { + func(fd, pipefd); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + return; + } + + ftruncate(fd, page_size); + memset(mem, PATTERN, page_size); + + if (write(pipefd[1], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + return; + } + + check_child_status(pid, name); +} + +static void test_process_vm_read(int fd) +{ + test_remote_access(fd, "process_vm_read", try_process_vm_read); +} + +static void test_ptrace(int fd) +{ + test_remote_access(fd, "ptrace", try_ptrace); +} + +static int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + perror("setrlimit() returns error"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + perror("cap_set_proc() returns error"); + return -2; + } + + return 0; +} + +static void prepare(void) +{ + struct rlimit rlim; + + page_size = sysconf(_SC_PAGE_SIZE); + if (!page_size) + ksft_exit_fail_msg("Failed to get page size %s\n", + strerror(errno)); + + if (getrlimit(RLIMIT_MEMLOCK, &rlim)) + ksft_exit_fail_msg("Unable to detect mlock limit: %s\n", + strerror(errno)); + + mlock_limit_cur = rlim.rlim_cur; + mlock_limit_max = rlim.rlim_max; + + printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n", + page_size, mlock_limit_cur, mlock_limit_max); + + if (page_size > mlock_limit_cur) + mlock_limit_cur = page_size; + if (page_size > mlock_limit_max) + mlock_limit_max = page_size; + + if (set_cap_limits(mlock_limit_max)) + ksft_exit_fail_msg("Unable to set mlock limit: %s\n", + strerror(errno)); +} + +#define NUM_TESTS 4 + +int main(int argc, char *argv[]) +{ + int fd; + + prepare(); + + ksft_print_header(); + ksft_set_plan(NUM_TESTS); + + fd = memfd_secret(0); + if (fd < 0) { + if (errno == ENOSYS) + ksft_exit_skip("memfd_secret is not supported\n"); + else + ksft_exit_fail_msg("memfd_secret failed: %s\n", + strerror(errno)); + } + + test_mlock_limit(fd); + test_file_apis(fd); + test_process_vm_read(fd); + test_ptrace(fd); + + close(fd); + + ksft_exit(!ksft_get_fail_cnt()); +} + +#else /* __NR_memfd_secret */ + +int main(int argc, char *argv[]) +{ + printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 955782d138ab..d09a6b71f1e9 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -362,4 +362,21 @@ else exitcode=1 fi +echo "running memfd_secret test" +echo "------------------------------------" +./memfd_secret +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +exit $exitcode + exit $exitcode |