From c277fe7c9fad978f4050cac98598f57bebd0504a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Jul 2017 09:23:23 +1000 Subject: net/netfilter/x_tables.c: use kvmalloc() in xt_alloc_table_info() xt_alloc_table_info() basically opencodes kvmalloc() so use the library function instead. Link: http://lkml.kernel.org/r/20170531155145.17111-4-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Signed-off-by: Andrew Morton --- net/netfilter/x_tables.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 1770c1d9b37f..e1648238a9c9 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1003,14 +1003,10 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; - if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) - info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); - if (!info) { - info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, - PAGE_KERNEL); - if (!info) - return NULL; - } + info = kvmalloc(sz, GFP_KERNEL); + if (!info) + return NULL; + memset(info, 0, sizeof(*info)); info->size = size; return info; -- cgit v1.2.3 From 5ac7d7a2e0703f21f8c9eb7c9575e64bf2858c26 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 11 Jul 2017 09:23:23 +1000 Subject: kernel/watchdog: remove unused declaration Patch series "Improve watchdog config for arch watchdogs", v4. A series to make the hardlockup watchdog more easily replaceable by arch code. The last patch provides some justification for why we want to do this (existing sparc watchdog is another that could benefit). This patch (of 5): Remove unused declaration. Link: http://lkml.kernel.org/r/20170616065715.18390-2-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/nmi.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index aa3cd0878270..5e2e57536d98 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -12,9 +12,6 @@ extern void touch_softlockup_watchdog_sched(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); -extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern unsigned int hardlockup_panic; void lockup_detector_init(void); -- cgit v1.2.3 From 13a0e38a2c2a2e9b36c577674f907e91013a3b33 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 11 Jul 2017 09:23:23 +1000 Subject: kernel/watchdog: introduce arch_touch_nmi_watchdog() For architectures that define HAVE_NMI_WATCHDOG, instead of having them provide the complete touch_nmi_watchdog() function, just have them provide arch_touch_nmi_watchdog(). This gives the generic code more flexibility in implementing this function, and arch implementations don't miss out on touching the softlockup watchdog or other generic details. Link: http://lkml.kernel.org/r/20170616065715.18390-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton --- arch/blackfin/include/asm/nmi.h | 2 ++ arch/blackfin/kernel/nmi.c | 2 +- arch/mn10300/include/asm/nmi.h | 2 ++ arch/mn10300/kernel/mn10300-watchdog-low.S | 8 ++++---- arch/mn10300/kernel/mn10300-watchdog.c | 2 +- arch/sparc/include/asm/nmi.h | 1 + arch/sparc/kernel/nmi.c | 6 ++---- include/linux/nmi.h | 27 ++++++++++++++++----------- kernel/watchdog_hld.c | 5 ++--- 9 files changed, 31 insertions(+), 24 deletions(-) diff --git a/arch/blackfin/include/asm/nmi.h b/arch/blackfin/include/asm/nmi.h index b9caac4fcfd8..107d23705f46 100644 --- a/arch/blackfin/include/asm/nmi.h +++ b/arch/blackfin/include/asm/nmi.h @@ -9,4 +9,6 @@ #include +extern void arch_touch_nmi_watchdog(void); + #endif diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c index 633c37083e87..1e714329fe8a 100644 --- a/arch/blackfin/kernel/nmi.c +++ b/arch/blackfin/kernel/nmi.c @@ -190,7 +190,7 @@ static int __init init_nmi_wdt(void) } device_initcall(init_nmi_wdt); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { atomic_set(&nmi_touched[smp_processor_id()], 1); } diff --git a/arch/mn10300/include/asm/nmi.h b/arch/mn10300/include/asm/nmi.h index f3671cbbc117..b05627597b1b 100644 --- a/arch/mn10300/include/asm/nmi.h +++ b/arch/mn10300/include/asm/nmi.h @@ -11,4 +11,6 @@ #ifndef _ASM_NMI_H #define _ASM_NMI_H +extern void arch_touch_nmi_watchdog(void); + #endif /* _ASM_NMI_H */ diff --git a/arch/mn10300/kernel/mn10300-watchdog-low.S b/arch/mn10300/kernel/mn10300-watchdog-low.S index f2f5c9cfaabd..34f8773de7d0 100644 --- a/arch/mn10300/kernel/mn10300-watchdog-low.S +++ b/arch/mn10300/kernel/mn10300-watchdog-low.S @@ -50,9 +50,9 @@ watchdog_handler: # we can't inline it) # ############################################################################### - .globl touch_nmi_watchdog - .type touch_nmi_watchdog,@function -touch_nmi_watchdog: + .globl arch_touch_nmi_watchdog + .type arch_touch_nmi_watchdog,@function +arch_touch_nmi_watchdog: clr d0 clr d1 mov watchdog_alert_counter, a0 @@ -63,4 +63,4 @@ touch_nmi_watchdog: lne ret [],0 - .size touch_nmi_watchdog,.-touch_nmi_watchdog + .size arch_touch_nmi_watchdog,.-arch_touch_nmi_watchdog diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c index a2d8e6938d67..0d5641beadf5 100644 --- a/arch/mn10300/kernel/mn10300-watchdog.c +++ b/arch/mn10300/kernel/mn10300-watchdog.c @@ -31,7 +31,7 @@ static unsigned int watchdog; static unsigned int watchdog_hz = 1; unsigned int watchdog_alert_counter[NR_CPUS]; -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); /* * the best way to detect whether a CPU has a 'hard lockup' problem diff --git a/arch/sparc/include/asm/nmi.h b/arch/sparc/include/asm/nmi.h index 26ad2b2607c6..284eac3ffaf2 100644 --- a/arch/sparc/include/asm/nmi.h +++ b/arch/sparc/include/asm/nmi.h @@ -7,6 +7,7 @@ void nmi_adjust_hz(unsigned int new_hz); extern atomic_t nmi_active; +void arch_touch_nmi_watchdog(void); void start_nmi_watchdog(void *unused); void stop_nmi_watchdog(void *unused); diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index 95e73c63c99d..048ad783ea3f 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -51,7 +51,7 @@ static DEFINE_PER_CPU(unsigned int, last_irq_sum); static DEFINE_PER_CPU(long, alert_counter); static DEFINE_PER_CPU(int, nmi_touch); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { if (atomic_read(&nmi_active)) { int cpu; @@ -61,10 +61,8 @@ void touch_nmi_watchdog(void) per_cpu(nmi_touch, cpu) = 1; } } - - touch_softlockup_watchdog(); } -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); static void die_nmi(const char *str, struct pt_regs *regs, int do_panic) { diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 5e2e57536d98..bd387ef8bccd 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -6,6 +6,9 @@ #include #include +#if defined(CONFIG_HAVE_NMI_WATCHDOG) +#include +#endif #ifdef CONFIG_LOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); @@ -58,6 +61,18 @@ static inline void reset_hung_task_detector(void) #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) +#if defined(CONFIG_HARDLOCKUP_DETECTOR) +extern void hardlockup_detector_disable(void); +#else +static inline void hardlockup_detector_disable(void) {} +#endif + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +extern void arch_touch_nmi_watchdog(void); +#else +static inline void arch_touch_nmi_watchdog(void) {} +#endif + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * @@ -65,21 +80,11 @@ static inline void reset_hung_task_detector(void) * may be used to reset the timeout - for code which intentionally * disables interrupts for a long time. This call is stateless. */ -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -#include -extern void touch_nmi_watchdog(void); -#else static inline void touch_nmi_watchdog(void) { + arch_touch_nmi_watchdog(); touch_softlockup_watchdog(); } -#endif - -#if defined(CONFIG_HARDLOCKUP_DETECTOR) -extern void hardlockup_detector_disable(void); -#else -static inline void hardlockup_detector_disable(void) {} -#endif /* * Create trigger_all_cpu_backtrace() out of the arch-provided diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 54a427d1f344..90d688df6ce1 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -56,7 +56,7 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { /* * Using __raw here because some code paths have @@ -66,9 +66,8 @@ void touch_nmi_watchdog(void) * going off. */ raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); } -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, -- cgit v1.2.3 From 08c700799117d994ef3e2033cb54b96e1d9c0502 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 11 Jul 2017 09:23:23 +1000 Subject: kernel/watchdog: split up config options Split SOFTLOCKUP_DETECTOR from LOCKUP_DETECTOR, and split HARDLOCKUP_DETECTOR_PERF from HARDLOCKUP_DETECTOR. LOCKUP_DETECTOR implies the general boot, sysctl, and programming interfaces for the lockup detectors. An architecture that wants to use a hard lockup detector must define HAVE_HARDLOCKUP_DETECTOR_PERF or HAVE_HARDLOCKUP_DETECTOR_ARCH. Alternatively an arch can define HAVE_NMI_WATCHDOG, which provides the minimum arch_touch_nmi_watchdog, and it otherwise does its own thing and does not implement the LOCKUP_DETECTOR interfaces. sparc is unusual in that it has started to implement some of the interfaces, but not fully yet. It should probably be converted to a full HAVE_HARDLOCKUP_DETECTOR_ARCH. Link: http://lkml.kernel.org/r/20170616065715.18390-4-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton --- arch/Kconfig | 23 ++++ arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/setup_64.c | 2 +- arch/x86/Kconfig | 1 + arch/x86/kernel/apic/hw_nmi.c | 2 +- include/linux/nmi.h | 29 +++-- kernel/Makefile | 2 +- kernel/sysctl.c | 31 +++--- kernel/watchdog.c | 243 +++++++++++++++++++++++++++-------------- kernel/watchdog_hld.c | 32 ------ lib/Kconfig.debug | 45 +++++--- 11 files changed, 252 insertions(+), 159 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index cae0958a2298..5a6eef079c64 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -288,6 +288,29 @@ config HAVE_PERF_EVENTS_NMI subsystem. Also has support for calculating CPU cycle events to determine how many clock cycles in a given period. + +config HAVE_HARDLOCKUP_DETECTOR_PERF + bool + depends on HAVE_PERF_EVENTS_NMI + help + The arch chooses to use the generic perf-NMI-based hardlockup + detector. Must define HAVE_PERF_EVENTS_NMI. + +config HAVE_NMI_WATCHDOG + bool + help + The arch provides a low level NMI watchdog. It provides + asm/nmi.h, and defines its own arch_touch_nmi_watchdog(). + +config HAVE_HARDLOCKUP_DETECTOR_ARCH + bool + select HAVE_NMI_WATCHDOG + help + The arch chooses to provide its own hardlockup detector, which is + a superset of the HAVE_NMI_WATCHDOG. It also conforms to config + interfaces and parameters provided by hardlockup detector subsystem. + + config HAVE_PERF_REGS bool help diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7177a3f4f418..63ed758e1d20 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -197,6 +197,7 @@ config PPC select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 4640f6d64f8b..074a075a9cdb 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -752,7 +752,7 @@ struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return ppc_proc_freq * watchdog_thresh; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 94a18681353d..3d2b8ce54e00 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -162,6 +162,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c73c9fb281e1..d6f387780849 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,7 +19,7 @@ #include #include -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index bd387ef8bccd..8aa01fd859fb 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -11,13 +11,21 @@ #endif #ifdef CONFIG_LOCKUP_DETECTOR +void lockup_detector_init(void); +#else +static inline void lockup_detector_init(void) +{ +} +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; -extern unsigned int hardlockup_panic; -void lockup_detector_init(void); +extern int soft_watchdog_enabled; +extern atomic_t watchdog_park_in_progress; #else static inline void touch_softlockup_watchdog_sched(void) { @@ -31,9 +39,6 @@ static inline void touch_softlockup_watchdog_sync(void) static inline void touch_all_softlockup_watchdogs(void) { } -static inline void lockup_detector_init(void) -{ -} #endif #ifdef CONFIG_DETECT_HUNG_TASK @@ -63,15 +68,18 @@ static inline void reset_hung_task_detector(void) #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); +extern unsigned int hardlockup_panic; #else static inline void hardlockup_detector_disable(void) {} #endif -#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void arch_touch_nmi_watchdog(void); #else +#if !defined(CONFIG_HAVE_NMI_WATCHDOG) static inline void arch_touch_nmi_watchdog(void) {} #endif +#endif /** * touch_nmi_watchdog - restart NMI watchdog timeout. @@ -141,15 +149,18 @@ static inline bool trigger_single_cpu_backtrace(int cpu) } #endif -#ifdef CONFIG_LOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh); +#endif + +#ifdef CONFIG_LOCKUP_DETECTOR extern int nmi_watchdog_enabled; -extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long watchdog_enabled; +extern struct cpumask watchdog_cpumask; extern unsigned long *watchdog_cpumask_bits; -extern atomic_t watchdog_park_in_progress; +extern int __read_mostly watchdog_suspended; #ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; diff --git a/kernel/Makefile b/kernel/Makefile index 72aa080f91f0..4cb8e8b23c6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index df9f2a367882..6648fbbb8157 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -900,6 +900,14 @@ static struct ctl_table kern_table[] = { .extra2 = &zero, #endif }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", .data = &soft_watchdog_enabled, @@ -909,13 +917,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, { .procname = "softlockup_panic", .data = &softlockup_panic, @@ -925,27 +926,29 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_SMP { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#endif /* CONFIG_SMP */ #endif -#ifdef CONFIG_SMP +#ifdef CONFIG_HARDLOCKUP_DETECTOR { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, + .procname = "hardlockup_panic", + .data = &hardlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", .data = &sysctl_hardlockup_all_cpu_backtrace, @@ -957,6 +960,8 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #endif +#endif + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 03e0b69bb5bf..1fba9c3d66dc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,15 +29,58 @@ #include #include +/* Watchdog configuration */ static DEFINE_MUTEX(watchdog_proc_mutex); -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +int __read_mostly nmi_watchdog_enabled; + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | + NMI_WATCHDOG_ENABLED; #else unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif -int __read_mostly nmi_watchdog_enabled; + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR int __read_mostly soft_watchdog_enabled; +#endif + int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; @@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10; int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #endif -static struct cpumask watchdog_cpumask __read_mostly; +struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); - /* * The 'watchdog_running' variable is set to 1 when the watchdog threads * are registered/started and is set to 0 when the watchdog threads are @@ -72,7 +109,27 @@ static int __read_mostly watchdog_running; * of 'watchdog_running' cannot change while the watchdog is deactivated * temporarily (see related code in 'proc' handlers). */ -static int __read_mostly watchdog_suspended; +int __read_mostly watchdog_suspended; + +/* + * These functions can be overridden if an architecture implements its + * own hardlockup detector. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + +/* Helper for online, unparked cpus. */ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) + +atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); static u64 __read_mostly sample_period; @@ -120,6 +177,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#ifdef CONFIG_HARDLOCKUP_DETECTOR static int __init hardlockup_all_cpu_backtrace_setup(char *str) { sysctl_hardlockup_all_cpu_backtrace = @@ -128,6 +186,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str) } __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -213,18 +272,6 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_touch_ts, 0); } -/* watchdog detector functions */ -bool is_hardlockup(void) -{ - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return true; - - __this_cpu_write(hrtimer_interrupts_saved, hrint); - return false; -} - static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); @@ -237,21 +284,21 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -static void watchdog_interrupt_count(void) +/* watchdog detector functions */ +bool is_hardlockup(void) { - __this_cpu_inc(hrtimer_interrupts); -} + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -/* - * These two functions are mostly architecture specific - * defining them as weak here. - */ -int __weak watchdog_nmi_enable(unsigned int cpu) -{ - return 0; + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) + return true; + + __this_cpu_write(hrtimer_interrupts_saved, hrint); + return false; } -void __weak watchdog_nmi_disable(unsigned int cpu) + +static void watchdog_interrupt_count(void) { + __this_cpu_inc(hrtimer_interrupts); } static int watchdog_enable_all_cpus(void); @@ -502,57 +549,6 @@ static void watchdog_unpark_threads(void) kthread_unpark(per_cpu(softlockup_watchdog, cpu)); } -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. - */ -int lockup_detector_suspend(void) -{ - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - mutex_unlock(&watchdog_proc_mutex); - - return ret; -} - -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. - */ -void lockup_detector_resume(void) -{ - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. - */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); -} - static int update_watchdog_all_cpus(void) { int ret; @@ -604,6 +600,81 @@ static void watchdog_disable_all_cpus(void) } } +#else /* SOFTLOCKUP */ +static int watchdog_park_threads(void) +{ + return 0; +} + +static void watchdog_unpark_threads(void) +{ +} + +static int watchdog_enable_all_cpus(void) +{ + return 0; +} + +static void watchdog_disable_all_cpus(void) +{ +} + +static void set_sample_period(void) +{ +} +#endif /* SOFTLOCKUP */ + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + get_online_cpus(); + mutex_lock(&watchdog_proc_mutex); + /* + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). + */ + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } + + mutex_unlock(&watchdog_proc_mutex); + + return ret; +} + +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) +{ + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; + /* + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. + */ + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); +} + #ifdef CONFIG_SYSCTL /* @@ -810,9 +881,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ +#ifdef CONFIG_SOFTLOCKUP_DETECTOR if (smpboot_update_cpumask_percpu_thread( &watchdog_threads, &watchdog_cpumask) != 0) pr_err("cpumask update failed\n"); +#endif } } out: diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 90d688df6ce1..295a0d84934c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -22,39 +22,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); void arch_touch_nmi_watchdog(void) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a5e3500d1e41..642c2af57e10 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -778,34 +778,45 @@ config DEBUG_SHIRQ menu "Debug Lockups and Hangs" config LOCKUP_DETECTOR - bool "Detect Hard and Soft Lockups" + bool + +config SOFTLOCKUP_DETECTOR + bool "Detect Soft Lockups" depends on DEBUG_KERNEL && !S390 + select LOCKUP_DETECTOR help Say Y here to enable the kernel to act as a watchdog to detect - hard and soft lockups. + soft lockups. Softlockups are bugs that cause the kernel to loop in kernel mode for more than 20 seconds, without giving other tasks a chance to run. The current stack trace is displayed upon detection and the system will stay locked up. +config HARDLOCKUP_DETECTOR_PERF + bool + select SOFTLOCKUP_DETECTOR + +# +# arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard +# lockup detector rather than the perf based detector. +# +config HARDLOCKUP_DETECTOR + bool "Detect Hard Lockups" + depends on DEBUG_KERNEL && !S390 + depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH + select LOCKUP_DETECTOR + select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF + select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH + help + Say Y here to enable the kernel to act as a watchdog to detect + hard lockups. + Hardlockups are bugs that cause the CPU to loop in kernel mode for more than 10 seconds, without letting other interrupts have a chance to run. The current stack trace is displayed upon detection and the system will stay locked up. - The overhead should be minimal. A periodic hrtimer runs to - generate interrupts and kick the watchdog task every 4 seconds. - An NMI is generated every 10 seconds or so to check for hardlockups. - - The frequency of hrtimer and NMI events and the soft and hard lockup - thresholds can be controlled through the sysctl watchdog_thresh. - -config HARDLOCKUP_DETECTOR - def_bool y - depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG - depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI - config BOOTPARAM_HARDLOCKUP_PANIC bool "Panic (Reboot) On Hard Lockups" depends on HARDLOCKUP_DETECTOR @@ -826,7 +837,7 @@ config BOOTPARAM_HARDLOCKUP_PANIC_VALUE config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" - depends on LOCKUP_DETECTOR + depends on SOFTLOCKUP_DETECTOR help Say Y here to enable the kernel to panic on "soft lockups", which are bugs that cause the kernel to loop in kernel @@ -843,7 +854,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE int - depends on LOCKUP_DETECTOR + depends on SOFTLOCKUP_DETECTOR range 0 1 default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC default 1 if BOOTPARAM_SOFTLOCKUP_PANIC @@ -851,7 +862,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL - default LOCKUP_DETECTOR + default SOFTLOCKUP_DETECTOR help Say Y here to enable the kernel to detect "hung tasks", which are bugs that cause the task to be stuck in -- cgit v1.2.3 From c44715d36e4a5d91ff9b1ebd0d63fcae37689d38 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 11 Jul 2017 09:23:24 +1000 Subject: watchdog-split-up-config-options-fix Link: http://lkml.kernel.org/r/20170617223522.66c0ad88@roar.ozlabs.ibm.com Signed-off-by: Andrew Morton --- arch/Kconfig | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 5a6eef079c64..fb9bd7d36b05 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -198,9 +198,6 @@ config HAVE_KPROBES_ON_FTRACE config HAVE_NMI bool -config HAVE_NMI_WATCHDOG - depends on HAVE_NMI - bool # # An arch should select this if it provides all these things: # @@ -288,7 +285,6 @@ config HAVE_PERF_EVENTS_NMI subsystem. Also has support for calculating CPU cycle events to determine how many clock cycles in a given period. - config HAVE_HARDLOCKUP_DETECTOR_PERF bool depends on HAVE_PERF_EVENTS_NMI @@ -297,6 +293,7 @@ config HAVE_HARDLOCKUP_DETECTOR_PERF detector. Must define HAVE_PERF_EVENTS_NMI. config HAVE_NMI_WATCHDOG + depends on HAVE_NMI bool help The arch provides a low level NMI watchdog. It provides @@ -310,7 +307,6 @@ config HAVE_HARDLOCKUP_DETECTOR_ARCH a superset of the HAVE_NMI_WATCHDOG. It also conforms to config interfaces and parameters provided by hardlockup detector subsystem. - config HAVE_PERF_REGS bool help -- cgit v1.2.3 From 87150a1429d93219259513c1e2cb331658a4f08e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 11 Jul 2017 09:23:24 +1000 Subject: kernel/watchdog: provide watchdog_nmi_reconfigure() for arch watchdogs After reconfiguring watchdog sysctls etc., architecture specific watchdogs may not get all their parameters updated. watchdog_nmi_reconfigure() can be implemented to pull the new values in and set the arch NMI watchdog. Link: http://lkml.kernel.org/r/20170616065715.18390-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton --- kernel/watchdog.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1fba9c3d66dc..06cd965f64d2 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -123,6 +123,11 @@ void __weak watchdog_nmi_disable(unsigned int cpu) { } +void __weak watchdog_nmi_reconfigure(void) +{ +} + + #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* Helper for online, unparked cpus. */ @@ -600,6 +605,12 @@ static void watchdog_disable_all_cpus(void) } } +static int watchdog_update_cpus(void) +{ + return smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask); +} + #else /* SOFTLOCKUP */ static int watchdog_park_threads(void) { @@ -619,6 +630,11 @@ static void watchdog_disable_all_cpus(void) { } +static int watchdog_update_cpus(void) +{ + return 0; +} + static void set_sample_period(void) { } @@ -651,6 +667,8 @@ int lockup_detector_suspend(void) watchdog_enabled = 0; } + watchdog_nmi_reconfigure(); + mutex_unlock(&watchdog_proc_mutex); return ret; @@ -671,6 +689,8 @@ void lockup_detector_resume(void) if (watchdog_running && !watchdog_suspended) watchdog_unpark_threads(); + watchdog_nmi_reconfigure(); + mutex_unlock(&watchdog_proc_mutex); put_online_cpus(); } @@ -696,6 +716,8 @@ static int proc_watchdog_update(void) else watchdog_disable_all_cpus(); + watchdog_nmi_reconfigure(); + return err; } @@ -881,12 +903,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask) != 0) + if (watchdog_update_cpus() != 0) pr_err("cpumask update failed\n"); -#endif } + + watchdog_nmi_reconfigure(); } out: mutex_unlock(&watchdog_proc_mutex); -- cgit v1.2.3 From bc223c9dcd188ae270f992600ce7214042179412 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 11 Jul 2017 09:23:24 +1000 Subject: watchdog-provide-watchdog_reconfigure-for-arch-watchdogs-fix add code comments Link: http://lkml.kernel.org/r/20170617125933.774d3858@roar.ozlabs.ibm.com Cc: Don Zickus Cc: Babu Moger Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton --- kernel/watchdog.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 06cd965f64d2..36531025496f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -114,6 +114,10 @@ int __read_mostly watchdog_suspended; /* * These functions can be overridden if an architecture implements its * own hardlockup detector. + * + * watchdog_nmi_enable/disable can be implemented to start and stop when + * softlockup watchdog threads start and stop. The arch must select the + * SOFTLOCKUP_DETECTOR Kconfig. */ int __weak watchdog_nmi_enable(unsigned int cpu) { @@ -123,6 +127,17 @@ void __weak watchdog_nmi_disable(unsigned int cpu) { } +/* + * watchdog_nmi_reconfigure can be implemented to be notified after any + * watchdog configuration change. The arch hardlockup watchdog should + * respond to the following variables: + * - nmi_watchdog_enabled + * - watchdog_thresh + * - watchdog_cpumask + * - sysctl_hardlockup_all_cpu_backtrace + * - hardlockup_panic + * - watchdog_suspended + */ void __weak watchdog_nmi_reconfigure(void) { } -- cgit v1.2.3 From db4633c562ebab60d5aca8729b87289e57a27b16 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 11 Jul 2017 09:23:24 +1000 Subject: kernel/watchdog: hide unused function watchdog_update_cpus() is defined unconditionally, but only used when CONFIG_SYSCTL is defined: kernel/watchdog.c:608:12: error: 'watchdog_update_cpus' defined but not used [-Werror=unused-function] This adds another #ifdef around it. Fixes: mmotm ("kernel/watchdog: provide watchdog_nmi_reconfigure() for arch watchdogs") Link: http://lkml.kernel.org/r/20170620204854.966601-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Don Zickus Cc: Ingo Molnar Cc: Babu Moger Cc: Nicholas Piggin Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton --- kernel/watchdog.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 36531025496f..cabe3e9fb620 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -620,11 +620,13 @@ static void watchdog_disable_all_cpus(void) } } +#ifdef CONFIG_SYSCTL static int watchdog_update_cpus(void) { return smpboot_update_cpumask_percpu_thread( &watchdog_threads, &watchdog_cpumask); } +#endif #else /* SOFTLOCKUP */ static int watchdog_park_threads(void) @@ -645,10 +647,12 @@ static void watchdog_disable_all_cpus(void) { } +#ifdef CONFIG_SYSCTL static int watchdog_update_cpus(void) { return 0; } +#endif static void set_sample_period(void) { -- cgit v1.2.3 From 4bc49dd37bbb3af5cc0a775908e971c3a2683407 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 11 Jul 2017 09:23:24 +1000 Subject: powerpc/64s: implement arch-specific hardlockup watchdog Implement an arch-speicfic watchdog rather than use the perf-based hardlockup detector. The new watchdog takes the soft-NMI directly, rather than going through perf. Perf interrupts are to be made maskable in future, so that would prevent the perf detector from working in those regions. Additionally, implement a SMP based detector where all CPUs watch one another by pinging a shared cpumask. This is because powerpc Book3S does not have a true periodic local NMI, but some platforms do implement a true NMI IPI. If a CPU is stuck with interrupts hard disabled, the soft-NMI watchdog does not work, but the SMP watchdog will. Even on platforms without a true NMI IPI to get a good trace from the stuck CPU, other CPUs will notice the lockup sufficiently to report it and panic. Link: http://lkml.kernel.org/r/20170616065715.18390-6-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 7 +- arch/powerpc/include/asm/nmi.h | 11 + arch/powerpc/include/asm/smp.h | 2 + arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/exceptions-64s.S | 30 ++- arch/powerpc/kernel/kvm.c | 7 + arch/powerpc/kernel/setup_64.c | 19 -- arch/powerpc/kernel/smp.c | 20 +- arch/powerpc/kernel/watchdog.c | 388 +++++++++++++++++++++++++++++++++++ 9 files changed, 460 insertions(+), 25 deletions(-) create mode 100644 arch/powerpc/kernel/watchdog.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 63ed758e1d20..fce2f4f20891 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -82,7 +82,7 @@ config NR_IRQS config NMI_IPI bool - depends on SMP && (DEBUGGER || KEXEC_CORE) + depends on SMP && (DEBUGGER || KEXEC_CORE || HARDLOCKUP_DETECTOR) default y config STACKTRACE_SUPPORT @@ -192,12 +192,13 @@ config PPC select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MOD_ARCH_SPECIFIC - select HAVE_NMI if PERF_EVENTS + select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) + select HAVE_HARDLOCKUP_DETECTOR_ARCH if (PPC64 && PPC_BOOK3S) select HAVE_OPROFILE select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index ff1ccb375e60..6f8e79cd35d8 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -1,4 +1,15 @@ #ifndef _ASM_NMI_H #define _ASM_NMI_H +#ifdef CONFIG_HARDLOCKUP_DETECTOR +extern void arch_touch_nmi_watchdog(void); + +extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, + bool exclude_self); +#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace + +#else +static inline void arch_touch_nmi_watchdog(void) {} +#endif + #endif /* _ASM_NMI_H */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index ebddb2111d87..8ea98504f900 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -55,6 +55,8 @@ struct smp_ops_t { int (*cpu_bootable)(unsigned int nr); }; +extern void smp_flush_nmi_ipi(u64 delay_us); +extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); extern void smp_send_debugger_break(void); extern void start_secondary_resume(void); extern void smp_generic_give_timebase(void); diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 0845eebc5af3..4aa7c147e447 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o obj-$(CONFIG_VDSO32) += vdso32/ +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4c18a5fbb4bb..e6d8354d79ef 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1314,6 +1314,31 @@ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) #endif +#if defined(CONFIG_HARDLOCKUP_DETECTOR) && defined(CONFIG_HAVE_HARDLOCKUP_DETECTOR_ARCH) + +#define MASKED_DEC_HANDLER_LABEL 3f + +#define MASKED_DEC_HANDLER(_H) \ +3: /* soft-nmi */ \ + std r12,PACA_EXGEN+EX_R12(r13); \ + GET_SCRATCH0(r10); \ + std r10,PACA_EXGEN+EX_R13(r13); \ + EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H) + +EXC_COMMON_BEGIN(soft_nmi_common) + mr r10,r1 + ld r1,PACAEMERGSP(r13) + ld r1,PACA_NMI_EMERG_SP(r13) + subi r1,r1,INT_FRAME_SIZE + EXCEPTION_COMMON_NORET_STACK(PACA_EXGEN, 0x900, + system_reset, soft_nmi_interrupt, + ADD_NVGPRS;ADD_RECONCILE) + b ret_from_except + +#else +#define MASKED_DEC_HANDLER_LABEL 2f /* normal return */ +#define MASKED_DEC_HANDLER(_H) +#endif /* * An interrupt came in while soft-disabled. We set paca->irq_happened, then: @@ -1336,7 +1361,7 @@ masked_##_H##interrupt: \ lis r10,0x7fff; \ ori r10,r10,0xffff; \ mtspr SPRN_DEC,r10; \ - b 2f; \ + b MASKED_DEC_HANDLER_LABEL; \ 1: cmpwi r10,PACA_IRQ_DBELL; \ beq 2f; \ cmpwi r10,PACA_IRQ_HMI; \ @@ -1351,7 +1376,8 @@ masked_##_H##interrupt: \ ld r11,PACA_EXGEN+EX_R11(r13); \ GET_SCRATCH0(r13); \ ##_H##rfid; \ - b . + b .; \ + MASKED_DEC_HANDLER(_H) /* * Real mode exceptions actually use this too, but alternate diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 9ad37f827a97..1086ea37c832 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -25,6 +25,7 @@ #include #include #include +#include /* hardlockup_detector_disable() */ #include #include @@ -718,6 +719,12 @@ static __init void kvm_free_tmp(void) static int __init kvm_guest_init(void) { + /* + * The hardlockup detector is likely to get false positives in + * KVM guests, so disable it by default. + */ + hardlockup_detector_disable(); + if (!kvm_para_available()) goto free_tmp; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 074a075a9cdb..af23d4b576ec 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -751,22 +751,3 @@ unsigned long memory_block_size_bytes(void) struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif - -#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF -u64 hw_nmi_get_sample_period(int watchdog_thresh) -{ - return ppc_proc_freq * watchdog_thresh; -} - -/* - * The hardlockup detector breaks PMU event based branches and is likely - * to get false positives in KVM guests, so disable it by default. - */ -static int __init disable_hardlockup_detector(void) -{ - hardlockup_detector_disable(); - - return 0; -} -early_initcall(disable_hardlockup_detector); -#endif diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index c6b8bace1766..997c88d54acf 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -435,13 +435,31 @@ static void do_smp_send_nmi_ipi(int cpu) } } +void smp_flush_nmi_ipi(u64 delay_us) +{ + unsigned long flags; + + nmi_ipi_lock_start(&flags); + while (nmi_ipi_busy_count) { + nmi_ipi_unlock_end(&flags); + udelay(1); + if (delay_us) { + delay_us--; + if (!delay_us) + return; + } + nmi_ipi_lock_start(&flags); + } + nmi_ipi_unlock_end(&flags); +} + /* * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS. * - fn is the target callback function. * - delay_us > 0 is the delay before giving up waiting for targets to * enter the handler, == 0 specifies indefinite delay. */ -static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) +int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) { unsigned long flags; int me = raw_smp_processor_id(); diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c new file mode 100644 index 000000000000..4ed9cf37f369 --- /dev/null +++ b/arch/powerpc/kernel/watchdog.c @@ -0,0 +1,388 @@ +/* + * Watchdog support on powerpc systems. + * + * Copyright 2017, IBM Corporation. + * + * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * The watchdog has a simple timer that runs on each CPU, once per timer + * period. This is the heartbeat. + * + * Then there are checks to see if the heartbeat has not triggered on a CPU + * for the panic timeout period. Currently the watchdog only supports an + * SMP check, so the heartbeat only turns on when we have 2 or more CPUs. + * + * This is not an NMI watchdog, but Linux uses that name for a generic + * watchdog in some cases, so NMI gets used in some places. + */ + +static cpumask_t wd_cpus_enabled __read_mostly; + +static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */ +static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */ + +static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeat */ + +static DEFINE_PER_CPU(struct timer_list, wd_timer); +static DEFINE_PER_CPU(u64, wd_timer_tb); + +/* + * These are for the SMP checker. CPUs clear their pending bit in their + * heartbeat. If the bitmask becomes empty, the time is noted and the + * bitmask is refilled. + * + * All CPUs clear their bit in the pending mask every timer period. + * Once all have cleared, the time is noted and the bits are reset. + * If the time since all clear was greater than the panic timeout, + * we can panic with the list of stuck CPUs. + * + * This will work best with NMI IPIs for crash code so the stuck CPUs + * can be pulled out to get their backtraces. + */ +static unsigned long __wd_smp_lock = 0; +static cpumask_t wd_smp_cpus_pending; +static cpumask_t wd_smp_cpus_stuck; +static u64 wd_smp_last_reset_tb; + +static inline void wd_smp_lock(unsigned long *flags) +{ + /* + * Avoid locking layers if possible. + * This may be called from low level interrupt handlers at some + * point in future. + */ + local_irq_save(*flags); + while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) + cpu_relax(); +} + +static inline void wd_smp_unlock(unsigned long *flags) +{ + clear_bit_unlock(0, &__wd_smp_lock); + local_irq_restore(*flags); +} + +static void wd_lockup_ipi(struct pt_regs *regs) +{ + pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", raw_smp_processor_id()); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); +} + +static void set_cpu_stuck(int cpu, u64 tb) +{ + cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + if (cpumask_empty(&wd_smp_cpus_pending)) { + wd_smp_last_reset_tb = tb; + cpumask_andnot(&wd_smp_cpus_pending, + &wd_cpus_enabled, + &wd_smp_cpus_stuck); + } +} + +static void watchdog_smp_panic(int cpu, u64 tb) +{ + unsigned long flags; + int c; + + wd_smp_lock(&flags); + /* Double check some things under lock */ + if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb) + goto out; + if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) + goto out; + if (cpumask_weight(&wd_smp_cpus_pending) == 0) + goto out; + + pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n", + cpu, cpumask_pr_args(&wd_smp_cpus_pending)); + + /* + * Try to trigger the stuck CPUs. + */ + for_each_cpu(c, &wd_smp_cpus_pending) { + if (c == cpu) + continue; + smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + } + smp_flush_nmi_ipi(1000000); + + /* Take the stuck CPU out of the watch group */ + for_each_cpu(c, &wd_smp_cpus_pending) + set_cpu_stuck(c, tb); + +out: + wd_smp_unlock(&flags); + + printk_safe_flush(); + /* + * printk_safe_flush() seems to require another print + * before anything actually goes out to console. + */ + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(NULL, "Hard LOCKUP"); +} + +static void wd_smp_clear_cpu_pending(int cpu, u64 tb) +{ + if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) { + if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) { + unsigned long flags; + + pr_emerg("Watchdog CPU:%d became unstuck\n", cpu); + wd_smp_lock(&flags); + cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck); + wd_smp_unlock(&flags); + } + return; + } + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + if (cpumask_empty(&wd_smp_cpus_pending)) { + unsigned long flags; + + wd_smp_lock(&flags); + if (cpumask_empty(&wd_smp_cpus_pending)) { + wd_smp_last_reset_tb = tb; + cpumask_andnot(&wd_smp_cpus_pending, + &wd_cpus_enabled, + &wd_smp_cpus_stuck); + } + wd_smp_unlock(&flags); + } +} + +static void watchdog_timer_interrupt(int cpu) +{ + u64 tb = get_tb(); + + per_cpu(wd_timer_tb, cpu) = tb; + + wd_smp_clear_cpu_pending(cpu, tb); + + if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb) + watchdog_smp_panic(cpu, tb); +} + +void soft_nmi_interrupt(struct pt_regs *regs) +{ + unsigned long flags; + int cpu = raw_smp_processor_id(); + u64 tb; + + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) + return; + + nmi_enter(); + tb = get_tb(); + if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) { + per_cpu(wd_timer_tb, cpu) = tb; + + wd_smp_lock(&flags); + if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) { + wd_smp_unlock(&flags); + goto out; + } + set_cpu_stuck(cpu, tb); + + pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + wd_smp_unlock(&flags); + + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + } + if (wd_panic_timeout_tb < 0x7fffffff) + mtspr(SPRN_DEC, wd_panic_timeout_tb); + +out: + nmi_exit(); +} + +static void wd_timer_reset(unsigned int cpu, struct timer_list *t) +{ + t->expires = jiffies + msecs_to_jiffies(wd_timer_period_ms); + if (wd_timer_period_ms > 1000) + t->expires = __round_jiffies_up(t->expires, cpu); + add_timer_on(t, cpu); +} + +static void wd_timer_fn(unsigned long data) +{ + struct timer_list *t = this_cpu_ptr(&wd_timer); + int cpu = smp_processor_id(); + + watchdog_timer_interrupt(cpu); + + wd_timer_reset(cpu, t); +} + +void arch_touch_nmi_watchdog(void) +{ + int cpu = smp_processor_id(); + + watchdog_timer_interrupt(cpu); +} +EXPORT_SYMBOL(arch_touch_nmi_watchdog); + +static void start_watchdog_timer_on(unsigned int cpu) +{ + struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); + + per_cpu(wd_timer_tb, cpu) = get_tb(); + + setup_pinned_timer(t, wd_timer_fn, 0); + wd_timer_reset(cpu, t); +} + +static void stop_watchdog_timer_on(unsigned int cpu) +{ + struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); + + del_timer_sync(t); +} + +static int start_wd_on_cpu(unsigned int cpu) +{ + if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) { + WARN_ON(1); + return 0; + } + + if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) + return 0; + + cpumask_set_cpu(cpu, &wd_cpus_enabled); + if (cpumask_weight(&wd_cpus_enabled) == 1) { + cpumask_set_cpu(cpu, &wd_smp_cpus_pending); + wd_smp_last_reset_tb = get_tb(); + } + smp_wmb(); + start_watchdog_timer_on(cpu); + + return 0; +} + +static int stop_wd_on_cpu(unsigned int cpu) +{ + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) { + WARN_ON(1); + return 0; + } + + stop_watchdog_timer_on(cpu); + + cpumask_clear_cpu(cpu, &wd_cpus_enabled); + wd_smp_clear_cpu_pending(cpu, get_tb()); + + return 0; +} + +static void watchdog_calc_timeouts(void) +{ + wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq; + + /* Have the SMP detector trigger a bit later */ + wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2; + + /* 2/5 is the factor that the perf based detector uses */ + wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; +} + +void watchdog_nmi_reconfigure(void) +{ + int cpu; + + watchdog_calc_timeouts(); + + for_each_cpu(cpu, &wd_cpus_enabled) + stop_wd_on_cpu(cpu); + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return; + + if (watchdog_suspended) + return; + + for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) + start_wd_on_cpu(cpu); +} + +/* + * This runs after lockup_detector_init() which sets up watchdog_cpumask. + */ +static int __init powerpc_watchdog_init(void) +{ + int err; + + watchdog_calc_timeouts(); + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", + start_wd_on_cpu, stop_wd_on_cpu); + if (err < 0) + pr_warning("Watchdog could not be initialized"); + + return 0; +} +arch_initcall(powerpc_watchdog_init); + +static void handle_backtrace_ipi(struct pt_regs *regs) +{ + nmi_cpu_backtrace(regs); +} + +static void raise_backtrace_ipi(cpumask_t *mask) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu == smp_processor_id()) + handle_backtrace_ipi(NULL); + else + smp_send_nmi_ipi(cpu, handle_backtrace_ipi, 1000000); + } +} + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) +{ + nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); +} -- cgit v1.2.3 From 2c54f7900db0853c425649e4c103a724ef8a6f14 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 11 Jul 2017 09:23:24 +1000 Subject: powerpc/64s: watchdog honor watchdog disable at boot/hotplug The boot/hotplug paths were not taking into account if the watchdog was disabled or suspended. Currently only the watchdog reconfiguration path is. Link: http://lkml.kernel.org/r/20170621001346.5bb337c9@roar.ozlabs.ibm.com Signed-off-by: Nicholas Piggin Cc: Don Zickus Cc: Babu Moger Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton --- arch/powerpc/kernel/watchdog.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 4ed9cf37f369..d46040c0da40 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -288,6 +288,12 @@ static int start_wd_on_cpu(unsigned int cpu) return 0; } + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return 0; + + if (watchdog_suspended) + return 0; + if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) return 0; @@ -337,12 +343,6 @@ void watchdog_nmi_reconfigure(void) for_each_cpu(cpu, &wd_cpus_enabled) stop_wd_on_cpu(cpu); - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - return; - - if (watchdog_suspended) - return; - for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) start_wd_on_cpu(cpu); } -- cgit v1.2.3 From d7e7e34b4505652fe38aa8a8197c79035e40dae2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 11 Jul 2017 09:23:24 +1000 Subject: powerpc/64s: watchdog false positive warning at CPU unplug CPU unplug will call stop_wd_on_cpu regardless if the watchdog has been configured to be enabled on that CPU. Don't warn in the case it's not in our enabled mask, this is a valid case. Fixes: powerpc-64s-implement-arch-specific-hardlockup-watchdog.patch Link: http://lkml.kernel.org/r/20170630080740.20766-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reported-by: Santosh Sivaraj Acked-by: Michael Ellerman Cc: Don Zickus Signed-off-by: Andrew Morton --- arch/powerpc/kernel/watchdog.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index d46040c0da40..518afd1feda7 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -310,10 +310,8 @@ static int start_wd_on_cpu(unsigned int cpu) static int stop_wd_on_cpu(unsigned int cpu) { - if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) { - WARN_ON(1); - return 0; - } + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) + return 0; /* Can happen in CPU unplug case */ stop_watchdog_timer_on(cpu); -- cgit v1.2.3 From c4a5dfce7872ea5338d47e5c3be0414ffa59a851 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 11 Jul 2017 09:23:24 +1000 Subject: powerpc-64s-implement-arch-specific-hardlockup-watchdog-checkpatch-fixes WARNING: line over 80 characters #97: FILE: arch/powerpc/include/asm/smp.h:59: +extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); WARNING: line over 80 characters #119: FILE: arch/powerpc/kernel/exceptions-64s.S:1276: +#if defined(CONFIG_HARDLOCKUP_DETECTOR) && defined(CONFIG_HAVE_HARDLOCKUP_DETECTOR_ARCH) ERROR: do not initialise statics to 0 #317: FILE: arch/powerpc/kernel/watchdog.c:62: +static unsigned long __wd_smp_lock = 0; WARNING: memory barrier without comment #554: FILE: arch/powerpc/kernel/watchdog.c:299: + smp_wmb(); WARNING: Prefer pr_warn(... to pr_warning(... #617: FILE: arch/powerpc/kernel/watchdog.c:362: + pr_warning("Watchdog could not be initialized"); total: 1 errors, 4 warnings, 543 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/powerpc-64s-implement-arch-specific-hardlockup-watchdog.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/powerpc/kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 518afd1feda7..b67f8b03a32d 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -59,7 +59,7 @@ static DEFINE_PER_CPU(u64, wd_timer_tb); * This will work best with NMI IPIs for crash code so the stuck CPUs * can be pulled out to get their backtraces. */ -static unsigned long __wd_smp_lock = 0; +static unsigned long __wd_smp_lock; static cpumask_t wd_smp_cpus_pending; static cpumask_t wd_smp_cpus_stuck; static u64 wd_smp_last_reset_tb; @@ -357,7 +357,7 @@ static int __init powerpc_watchdog_init(void) err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", start_wd_on_cpu, stop_wd_on_cpu); if (err < 0) - pr_warning("Watchdog could not be initialized"); + pr_warn("Watchdog could not be initialized"); return 0; } -- cgit v1.2.3 From 1539b2f90f3dae98455c800136359ad7d05a0e28 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Jul 2017 09:23:25 +1000 Subject: efi: avoid fortify checks in EFI stub This avoids CONFIG_FORTIFY_SOURCE from being enabled during the EFI stub build, as adding a panic() implementation may not work well. This can be adjusted in the future. Link: http://lkml.kernel.org/r/1497903987-21002-2-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Suggested-by: Daniel Micay Reviewed-by: Ard Biesheuvel Acked-by: Mark Rutland Cc: Matt Fleming Signed-off-by: Andrew Morton --- drivers/firmware/efi/libstub/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index f7425960f6a5..37e24f525162 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -17,6 +17,7 @@ cflags-$(CONFIG_ARM) := $(subst -pg,,$(KBUILD_CFLAGS)) \ cflags-$(CONFIG_EFI_ARMSTUB) += -I$(srctree)/scripts/dtc/libfdt KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \ + -D__NO_FORTIFY \ $(call cc-option,-ffreestanding) \ $(call cc-option,-fno-stack-protector) -- cgit v1.2.3 From 0e00f9638614b1551dc6e8c02df6bf8ea4c68768 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Jul 2017 09:23:25 +1000 Subject: kexec_file: adjust declaration of kexec_purgatory Defining kexec_purgatory as a zero-length char array upsets compile time size checking. Since this is built on a per-arch basis, define it as an unsized char array (like is done for other similar things, e.g. linker sections). This silences the warning generated by the future CONFIG_FORTIFY_SOURCE, which did not like the memcmp() of a "0 byte" array. This drops the __weak and uses an extern instead, since both users define kexec_purgatory. Link: http://lkml.kernel.org/r/1497903987-21002-4-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Acked-by: "Eric W. Biederman" Cc: Daniel Micay Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 7 ------- kernel/kexec_internal.h | 2 ++ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c8f7f77e9fa9..9f48f4412297 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,13 +26,6 @@ #include #include "kexec_internal.h" -/* - * Declare these symbols weak so that if architecture provides a purgatory, - * these will be overridden. - */ -char __weak kexec_purgatory[0]; -size_t __weak kexec_purgatory_size = 0; - static int kexec_calculate_store_digests(struct kimage *image); /* Architectures can provide this probe function */ diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 799a8a452187..50dfcb039a41 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -17,6 +17,8 @@ extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE #include void kimage_file_post_load_cleanup(struct kimage *image); +extern char kexec_purgatory[]; +extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ -- cgit v1.2.3 From ffda583f4719e25e2a15adc2fb6fb11eb8889306 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Jul 2017 09:23:25 +1000 Subject: IB/rxe: do not copy extra stack memory to skb This fixes a over-read condition detected by FORTIFY_SOURCE for this line: memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(skb->cb)); The error was: In file included from ./include/linux/bitmap.h:8:0, from ./include/linux/cpumask.h:11, from ./include/linux/mm_types_task.h:13, from ./include/linux/mm_types.h:4, from ./include/linux/kmemcheck.h:4, from ./include/linux/skbuff.h:18, from drivers/infiniband/sw/rxe/rxe_resp.c:34: In function 'memcpy', inlined from 'send_atomic_ack.constprop' at drivers/infiniband/sw/rxe/rxe_resp.c:998:2, inlined from 'acknowledge' at drivers/infiniband/sw/rxe/rxe_resp.c:1026:3, inlined from 'rxe_responder' at drivers/infiniband/sw/rxe/rxe_resp.c:1286:10: ./include/linux/string.h:309:4: error: call to '__read_overflow2' declared with attribute error: detected read beyond size of object passed as 2nd parameter __read_overflow2(); Daniel Micay noted that struct rxe_pkt_info is 32 bytes on 32-bit architectures, but skb->cb is still 64. The memcpy() over-reads 32 bytes. This fixes it by zeroing the unused bytes in skb->cb. Link: http://lkml.kernel.org/r/1497903987-21002-5-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Moni Shoua Cc: Doug Ledford Cc: Sean Hefty Cc: Daniel Micay Signed-off-by: Andrew Morton --- drivers/infiniband/sw/rxe/rxe_resp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 23039768f541..be944d5aa9af 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -995,7 +995,9 @@ static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, free_rd_atomic_resource(qp, res); rxe_advance_resp_resource(qp); - memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(skb->cb)); + memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(ack_pkt)); + memset((unsigned char *)SKB_TO_PKT(skb) + sizeof(ack_pkt), 0, + sizeof(skb->cb) - sizeof(ack_pkt)); res->type = RXE_ATOMIC_MASK; res->atomic.skb = skb; -- cgit v1.2.3 From fd0df95a94828e221472c44e7313787d9da87fb8 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 11 Jul 2017 09:23:25 +1000 Subject: powerpc: don't fortify prom_init prom_init is a bit special; in theory it should be able to be linked separately to the kernel. To keep this from getting too complex, the symbols that prom_init.c uses are checked. Fortification adds symbols, and it gets quite messy as it includes things like panic(). So just don't fortify prom_init.c for now. Link: http://lkml.kernel.org/r/1497903987-21002-6-git-send-email-keescook@chromium.org Signed-off-by: Daniel Axtens Signed-off-by: Kees Cook Acked-by: Michael Ellerman Cc: Daniel Micay Signed-off-by: Andrew Morton --- arch/powerpc/kernel/prom_init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index dd8a04f3053a..613f79f03877 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -15,6 +15,9 @@ #undef DEBUG_PROM +/* we cannot use FORTIFY as it brings in new symbols */ +#define __NO_FORTIFY + #include #include #include -- cgit v1.2.3 From 5368b4cf4f1701bb18104bb726fb4242b824d865 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 11 Jul 2017 09:23:25 +1000 Subject: powerpc: make feature-fixup tests fortify-safe Testing the fortified string functions[1] would cause a kernel panic on boot in test_feature_fixups() due to a buffer overflow in memcmp. This boils down to things like this: extern unsigned int ftr_fixup_test1; extern unsigned int ftr_fixup_test1_orig; check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0); We know that these are asm labels so it is safe to read up to 'size' bytes at those addresses. However, because we have passed the address of a single unsigned int to memcmp, the compiler believes the underlying object is in fact a single unsigned int. So if size > sizeof(unsigned int), there will be a panic at runtime. We can fix this by changing the types: instead of calling the asm labels unsigned ints, call them unsigned int[]s. Therefore the size isn't incorrectly determined at compile time and we get a regular unsafe memcmp and no panic. [1] http://openwall.com/lists/kernel-hardening/2017/05/09/2 Link: http://lkml.kernel.org/r/1497903987-21002-7-git-send-email-keescook@chromium.org Signed-off-by: Daniel Axtens Signed-off-by: Kees Cook Suggested-by: Michael Ellerman Tested-by: Andrew Donnellan Reviewed-by: Andrew Donnellan Cc: Kees Cook Cc: Daniel Micay Signed-off-by: Andrew Morton --- arch/powerpc/lib/feature-fixups.c | 180 +++++++++++++++++++------------------- 1 file changed, 90 insertions(+), 90 deletions(-) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index f3917705c686..41cf5ae273cf 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -233,192 +233,192 @@ static long calc_offset(struct fixup_entry *entry, unsigned int *p) static void test_basic_patching(void) { - extern unsigned int ftr_fixup_test1; - extern unsigned int end_ftr_fixup_test1; - extern unsigned int ftr_fixup_test1_orig; - extern unsigned int ftr_fixup_test1_expected; - int size = &end_ftr_fixup_test1 - &ftr_fixup_test1; + extern unsigned int ftr_fixup_test1[]; + extern unsigned int end_ftr_fixup_test1[]; + extern unsigned int ftr_fixup_test1_orig[]; + extern unsigned int ftr_fixup_test1_expected[]; + int size = end_ftr_fixup_test1 - ftr_fixup_test1; fixup.value = fixup.mask = 8; - fixup.start_off = calc_offset(&fixup, &ftr_fixup_test1 + 1); - fixup.end_off = calc_offset(&fixup, &ftr_fixup_test1 + 2); + fixup.start_off = calc_offset(&fixup, ftr_fixup_test1 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_test1 + 2); fixup.alt_start_off = fixup.alt_end_off = 0; /* Sanity check */ - check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0); + check(memcmp(ftr_fixup_test1, ftr_fixup_test1_orig, size) == 0); /* Check we don't patch if the value matches */ patch_feature_section(8, &fixup); - check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0); + check(memcmp(ftr_fixup_test1, ftr_fixup_test1_orig, size) == 0); /* Check we do patch if the value doesn't match */ patch_feature_section(0, &fixup); - check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_expected, size) == 0); + check(memcmp(ftr_fixup_test1, ftr_fixup_test1_expected, size) == 0); /* Check we do patch if the mask doesn't match */ - memcpy(&ftr_fixup_test1, &ftr_fixup_test1_orig, size); - check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0); + memcpy(ftr_fixup_test1, ftr_fixup_test1_orig, size); + check(memcmp(ftr_fixup_test1, ftr_fixup_test1_orig, size) == 0); patch_feature_section(~8, &fixup); - check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_expected, size) == 0); + check(memcmp(ftr_fixup_test1, ftr_fixup_test1_expected, size) == 0); } static void test_alternative_patching(void) { - extern unsigned int ftr_fixup_test2; - extern unsigned int end_ftr_fixup_test2; - extern unsigned int ftr_fixup_test2_orig; - extern unsigned int ftr_fixup_test2_alt; - extern unsigned int ftr_fixup_test2_expected; - int size = &end_ftr_fixup_test2 - &ftr_fixup_test2; + extern unsigned int ftr_fixup_test2[]; + extern unsigned int end_ftr_fixup_test2[]; + extern unsigned int ftr_fixup_test2_orig[]; + extern unsigned int ftr_fixup_test2_alt[]; + extern unsigned int ftr_fixup_test2_expected[]; + int size = end_ftr_fixup_test2 - ftr_fixup_test2; fixup.value = fixup.mask = 0xF; - fixup.start_off = calc_offset(&fixup, &ftr_fixup_test2 + 1); - fixup.end_off = calc_offset(&fixup, &ftr_fixup_test2 + 2); - fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test2_alt); - fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test2_alt + 1); + fixup.start_off = calc_offset(&fixup, ftr_fixup_test2 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_test2 + 2); + fixup.alt_start_off = calc_offset(&fixup, ftr_fixup_test2_alt); + fixup.alt_end_off = calc_offset(&fixup, ftr_fixup_test2_alt + 1); /* Sanity check */ - check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0); + check(memcmp(ftr_fixup_test2, ftr_fixup_test2_orig, size) == 0); /* Check we don't patch if the value matches */ patch_feature_section(0xF, &fixup); - check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0); + check(memcmp(ftr_fixup_test2, ftr_fixup_test2_orig, size) == 0); /* Check we do patch if the value doesn't match */ patch_feature_section(0, &fixup); - check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_expected, size) == 0); + check(memcmp(ftr_fixup_test2, ftr_fixup_test2_expected, size) == 0); /* Check we do patch if the mask doesn't match */ - memcpy(&ftr_fixup_test2, &ftr_fixup_test2_orig, size); - check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0); + memcpy(ftr_fixup_test2, ftr_fixup_test2_orig, size); + check(memcmp(ftr_fixup_test2, ftr_fixup_test2_orig, size) == 0); patch_feature_section(~0xF, &fixup); - check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_expected, size) == 0); + check(memcmp(ftr_fixup_test2, ftr_fixup_test2_expected, size) == 0); } static void test_alternative_case_too_big(void) { - extern unsigned int ftr_fixup_test3; - extern unsigned int end_ftr_fixup_test3; - extern unsigned int ftr_fixup_test3_orig; - extern unsigned int ftr_fixup_test3_alt; - int size = &end_ftr_fixup_test3 - &ftr_fixup_test3; + extern unsigned int ftr_fixup_test3[]; + extern unsigned int end_ftr_fixup_test3[]; + extern unsigned int ftr_fixup_test3_orig[]; + extern unsigned int ftr_fixup_test3_alt[]; + int size = end_ftr_fixup_test3 - ftr_fixup_test3; fixup.value = fixup.mask = 0xC; - fixup.start_off = calc_offset(&fixup, &ftr_fixup_test3 + 1); - fixup.end_off = calc_offset(&fixup, &ftr_fixup_test3 + 2); - fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test3_alt); - fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test3_alt + 2); + fixup.start_off = calc_offset(&fixup, ftr_fixup_test3 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_test3 + 2); + fixup.alt_start_off = calc_offset(&fixup, ftr_fixup_test3_alt); + fixup.alt_end_off = calc_offset(&fixup, ftr_fixup_test3_alt + 2); /* Sanity check */ - check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); + check(memcmp(ftr_fixup_test3, ftr_fixup_test3_orig, size) == 0); /* Expect nothing to be patched, and the error returned to us */ check(patch_feature_section(0xF, &fixup) == 1); - check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); + check(memcmp(ftr_fixup_test3, ftr_fixup_test3_orig, size) == 0); check(patch_feature_section(0, &fixup) == 1); - check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); + check(memcmp(ftr_fixup_test3, ftr_fixup_test3_orig, size) == 0); check(patch_feature_section(~0xF, &fixup) == 1); - check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); + check(memcmp(ftr_fixup_test3, ftr_fixup_test3_orig, size) == 0); } static void test_alternative_case_too_small(void) { - extern unsigned int ftr_fixup_test4; - extern unsigned int end_ftr_fixup_test4; - extern unsigned int ftr_fixup_test4_orig; - extern unsigned int ftr_fixup_test4_alt; - extern unsigned int ftr_fixup_test4_expected; - int size = &end_ftr_fixup_test4 - &ftr_fixup_test4; + extern unsigned int ftr_fixup_test4[]; + extern unsigned int end_ftr_fixup_test4[]; + extern unsigned int ftr_fixup_test4_orig[]; + extern unsigned int ftr_fixup_test4_alt[]; + extern unsigned int ftr_fixup_test4_expected[]; + int size = end_ftr_fixup_test4 - ftr_fixup_test4; unsigned long flag; /* Check a high-bit flag */ flag = 1UL << ((sizeof(unsigned long) - 1) * 8); fixup.value = fixup.mask = flag; - fixup.start_off = calc_offset(&fixup, &ftr_fixup_test4 + 1); - fixup.end_off = calc_offset(&fixup, &ftr_fixup_test4 + 5); - fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test4_alt); - fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test4_alt + 2); + fixup.start_off = calc_offset(&fixup, ftr_fixup_test4 + 1); + fixup.end_off = calc_offset(&fixup, ftr_fixup_test4 + 5); + fixup.alt_start_off = calc_offset(&fixup, ftr_fixup_test4_alt); + fixup.alt_end_off = calc_offset(&fixup, ftr_fixup_test4_alt + 2); /* Sanity check */ - check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0); + check(memcmp(ftr_fixup_test4, ftr_fixup_test4_orig, size) == 0); /* Check we don't patch if the value matches */ patch_feature_section(flag, &fixup); - check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0); + check(memcmp(ftr_fixup_test4, ftr_fixup_test4_orig, size) == 0); /* Check we do patch if the value doesn't match */ patch_feature_section(0, &fixup); - check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_expected, size) == 0); + check(memcmp(ftr_fixup_test4, ftr_fixup_test4_expected, size) == 0); /* Check we do patch if the mask doesn't match */ - memcpy(&ftr_fixup_test4, &ftr_fixup_test4_orig, size); - check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0); + memcpy(ftr_fixup_test4, ftr_fixup_test4_orig, size); + check(memcmp(ftr_fixup_test4, ftr_fixup_test4_orig, size) == 0); patch_feature_section(~flag, &fixup); - check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_expected, size) == 0); + check(memcmp(ftr_fixup_test4, ftr_fixup_test4_expected, size) == 0); } static void test_alternative_case_with_branch(void) { - extern unsigned int ftr_fixup_test5; - extern unsigned int end_ftr_fixup_test5; - extern unsigned int ftr_fixup_test5_expected; - int size = &end_ftr_fixup_test5 - &ftr_fixup_test5; + extern unsigned int ftr_fixup_test5[]; + extern unsigned int end_ftr_fixup_test5[]; + extern unsigned int ftr_fixup_test5_expected[]; + int size = end_ftr_fixup_test5 - ftr_fixup_test5; - check(memcmp(&ftr_fixup_test5, &ftr_fixup_test5_expected, size) == 0); + check(memcmp(ftr_fixup_test5, ftr_fixup_test5_expected, size) == 0); } static void test_alternative_case_with_external_branch(void) { - extern unsigned int ftr_fixup_test6; - extern unsigned int end_ftr_fixup_test6; - extern unsigned int ftr_fixup_test6_expected; - int size = &end_ftr_fixup_test6 - &ftr_fixup_test6; + extern unsigned int ftr_fixup_test6[]; + extern unsigned int end_ftr_fixup_test6[]; + extern unsigned int ftr_fixup_test6_expected[]; + int size = end_ftr_fixup_test6 - ftr_fixup_test6; - check(memcmp(&ftr_fixup_test6, &ftr_fixup_test6_expected, size) == 0); + check(memcmp(ftr_fixup_test6, ftr_fixup_test6_expected, size) == 0); } static void test_cpu_macros(void) { - extern u8 ftr_fixup_test_FTR_macros; - extern u8 ftr_fixup_test_FTR_macros_expected; - unsigned long size = &ftr_fixup_test_FTR_macros_expected - - &ftr_fixup_test_FTR_macros; + extern u8 ftr_fixup_test_FTR_macros[]; + extern u8 ftr_fixup_test_FTR_macros_expected[]; + unsigned long size = ftr_fixup_test_FTR_macros_expected - + ftr_fixup_test_FTR_macros; /* The fixups have already been done for us during boot */ - check(memcmp(&ftr_fixup_test_FTR_macros, - &ftr_fixup_test_FTR_macros_expected, size) == 0); + check(memcmp(ftr_fixup_test_FTR_macros, + ftr_fixup_test_FTR_macros_expected, size) == 0); } static void test_fw_macros(void) { #ifdef CONFIG_PPC64 - extern u8 ftr_fixup_test_FW_FTR_macros; - extern u8 ftr_fixup_test_FW_FTR_macros_expected; - unsigned long size = &ftr_fixup_test_FW_FTR_macros_expected - - &ftr_fixup_test_FW_FTR_macros; + extern u8 ftr_fixup_test_FW_FTR_macros[]; + extern u8 ftr_fixup_test_FW_FTR_macros_expected[]; + unsigned long size = ftr_fixup_test_FW_FTR_macros_expected - + ftr_fixup_test_FW_FTR_macros; /* The fixups have already been done for us during boot */ - check(memcmp(&ftr_fixup_test_FW_FTR_macros, - &ftr_fixup_test_FW_FTR_macros_expected, size) == 0); + check(memcmp(ftr_fixup_test_FW_FTR_macros, + ftr_fixup_test_FW_FTR_macros_expected, size) == 0); #endif } static void test_lwsync_macros(void) { - extern u8 lwsync_fixup_test; - extern u8 end_lwsync_fixup_test; - extern u8 lwsync_fixup_test_expected_LWSYNC; - extern u8 lwsync_fixup_test_expected_SYNC; - unsigned long size = &end_lwsync_fixup_test - - &lwsync_fixup_test; + extern u8 lwsync_fixup_test[]; + extern u8 end_lwsync_fixup_test[]; + extern u8 lwsync_fixup_test_expected_LWSYNC[]; + extern u8 lwsync_fixup_test_expected_SYNC[]; + unsigned long size = end_lwsync_fixup_test - + lwsync_fixup_test; /* The fixups have already been done for us during boot */ if (cur_cpu_spec->cpu_features & CPU_FTR_LWSYNC) { - check(memcmp(&lwsync_fixup_test, - &lwsync_fixup_test_expected_LWSYNC, size) == 0); + check(memcmp(lwsync_fixup_test, + lwsync_fixup_test_expected_LWSYNC, size) == 0); } else { - check(memcmp(&lwsync_fixup_test, - &lwsync_fixup_test_expected_SYNC, size) == 0); + check(memcmp(lwsync_fixup_test, + lwsync_fixup_test_expected_SYNC, size) == 0); } } -- cgit v1.2.3 From dfc2f1f1b3f5361e7a2b2e9cee66b3dac1fef72d Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Tue, 11 Jul 2017 09:23:25 +1000 Subject: include/linux/string.h: add the option of fortified string.h functions This adds support for compiling with a rough equivalent to the glibc _FORTIFY_SOURCE=1 feature, providing compile-time and runtime buffer overflow checks for string.h functions when the compiler determines the size of the source or destination buffer at compile-time. Unlike glibc, it covers buffer reads in addition to writes. GNU C __builtin_*_chk intrinsics are avoided because they would force a much more complex implementation. They aren't designed to detect read overflows and offer no real benefit when using an implementation based on inline checks. Inline checks don't add up to much code size and allow full use of the regular string intrinsics while avoiding the need for a bunch of _chk functions and per-arch assembly to avoid wrapper overhead. This detects various overflows at compile-time in various drivers and some non-x86 core kernel code. There will likely be issues caught in regular use at runtime too. Future improvements left out of initial implementation for simplicity, as it's all quite optional and can be done incrementally: * Some of the fortified string functions (strncpy, strcat), don't yet place a limit on reads from the source based on __builtin_object_size of the source buffer. * Extending coverage to more string functions like strlcat. * It should be possible to optionally use __builtin_object_size(x, 1) for some functions (C strings) to detect intra-object overflows (like glibc's _FORTIFY_SOURCE=2), but for now this takes the conservative approach to avoid likely compatibility issues. * The compile-time checks should be made available via a separate config option which can be enabled by default (or always enabled) once enough time has passed to get the issues it catches fixed. Kees said: : This is great to have. While it was out-of-tree code, it would have : blocked at least CVE-2016-3858 from being exploitable (improper size : argument to strlcpy()). I've sent a number of fixes for : out-of-bounds-reads that this detected upstream already. [keescook@chromium.org: move from -mm, add ARCH_HAS_FORTIFY_SOURCE, tweak Kconfig help] Link: http://lkml.kernel.org/r/20170526095404.20439-1-danielmicay@gmail.com Link: http://lkml.kernel.org/r/1497903987-21002-8-git-send-email-keescook@chromium.org Signed-off-by: Daniel Micay Signed-off-by: Kees Cook Acked-by: Kees Cook Cc: Mark Rutland Cc: Daniel Axtens Cc: Rasmus Villemoes Cc: Andy Shevchenko Cc: Chris Metcalf Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton --- arch/Kconfig | 6 ++ arch/arm64/Kconfig | 1 + arch/arm64/include/asm/string.h | 5 + arch/powerpc/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/boot/compressed/misc.c | 5 + arch/x86/include/asm/string_32.h | 9 ++ arch/x86/include/asm/string_64.h | 7 ++ include/linux/string.h | 200 +++++++++++++++++++++++++++++++++++++++ lib/string.c | 6 ++ security/Kconfig | 7 ++ 11 files changed, 248 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index fb9bd7d36b05..21d0089117fe 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -223,6 +223,12 @@ config GENERIC_SMP_IDLE_THREAD config GENERIC_IDLE_POLL_SETUP bool +config ARCH_HAS_FORTIFY_SOURCE + bool + help + An architecture should select this when it can successfully + build and run with CONFIG_FORTIFY_SOURCE. + # Select if arch has all set_memory_ro/rw/x/nx() functions in asm/cacheflush.h config ARCH_HAS_SET_MEMORY bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8addb851ab5e..dfd908630631 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -12,6 +12,7 @@ config ARM64 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index 2eb714c4639f..d0aa42907569 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -63,6 +63,11 @@ extern int memcmp(const void *, const void *, size_t); #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif + #endif #endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index fce2f4f20891..36f858c37ca7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -125,6 +125,7 @@ config PPC select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_SET_COHERENT_MASK select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3d2b8ce54e00..781521b7cf9e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -50,6 +50,7 @@ config X86 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 00241c815524..a0838ab929f2 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -411,3 +411,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, debug_putstr("done.\nBooting the kernel.\n"); return output; } + +void fortify_panic(const char *name) +{ + error("detected buffer overflow"); +} diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 3d3e8353ee5c..e9ee84873de5 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -142,7 +142,9 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, } #define __HAVE_ARCH_MEMCPY +extern void *memcpy(void *, const void *, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #ifdef CONFIG_X86_USE_3DNOW #include @@ -195,11 +197,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) #endif #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t n); +extern int memcmp(const void *, const void *, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #define memcmp __builtin_memcmp +#endif #define __HAVE_ARCH_MEMCHR extern void *memchr(const void *cs, int c, size_t count); @@ -321,6 +327,8 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, : __memset_generic((s), (c), (count))) #define __HAVE_ARCH_MEMSET +extern void *memset(void *, int, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #if (__GNUC__ >= 4) #define memset(s, c, count) __builtin_memset(s, c, count) #else @@ -330,6 +338,7 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, (count)) \ : __memset((s), (c), (count))) #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ /* * find the first occurrence of byte 'c', or 1 past the area if none diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 1f22bc277c45..2a8c822de1fc 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -31,6 +31,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); +#ifndef CONFIG_FORTIFY_SOURCE #ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ @@ -51,6 +52,7 @@ extern void *__memcpy(void *to, const void *from, size_t len); */ #define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); @@ -77,6 +79,11 @@ int strcmp(const char *cs, const char *ct); #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif + #endif #define __HAVE_ARCH_MEMCPY_MCSAFE 1 diff --git a/include/linux/string.h b/include/linux/string.h index 84306f5989be..5ec391cc5ddd 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -194,4 +194,204 @@ static inline const char *kbasename(const char *path) return tail ? tail + 1 : path; } +#define __FORTIFY_INLINE extern __always_inline __attribute__((gnu_inline)) +#define __RENAME(x) __asm__(#x) + +void fortify_panic(const char *name) __noreturn __cold; +void __read_overflow(void) __compiletime_error("detected read beyond size of object passed as 1st parameter"); +void __read_overflow2(void) __compiletime_error("detected read beyond size of object passed as 2nd parameter"); +void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); + +#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) +__FORTIFY_INLINE char *strcpy(char *p, const char *q) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strcpy(p, q); + if (strscpy(p, q, p_size < q_size ? p_size : q_size) < 0) + fortify_panic(__func__); + return p; +} + +__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __builtin_strncpy(p, q, size); +} + +__FORTIFY_INLINE char *strcat(char *p, const char *q) +{ + size_t p_size = __builtin_object_size(p, 0); + if (p_size == (size_t)-1) + return __builtin_strcat(p, q); + if (strlcat(p, q, p_size) >= p_size) + fortify_panic(__func__); + return p; +} + +__FORTIFY_INLINE __kernel_size_t strlen(const char *p) +{ + __kernel_size_t ret; + size_t p_size = __builtin_object_size(p, 0); + if (p_size == (size_t)-1) + return __builtin_strlen(p); + ret = strnlen(p, p_size); + if (p_size <= ret) + fortify_panic(__func__); + return ret; +} + +extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); +__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) +{ + size_t p_size = __builtin_object_size(p, 0); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); + return ret; +} + +/* defined after fortified strlen to reuse it */ +extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); +__FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) +{ + size_t ret; + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __real_strlcpy(p, q, size); + ret = strlen(q); + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + if (__builtin_constant_p(len) && len >= p_size) + __write_overflow(); + if (len >= p_size) + fortify_panic(__func__); + __builtin_memcpy(p, q, len); + p[len] = '\0'; + } + return ret; +} + +/* defined after fortified strlen and strnlen to reuse them */ +__FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) +{ + size_t p_len, copy_len; + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strncat(p, q, count); + p_len = strlen(p); + copy_len = strnlen(q, count); + if (p_size < p_len + copy_len + 1) + fortify_panic(__func__); + __builtin_memcpy(p + p_len, q, copy_len); + p[p_len + copy_len] = '\0'; + return p; +} + +__FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __builtin_memset(p, c, size); +} + +__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (__builtin_constant_p(size)) { + if (p_size < size) + __write_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __builtin_memcpy(p, q, size); +} + +__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (__builtin_constant_p(size)) { + if (p_size < size) + __write_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __builtin_memmove(p, q, size); +} + +extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); +__FORTIFY_INLINE void *memscan(void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_memscan(p, c, size); +} + +__FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + size_t q_size = __builtin_object_size(q, 0); + if (__builtin_constant_p(size)) { + if (p_size < size) + __read_overflow(); + if (q_size < size) + __read_overflow2(); + } + if (p_size < size || q_size < size) + fortify_panic(__func__); + return __builtin_memcmp(p, q, size); +} + +__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __builtin_memchr(p, c, size); +} + +void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); +__FORTIFY_INLINE void *memchr_inv(const void *p, int c, size_t size) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_memchr_inv(p, c, size); +} + +extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup); +__FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) +{ + size_t p_size = __builtin_object_size(p, 0); + if (__builtin_constant_p(size) && p_size < size) + __read_overflow(); + if (p_size < size) + fortify_panic(__func__); + return __real_kmemdup(p, size, gfp); +} +#endif + #endif /* _LINUX_STRING_H_ */ diff --git a/lib/string.c b/lib/string.c index 1c1fc9187b05..a6ee1955a701 100644 --- a/lib/string.c +++ b/lib/string.c @@ -978,3 +978,9 @@ char *strreplace(char *s, char old, char new) return s; } EXPORT_SYMBOL(strreplace); + +void fortify_panic(const char *name) +{ + panic("detected buffer overflow in %s", name); +} +EXPORT_SYMBOL(fortify_panic); diff --git a/security/Kconfig b/security/Kconfig index d540bfe73190..e8e449444e65 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -163,6 +163,13 @@ config HARDENED_USERCOPY_PAGESPAN been removed. This config is intended to be used only while trying to find such users. +config FORTIFY_SOURCE + bool "Harden common str/mem functions against buffer overflows" + depends on ARCH_HAS_FORTIFY_SOURCE + help + Detect overflows of buffers in common string and memory functions + where the compiler can determine and validate the buffer sizes. + config STATIC_USERMODEHELPER bool "Force all usermode helper calls through a single binary" help -- cgit v1.2.3 From d9ad09b9fadacc150e432bddbbc4a641662ff4e9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 11 Jul 2017 09:23:25 +1000 Subject: x86: fix fortified memcpy __memcpy3d is not available when CONFIG_FORTIFY_SOURCE is enabled, leading to a link error on 32-bit kernels: arch/x86/lib/memcpy_32.c: In function 'memcpy': arch/x86/lib/memcpy_32.c:10:9: error: implicit declaration of function '__memcpy3d'; did you mean '__memcpy'? [-Werror=implicit-function-declaration] This uses the same #ifdef around the use of the function that now protects the definition. Fixes: mmotm ("include/linux/string.h: add the option of fortified string.h functions") Link: http://lkml.kernel.org/r/20170627150047.660360-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Daniel Micay Cc: Kees Cook Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton --- arch/x86/lib/memcpy_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index cad12634d6bd..2eab7d0bfedd 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -6,7 +6,7 @@ __visible void *memcpy(void *to, const void *from, size_t n) { -#ifdef CONFIG_X86_USE_3DNOW +#if defined(CONFIG_X86_USE_3DNOW) && !defined(CONFIG_FORTIFY_SOURCE) return __memcpy3d(to, from, n); #else return __memcpy(to, from, n); -- cgit v1.2.3 From 90fa41502473981b2cb3cd65b59b42e216aa86c6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Jul 2017 09:23:25 +1000 Subject: fortify: avoid panic() in favor of BUG() Since detection of a given fortify failure is sufficient to stop the memory corruption from happening, it doesn't make sense to unconditionally bring down the entire system. Instead, use BUG() which will stop the bad thread of kernel execution (and only optionally panic the system). Link: http://lkml.kernel.org/r/20170626235122.GA25261@beast Signed-off-by: Kees Cook Cc: Daniel Micay Signed-off-by: Andrew Morton --- lib/string.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/string.c b/lib/string.c index a6ee1955a701..ebbb99c775bd 100644 --- a/lib/string.c +++ b/lib/string.c @@ -981,6 +981,7 @@ EXPORT_SYMBOL(strreplace); void fortify_panic(const char *name) { - panic("detected buffer overflow in %s", name); + pr_emerg("detected buffer overflow in %s\n", name); + BUG(); } EXPORT_SYMBOL(fortify_panic); -- cgit v1.2.3 From a48367eaad3e27622d2a52142db0afcc67961eda Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 11 Jul 2017 09:23:26 +1000 Subject: sh: mark end of BUG() implementation as unreachable When building the sh architecture, the compiler doesn't realize that BUG() doesn't return, so it will complain about functions using BUG() that are marked with the noreturn attribute: lib/string.c: In function 'fortify_panic': >> lib/string.c:986:1: warning: 'noreturn' function does return } ^ Link: http://lkml.kernel.org/r/20170627192050.GA66784@beast Signed-off-by: Kees Cook Cc: Yoshinori Sato Cc: Rich Felker Cc: Daniel Micay Signed-off-by: Andrew Morton --- arch/sh/include/asm/bug.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/include/asm/bug.h b/arch/sh/include/asm/bug.h index 1b77f068be2b..c9828f785ca0 100644 --- a/arch/sh/include/asm/bug.h +++ b/arch/sh/include/asm/bug.h @@ -48,6 +48,7 @@ do { \ "i" (__FILE__), \ "i" (__LINE__), "i" (0), \ "i" (sizeof(struct bug_entry))); \ + unreachable(); \ } while (0) #define __WARN_FLAGS(flags) \ -- cgit v1.2.3 From ed68195ac62e389424090ab2cd37072733442010 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 11 Jul 2017 09:23:26 +1000 Subject: random,stackprotect: introduce get_random_canary function Patch series "stackprotector: ascii armor the stack canary", v2. Zero out the first byte of the stack canary value on 64 bit systems, in order to mitigate unterminated C string overflows. The null byte both prevents C string functions from reading the canary, and from writing it if the canary value were guessed or obtained through some other means. Reducing the entropy by 8 bits is acceptable on 64-bit systems, which will still have 56 bits of entropy left, but not on 32 bit systems, so the "ascii armor" canary is only implemented on 64-bit systems. Inspired by the "ascii armor" code in execshield and Daniel Micay's linux-hardened tree. Also see https://github.com/thestinger/linux-hardened/ This patch (of 5): Introduce get_random_canary(), which provides a random unsigned long canary value with the first byte zeroed out on 64 bit architectures, in order to mitigate non-terminated C string overflows. The null byte both prevents C string functions from reading the canary, and from writing it if the canary value were guessed or obtained through some other means. Reducing the entropy by 8 bits is acceptable on 64-bit systems, which will still have 56 bits of entropy left, but not on 32 bit systems, so the "ascii armor" canary is only implemented on 64-bit systems. Inspired by the "ascii armor" code in the old execshield patches, and Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170524155751.424-2-riel@redhat.com Signed-off-by: Rik van Riel Acked-by: Kees Cook Cc: Daniel Micay Cc: "Theodore Ts'o" Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Catalin Marinas Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/random.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/include/linux/random.h b/include/linux/random.h index 4aecc339558d..7c446cb3f91a 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -83,6 +83,27 @@ declare_get_random_var_wait(int) declare_get_random_var_wait(long) #undef declare_get_random_var +/* + * On 64-bit architectures, protect against non-terminated C string overflows + * by zeroing out the first byte of the canary; this leaves 56 bits of entropy. + */ +#ifdef CONFIG_64BIT +# ifdef __LITTLE_ENDIAN +# define CANARY_MASK 0xffffffffffffff00UL +# else /* big endian, 64 bits: */ +# define CANARY_MASK 0x00ffffffffffffffUL +# endif +#else /* 32 bits: */ +# define CANARY_MASK 0xffffffffUL +#endif + +static inline unsigned long get_random_canary(void) +{ + unsigned long val = get_random_long(); + + return val & CANARY_MASK; +} + unsigned long randomize_page(unsigned long start, unsigned long range); u32 prandom_u32(void); -- cgit v1.2.3 From e730121ad70289f6d051041cd45f1dd0d8b3bfde Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 11 Jul 2017 09:23:26 +1000 Subject: fork,random: use get_random_canary() to set tsk->stack_canary Use the ascii-armor canary to prevent unterminated C string overflows from being able to successfully overwrite the canary, even if they somehow obtain the canary value. Inspired by execshield ascii-armor and Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170524155751.424-3-riel@redhat.com Signed-off-by: Rik van Riel Acked-by: Kees Cook Cc: Daniel Micay Cc: "Theodore Ts'o" Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Catalin Marinas Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index ade237a96308..17921b0390b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -554,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_long(); + tsk->stack_canary = get_random_canary(); #endif /* -- cgit v1.2.3 From 224a354df3c300a41e9c051946917bd2ae435fa7 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 11 Jul 2017 09:23:26 +1000 Subject: x86: ascii armor the x86_64 boot init stack canary Use the ascii-armor canary to prevent unterminated C string overflows from being able to successfully overwrite the canary, even if they somehow obtain the canary value. Inspired by execshield ascii-armor and Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170524155751.424-4-riel@redhat.com Signed-off-by: Rik van Riel Acked-by: Kees Cook Cc: Daniel Micay Cc: "Theodore Ts'o" Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Catalin Marinas Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/x86/include/asm/stackprotector.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index dcbd9bcce714..8abedf1d650e 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -74,6 +74,7 @@ static __always_inline void boot_init_stack_canary(void) get_random_bytes(&canary, sizeof(canary)); tsc = rdtsc(); canary += tsc + (tsc << 32UL); + canary &= CANARY_MASK; current->stack_canary = canary; #ifdef CONFIG_X86_64 -- cgit v1.2.3 From e4bd3692fbdef8c36dcb3e3c1ea641380c7f1e13 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 11 Jul 2017 09:23:26 +1000 Subject: arm64: ascii armor the arm64 boot init stack canary Use the ascii-armor canary to prevent unterminated C string overflows from being able to successfully overwrite the canary, even if they somehow obtain the canary value. Inspired by execshield ascii-armor and Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170524155751.424-5-riel@redhat.com Signed-off-by: Rik van Riel Acked-by: Kees Cook Cc: Daniel Micay Cc: "Theodore Ts'o" Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Catalin Marinas Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm64/include/asm/stackprotector.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/stackprotector.h b/arch/arm64/include/asm/stackprotector.h index fe5e287dc56b..b86a0865ddf1 100644 --- a/arch/arm64/include/asm/stackprotector.h +++ b/arch/arm64/include/asm/stackprotector.h @@ -30,6 +30,7 @@ static __always_inline void boot_init_stack_canary(void) /* Try to get a semi random initial value. */ get_random_bytes(&canary, sizeof(canary)); canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; current->stack_canary = canary; __stack_chk_guard = current->stack_canary; -- cgit v1.2.3 From 41bbe7de91ba8a456bf5a3d38ed643e2e392e1b5 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 11 Jul 2017 09:23:26 +1000 Subject: sh64: ascii armor the sh64 boot init stack canary Use the ascii-armor canary to prevent unterminated C string overflows from being able to successfully overwrite the canary, even if they somehow obtain the canary value. Inspired by execshield ascii-armor and Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170524123446.78510066@annuminas.surriel.com Signed-off-by: Rik van Riel Acked-by: Kees Cook Cc: Daniel Micay Cc: "Theodore Ts'o" Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Catalin Marinas Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sh/include/asm/stackprotector.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/include/asm/stackprotector.h b/arch/sh/include/asm/stackprotector.h index d9df3a76847c..141515a43b78 100644 --- a/arch/sh/include/asm/stackprotector.h +++ b/arch/sh/include/asm/stackprotector.h @@ -19,6 +19,7 @@ static __always_inline void boot_init_stack_canary(void) /* Try to get a semi random initial value. */ get_random_bytes(&canary, sizeof(canary)); canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; current->stack_canary = canary; __stack_chk_guard = current->stack_canary; -- cgit v1.2.3 From c6cac2a2618060c2917a273b931a44da977ee402 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 11 Jul 2017 09:23:26 +1000 Subject: x86/mmap: properly account for stack randomization in mmap_base When RLIMIT_STACK is, for example, 256MB, the current code results in a gap between the top of the task and mmap_base of 256MB, failing to take into account the amount by which the stack address was randomized. In other words, the stack gets less than RLIMIT_STACK space. Ensure that the gap between the stack and mmap_base always takes stack randomization and the stack guard gap into account. Obtained from Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170622200033.25714-2-riel@redhat.com Signed-off-by: Daniel Micay Signed-off-by: Rik van Riel Reported-by: Florian Weimer Acked-by: Ingo Molnar Cc: Will Deacon Cc: Daniel Micay Cc: Benjamin Herrenschmidt Cc: Hugh Dickins Cc: Catalin Marinas Signed-off-by: Andrew Morton --- arch/x86/mm/mmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 797295e792b2..229d04a83f85 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -92,13 +92,18 @@ unsigned long arch_mmap_rnd(void) static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) { unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; unsigned long gap_min, gap_max; + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + /* * Top of mmap area (just below the process stack). * Leave an at least ~128 MB hole with possible stack randomization. */ - gap_min = SIZE_128M + stack_maxrandom_size(task_size); + gap_min = SIZE_128M; gap_max = (task_size / 6) * 5; if (gap < gap_min) -- cgit v1.2.3 From 73681b227d2a0b99d386e559de529d4b2169dc3c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 11 Jul 2017 09:23:27 +1000 Subject: arm64/mmap: properly account for stack randomization in mmap_base When RLIMIT_STACK is, for example, 256MB, the current code results in a gap between the top of the task and mmap_base of 256MB, failing to take into account the amount by which the stack address was randomized. In other words, the stack gets less than RLIMIT_STACK space. Ensure that the gap between the stack and mmap_base always takes stack randomization and the stack guard gap into account. Obtained from Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170622200033.25714-3-riel@redhat.com Signed-off-by: Daniel Micay Signed-off-by: Rik van Riel Reported-by: Florian Weimer Cc: Ingo Molnar Cc: Will Deacon Cc: Daniel Micay Cc: Benjamin Herrenschmidt Cc: Hugh Dickins Cc: Catalin Marinas Signed-off-by: Andrew Morton --- arch/arm64/mm/mmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index adc208c2ae9c..decccffb03ca 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -35,7 +35,7 @@ * Leave enough space between the mmap area and the stack to honour ulimit in * the face of randomisation. */ -#define MIN_GAP (SZ_128M + ((STACK_RND_MASK << PAGE_SHIFT) + 1)) +#define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP/6*5) static int mmap_is_legacy(void) @@ -65,6 +65,11 @@ unsigned long arch_mmap_rnd(void) static unsigned long mmap_base(unsigned long rnd) { unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap; + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; if (gap < MIN_GAP) gap = MIN_GAP; -- cgit v1.2.3 From c3d4dcd1a5bcc389c25b51afee5ad8f0073e44b1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 11 Jul 2017 09:23:27 +1000 Subject: powerpc,mmap: properly account for stack randomization in mmap_base When RLIMIT_STACK is, for example, 256MB, the current code results in a gap between the top of the task and mmap_base of 256MB, failing to take into account the amount by which the stack address was randomized. In other words, the stack gets less than RLIMIT_STACK space. Ensure that the gap between the stack and mmap_base always takes stack randomization and the stack guard gap into account. Inspired by Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170622200033.25714-4-riel@redhat.com Signed-off-by: Rik van Riel Reported-by: Florian Weimer Cc: Ingo Molnar Cc: Will Deacon Cc: Daniel Micay Cc: Benjamin Herrenschmidt Cc: Hugh Dickins Cc: Catalin Marinas Signed-off-by: Andrew Morton --- arch/powerpc/mm/mmap.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 0ee6be4f1ba4..5d78b193fec4 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -34,16 +34,9 @@ /* * Top of mmap area (just below the process stack). * - * Leave at least a ~128 MB hole on 32bit applications. - * - * On 64bit applications we randomise the stack by 1GB so we need to - * space our mmap start address by a further 1GB, otherwise there is a - * chance the mmap area will end up closer to the stack than our ulimit - * requires. + * Leave at least a ~128 MB hole. */ -#define MIN_GAP32 (128*1024*1024) -#define MIN_GAP64 ((128 + 1024)*1024*1024UL) -#define MIN_GAP ((is_32bit_task()) ? MIN_GAP32 : MIN_GAP64) +#define MIN_GAP (128*1024*1024) #define MAX_GAP (TASK_SIZE/6*5) static inline int mmap_is_legacy(void) @@ -71,9 +64,26 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } +static inline unsigned long stack_maxrandom_size(void) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + + /* 8MB for 32bit, 1GB for 64bit */ + if (is_32bit_task()) + return (1<<23); + else + return (1<<30); +} + static inline unsigned long mmap_base(unsigned long rnd) { unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long pad = stack_maxrandom_size() + stack_guard_gap; + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; if (gap < MIN_GAP) gap = MIN_GAP; -- cgit v1.2.3 From e713a468650600f40ff89ff925fa9bb63a2a37c8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Jul 2017 09:23:27 +1000 Subject: MIPS: do not use __GFP_REPEAT for order-0 request Patch series "mm: give __GFP_REPEAT a better semantic". The main motivation for the change is that the current implementation of __GFP_REPEAT is not very much useful. The documentation says: * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt * _might_ fail. This depends upon the particular VM implementation. It just fails to mention that this is true only for large (costly) high order which has been the case since the flag was introduced. A similar semantic would be really helpful for smal orders as well, though, because we have places where a failure with a specific fallback error handling is preferred to a potential endless loop inside the page allocator. The earlier cleanup dropped __GFP_REPEAT usage for low (!costly) order users so only those which might use larger orders have stayed. One new user added in the meantime is addressed in patch 1. Let's rename the flag to something more verbose and use it for existing users. Semantic for those will not change. Then implement low (!costly) orders failure path which is hit after the page allocator is about to invoke the oom killer. With that we have a good counterpart for __GFP_NORETRY and finally can tell try as hard as possible without the OOM killer. Xfs code already has an existing annotation for allocations which are allowed to fail and we can trivially map them to the new gfp flag because it will provide the semantic KM_MAYFAIL wants. Christoph didn't consider the new flag really necessary but didn't respond to the OOM killer aspect of the change so I have kept the patch. If this is still seen as not really needed I can drop the patch. kvmalloc will allow also !costly high order allocations to retry hard before falling back to the vmalloc. drm/i915 asked for the new semantic explicitly. Memory migration code, especially for the memory hotplug, should back off rather than invoking the OOM killer as well. This patch (of 6): 3377e227af44 ("MIPS: Add 48-bit VA space (and 4-level page tables) for 4K pages.") has added a new __GFP_REPEAT user but using this flag doesn't really make any sense for order-0 request which is the case here because PUD_ORDER is 0. __GFP_REPEAT has historically effect only on allocation requests with order > PAGE_ALLOC_COSTLY_ORDER. This doesn't introduce any functional change. This is a preparatory patch for later work which renames the flag and redefines its semantic. Link: http://lkml.kernel.org/r/20170623085345.11304-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Alex Belits Cc: David Daney Cc: Ralf Baechle Cc: Johannes Weiner Cc: Mel Gorman Cc: NeilBrown Cc: Christoph Hellwig Cc: Chris Wilson Cc: Darrick J. Wong Signed-off-by: Andrew Morton --- arch/mips/include/asm/pgalloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index a1bdb1ea5234..39b9f311c4ef 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -116,7 +116,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { pud_t *pud; - pud = (pud_t *) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, PUD_ORDER); + pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_ORDER); if (pud) pud_init((unsigned long)pud, (unsigned long)invalid_pmd_table); return pud; -- cgit v1.2.3 From 38dc03261197b9ce6bb1f76fb3b697ec7bfdbe97 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Jul 2017 09:23:27 +1000 Subject: mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_MAYFAIL with more useful semantic __GFP_REPEAT was designed to allow retry-but-eventually-fail semantic to the page allocator. This has been true but only for allocations requests larger than PAGE_ALLOC_COSTLY_ORDER. It has been always ignored for smaller sizes. This is a bit unfortunate because there is no way to express the same semantic for those requests and they are considered too important to fail so they might end up looping in the page allocator for ever, similarly to GFP_NOFAIL requests. Now that the whole tree has been cleaned up and accidental or misled usage of __GFP_REPEAT flag has been removed for !costly requests we can give the original flag a better name and more importantly a more useful semantic. Let's rename it to __GFP_RETRY_MAYFAIL which tells the user that the allocator would try really hard but there is no promise of a success. This will work independent of the order and overrides the default allocator behavior. Page allocator users have several levels of guarantee vs. cost options (take GFP_KERNEL as an example) - GFP_KERNEL & ~__GFP_RECLAIM - optimistic allocation without _any_ attempt to free memory at all. The most light weight mode which even doesn't kick the background reclaim. Should be used carefully because it might deplete the memory and the next user might hit the more aggressive reclaim - GFP_KERNEL & ~__GFP_DIRECT_RECLAIM (or GFP_NOWAIT)- optimistic allocation without any attempt to free memory from the current context but can wake kswapd to reclaim memory if the zone is below the low watermark. Can be used from either atomic contexts or when the request is a performance optimization and there is another fallback for a slow path. - (GFP_KERNEL|__GFP_HIGH) & ~__GFP_DIRECT_RECLAIM (aka GFP_ATOMIC) - non sleeping allocation with an expensive fallback so it can access some portion of memory reserves. Usually used from interrupt/bh context with an expensive slow path fallback. - GFP_KERNEL - both background and direct reclaim are allowed and the _default_ page allocator behavior is used. That means that !costly allocation requests are basically nofail but there is no guarantee of that behavior so failures have to be checked properly by callers (e.g. OOM killer victim is allowed to fail currently). - GFP_KERNEL | __GFP_NORETRY - overrides the default allocator behavior and all allocation requests fail early rather than cause disruptive reclaim (one round of reclaim in this implementation). The OOM killer is not invoked. - GFP_KERNEL | __GFP_RETRY_MAYFAIL - overrides the default allocator behavior and all allocation requests try really hard. The request will fail if the reclaim cannot make any progress. The OOM killer won't be triggered. - GFP_KERNEL | __GFP_NOFAIL - overrides the default allocator behavior and all allocation requests will loop endlessly until they succeed. This might be really dangerous especially for larger orders. Existing users of __GFP_REPEAT are changed to __GFP_RETRY_MAYFAIL because they already had their semantic. No new users are added. __alloc_pages_slowpath is changed to bail out for __GFP_RETRY_MAYFAIL if there is no progress and we have already passed the OOM point. This means that all the reclaim opportunities have been exhausted except the most disruptive one (the OOM killer) and a user defined fallback behavior is more sensible than keep retrying in the page allocator. Link: http://lkml.kernel.org/r/20170623085345.11304-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Alex Belits Cc: Chris Wilson Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Daney Cc: Johannes Weiner Cc: Mel Gorman Cc: NeilBrown Cc: Ralf Baechle Signed-off-by: Andrew Morton --- Documentation/DMA-ISA-LPC.txt | 2 +- arch/powerpc/include/asm/book3s/64/pgalloc.h | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- drivers/mmc/host/wbsd.c | 2 +- drivers/s390/char/vmcp.c | 2 +- drivers/target/target_core_transport.c | 2 +- drivers/vhost/net.c | 2 +- drivers/vhost/scsi.c | 2 +- drivers/vhost/vsock.c | 2 +- include/linux/gfp.h | 55 +++++++++++++++++++++------- include/linux/slab.h | 3 +- include/trace/events/mmflags.h | 2 +- mm/hugetlb.c | 4 +- mm/internal.h | 2 +- mm/page_alloc.c | 14 +++++-- mm/sparse-vmemmap.c | 4 +- mm/util.c | 6 +-- mm/vmalloc.c | 2 +- mm/vmscan.c | 8 ++-- net/core/dev.c | 6 +-- net/core/skbuff.c | 2 +- net/sched/sch_fq.c | 2 +- tools/perf/builtin-kmem.c | 2 +- 23 files changed, 84 insertions(+), 46 deletions(-) diff --git a/Documentation/DMA-ISA-LPC.txt b/Documentation/DMA-ISA-LPC.txt index c41331398752..7a065ac4a9d1 100644 --- a/Documentation/DMA-ISA-LPC.txt +++ b/Documentation/DMA-ISA-LPC.txt @@ -42,7 +42,7 @@ requirements you pass the flag GFP_DMA to kmalloc. Unfortunately the memory available for ISA DMA is scarce so unless you allocate the memory during boot-up it's a good idea to also pass -__GFP_REPEAT and __GFP_NOWARN to make the allocator try a bit harder. +__GFP_RETRY_MAYFAIL and __GFP_NOWARN to make the allocator try a bit harder. (This scarcity also means that you should allocate the buffer as early as possible and not release it until the driver is unloaded.) diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 20b1485ff1e8..e2329db9d6f4 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -56,7 +56,7 @@ static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm) return (pgd_t *)__get_free_page(pgtable_gfp_flags(mm, PGALLOC_GFP)); #else struct page *page; - page = alloc_pages(pgtable_gfp_flags(mm, PGALLOC_GFP | __GFP_REPEAT), + page = alloc_pages(pgtable_gfp_flags(mm, PGALLOC_GFP | __GFP_RETRY_MAYFAIL), 4); if (!page) return NULL; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 710e491206ed..8cb0190e2a73 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -93,7 +93,7 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) } if (!hpt) - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL |__GFP_NOWARN, order - PAGE_SHIFT); if (!hpt) diff --git a/drivers/mmc/host/wbsd.c b/drivers/mmc/host/wbsd.c index e15a9733fcfd..9668616faf16 100644 --- a/drivers/mmc/host/wbsd.c +++ b/drivers/mmc/host/wbsd.c @@ -1386,7 +1386,7 @@ static void wbsd_request_dma(struct wbsd_host *host, int dma) * order for ISA to be able to DMA to it. */ host->dma_buffer = kmalloc(WBSD_DMA_SIZE, - GFP_NOIO | GFP_DMA | __GFP_REPEAT | __GFP_NOWARN); + GFP_NOIO | GFP_DMA | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (!host->dma_buffer) goto free; diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 65f5a794f26d..98749fa817da 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -98,7 +98,7 @@ vmcp_write(struct file *file, const char __user *buff, size_t count, } if (!session->response) session->response = (char *)__get_free_pages(GFP_KERNEL - | __GFP_REPEAT | GFP_DMA, + | __GFP_RETRY_MAYFAIL | GFP_DMA, get_order(session->bufsize)); if (!session->response) { mutex_unlock(&session->mutex); diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 8735243edf04..97fed9a298bd 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -252,7 +252,7 @@ int transport_alloc_session_tags(struct se_session *se_sess, int rc; se_sess->sess_cmd_map = kzalloc(tag_num * tag_size, - GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + GFP_KERNEL | __GFP_NOWARN | __GFP_RETRY_MAYFAIL); if (!se_sess->sess_cmd_map) { se_sess->sess_cmd_map = vzalloc(tag_num * tag_size); if (!se_sess->sess_cmd_map) { diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e3d7ea1288c6..06d044862e58 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -897,7 +897,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) struct sk_buff **queue; int i; - n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_REPEAT); + n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!n) return -ENOMEM; vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL); diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 679f8960db4b..046f6d280af5 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1399,7 +1399,7 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) struct vhost_virtqueue **vqs; int r = -ENOMEM, i; - vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | __GFP_RETRY_MAYFAIL); if (!vs) { vs = vzalloc(sizeof(*vs)); if (!vs) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 3f63e03de8e8..c9de9c41aa97 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -508,7 +508,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) /* This struct is large and allocation could fail, fall back to vmalloc * if there is no other way. */ - vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_REPEAT); + vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!vsock) return -ENOMEM; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4c6656f1fee7..6be1f836b69e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -25,7 +25,7 @@ struct vm_area_struct; #define ___GFP_FS 0x80u #define ___GFP_COLD 0x100u #define ___GFP_NOWARN 0x200u -#define ___GFP_REPEAT 0x400u +#define ___GFP_RETRY_MAYFAIL 0x400u #define ___GFP_NOFAIL 0x800u #define ___GFP_NORETRY 0x1000u #define ___GFP_MEMALLOC 0x2000u @@ -136,26 +136,55 @@ struct vm_area_struct; * * __GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. * - * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt - * _might_ fail. This depends upon the particular VM implementation. + * The default allocator behavior depends on the request size. We have a concept + * of so called costly allocations (with order > PAGE_ALLOC_COSTLY_ORDER). + * !costly allocations are too essential to fail so they are implicitly + * non-failing (with some exceptions like OOM victims might fail) by default while + * costly requests try to be not disruptive and back off even without invoking + * the OOM killer. The following three modifiers might be used to override some of + * these implicit rules + * + * __GFP_NORETRY: The VM implementation will try only very lightweight + * memory direct reclaim to get some memory under memory pressure (thus + * it can sleep). It will avoid disruptive actions like OOM killer. The + * caller must handle the failure which is quite likely to happen under + * heavy memory pressure. The flag is suitable when failure can easily be + * handled at small cost, such as reduced throughput + * + * __GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim + * procedures that have previously failed if there is some indication + * that progress has been made else where. It can wait for other + * tasks to attempt high level approaches to freeing memory such as + * compaction (which removes fragmentation) and page-out. + * There is still a definite limit to the number of retries, but it is + * a larger limit than with __GFP_NORERY. + * Allocations with this flag may fail, but only when there is + * genuinely little unused memory. While these allocations do not + * directly trigger the OOM killer, their failure indicates that + * the system is likely to need to use the OOM killer soon. The + * caller must handle failure, but can reasonably do so by failing + * a higher-level request, or completing it only in a much less + * efficient manner. + * If the allocation does fail, and the caller is in a position to + * free some non-essential memory, doing so could benefit the system + * as a whole. * * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller - * cannot handle allocation failures. New users should be evaluated carefully - * (and the flag should be used only when there is no reasonable failure - * policy) but it is definitely preferable to use the flag rather than - * opencode endless loop around allocator. - * - * __GFP_NORETRY: The VM implementation must not retry indefinitely and will - * return NULL when direct reclaim and memory compaction have failed to allow - * the allocation to succeed. The OOM killer is not called with the current - * implementation. + * cannot handle allocation failures. The allocation could block + * indefinitely but will never return with failure. Testing for + * failure is pointless. + * New users should be evaluated carefully (and the flag should be + * used only when there is no reasonable failure policy) but it is + * definitely preferable to use the flag rather than opencode endless + * loop around allocator. + * Using this flag for costly allocations is _highly_ discouraged. */ #define __GFP_IO ((__force gfp_t)___GFP_IO) #define __GFP_FS ((__force gfp_t)___GFP_FS) #define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ #define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ #define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) -#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) +#define __GFP_RETRY_MAYFAIL ((__force gfp_t)___GFP_RETRY_MAYFAIL) #define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) #define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) diff --git a/include/linux/slab.h b/include/linux/slab.h index 04a7f7993e67..41473df6dfb0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -471,7 +471,8 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * * %__GFP_NOWARN - If allocation fails, don't issue any warnings. * - * %__GFP_REPEAT - If allocation fails initially, try once more before failing. + * %__GFP_RETRY_MAYFAIL - Try really hard to succeed the allocation but fail + * eventually. * * There are other flags available as well, but these are not intended * for general use, and so are not documented here. For a full list of diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 10e3663a75a6..8e50d01c645f 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -34,7 +34,7 @@ {(unsigned long)__GFP_FS, "__GFP_FS"}, \ {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ - {(unsigned long)__GFP_REPEAT, "__GFP_REPEAT"}, \ + {(unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \ {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ {(unsigned long)__GFP_NORETRY, "__GFP_NORETRY"}, \ {(unsigned long)__GFP_COMP, "__GFP_COMP"}, \ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1e516520433d..bc48ee783dd9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1384,7 +1384,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) page = __alloc_pages_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_REPEAT|__GFP_NOWARN, + __GFP_RETRY_MAYFAIL|__GFP_NOWARN, huge_page_order(h)); if (page) { prep_new_huge_page(h, page, nid); @@ -1525,7 +1525,7 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, { int order = huge_page_order(h); - gfp_mask |= __GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; + gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_nodemask(gfp_mask, order, nid, nmask); diff --git a/mm/internal.h b/mm/internal.h index 0e4f558412fb..24d88f084705 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -23,7 +23,7 @@ * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ - __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ + __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_ATOMIC) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5c533b4867d4..80e4adb4c360 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3284,6 +3284,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; + /* + * We have already exhausted all our reclaim opportunities without any + * success so it is time to admit defeat. We will skip the OOM killer + * because it is very likely that the caller has a more reasonable + * fallback than shooting a random task. + */ + if (gfp_mask & __GFP_RETRY_MAYFAIL) + goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; @@ -3413,7 +3421,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, } /* - * !costly requests are much more important than __GFP_REPEAT + * !costly requests are much more important than __GFP_RETRY_MAYFAIL * costly ones because they are de facto nofail and invoke OOM * killer to move on while costly can fail and users are ready * to cope with that. 1/4 retries is rather arbitrary but we @@ -3920,9 +3928,9 @@ retry: /* * Do not retry costly high order allocations unless they are - * __GFP_REPEAT + * __GFP_RETRY_MAYFAIL */ - if (costly_order && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a56c3989f773..c50b1a14d55e 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -56,11 +56,11 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) if (node_state(node, N_HIGH_MEMORY)) page = alloc_pages_node( - node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, get_order(size)); else page = alloc_pages( - GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, get_order(size)); if (page) return page_address(page); diff --git a/mm/util.c b/mm/util.c index 21ddf90f883d..2f06100e1003 100644 --- a/mm/util.c +++ b/mm/util.c @@ -363,7 +363,7 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT + * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_RETRY_MAYFAIL * is supported only for large (>32kB) allocations, and it should be used only if * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks. * @@ -391,11 +391,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) kmalloc_flags |= __GFP_NOWARN; /* - * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly + * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY for !costly * requests because there is no other way to tell the allocator * that we want to fail rather than retry endlessly. */ - if (!(kmalloc_flags & __GFP_REPEAT) || + if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL) || (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) kmalloc_flags |= __GFP_NORETRY; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6016ab079e2b..8698c1c86c4d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1795,7 +1795,7 @@ fail: * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. * - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported * * Any use of gfp flags outside of GFP_KERNEL should be consulted diff --git a/mm/vmscan.c b/mm/vmscan.c index f84cdd3751e1..efc9da21c5e6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2506,18 +2506,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return false; /* Consider stopping depending on scan and reclaim activity */ - if (sc->gfp_mask & __GFP_REPEAT) { + if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { /* - * For __GFP_REPEAT allocations, stop reclaiming if the + * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the * full LRU list has been scanned and we are still failing * to reclaim pages. This full LRU scan is potentially - * expensive but a __GFP_REPEAT caller really wants to succeed + * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed */ if (!nr_reclaimed && !nr_scanned) return false; } else { /* - * For non-__GFP_REPEAT allocations which can presumably + * For non-__GFP_RETRY_MAYFAIL allocations which can presumably * fail without consequence, stop if we failed to reclaim * any pages from the last SWAP_CLUSTER_MAX number of * pages that were scanned. This will return to the diff --git a/net/core/dev.c b/net/core/dev.c index 02440518dd69..8515f8fe0460 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7384,7 +7384,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - rx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; @@ -7424,7 +7424,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; @@ -7965,7 +7965,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_REPEAT); + p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!p) return NULL; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8b11341ed69a..f990eb8b30a9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4747,7 +4747,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, gfp_head = gfp_mask; if (gfp_head & __GFP_DIRECT_RECLAIM) - gfp_head |= __GFP_REPEAT; + gfp_head |= __GFP_RETRY_MAYFAIL; *errcode = -ENOBUFS; skb = alloc_skb(header_len, gfp_head); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 147fde73a0f5..263d16e3219e 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -648,7 +648,7 @@ static int fq_resize(struct Qdisc *sch, u32 log) return 0; /* If XPS was setup, we can allocate memory on right NUMA node */ - array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_REPEAT, + array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL, netdev_queue_numa_node_read(sch->dev_queue)); if (!array) return -ENOMEM; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 0a8a1c45af87..a1497c516d85 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -643,7 +643,7 @@ static const struct { { "__GFP_FS", "F" }, { "__GFP_COLD", "CO" }, { "__GFP_NOWARN", "NWR" }, - { "__GFP_REPEAT", "R" }, + { "__GFP_RETRY_MAYFAIL", "R" }, { "__GFP_NOFAIL", "NF" }, { "__GFP_NORETRY", "NR" }, { "__GFP_COMP", "C" }, -- cgit v1.2.3 From 916a635de172b3ab5e0e89abae2289498d6b7bbe Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 11 Jul 2017 09:23:27 +1000 Subject: mm-tree-wide-replace-__gfp_repeat-by-__gfp_retry_mayfail-with-more-useful-semantic-fix fix arch/sparc/kernel/mdesc.c Cc: Michal Hocko Cc: Stephen Rothwell Cc: David Miller Cc: Jag Raman Signed-off-by: Andrew Morton --- arch/sparc/kernel/mdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index e4b4e790bf89..fa466ce45bc9 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -205,7 +205,7 @@ static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size) handle_size = (sizeof(struct mdesc_handle) - sizeof(struct mdesc_hdr) + mdesc_size); - base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_REPEAT); + base = kmalloc(handle_size + 15, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!base) return NULL; -- cgit v1.2.3 From 5223c66e6e91c33c8edc2e8a395126bdd9ecaff5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Jul 2017 09:23:27 +1000 Subject: mm-tree-wide-replace-__gfp_repeat-by-__gfp_retry_mayfail-with-more-useful-semantic-fix - GFP_KERNEL - both background and direct reclaim are allowed and the _default_ page allocator behavior is used. That means that !costly allocation requests are basically nofail but there is no guarantee of that behavior so failures have to be checked properly by callers (e.g. OOM killer victim is allowed to fail currently). Link: http://lkml.kernel.org/r/20170626123847.GM11534@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Mel Gorman Cc: NeilBrown Signed-off-by: Andrew Morton --- include/linux/gfp.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6be1f836b69e..56bcb147910e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -25,7 +25,7 @@ struct vm_area_struct; #define ___GFP_FS 0x80u #define ___GFP_COLD 0x100u #define ___GFP_NOWARN 0x200u -#define ___GFP_RETRY_MAYFAIL 0x400u +#define ___GFP_RETRY_MAYFAIL 0x400u #define ___GFP_NOFAIL 0x800u #define ___GFP_NORETRY 0x1000u #define ___GFP_MEMALLOC 0x2000u @@ -139,10 +139,11 @@ struct vm_area_struct; * The default allocator behavior depends on the request size. We have a concept * of so called costly allocations (with order > PAGE_ALLOC_COSTLY_ORDER). * !costly allocations are too essential to fail so they are implicitly - * non-failing (with some exceptions like OOM victims might fail) by default while - * costly requests try to be not disruptive and back off even without invoking - * the OOM killer. The following three modifiers might be used to override some of - * these implicit rules + * non-failing by default (with some exceptions like OOM victims might fail so + * the caller still has to check for failures) while costly requests try to be + * not disruptive and back off even without invoking the OOM killer. + * The following three modifiers might be used to override some of these + * implicit rules * * __GFP_NORETRY: The VM implementation will try only very lightweight * memory direct reclaim to get some memory under memory pressure (thus -- cgit v1.2.3 From bcc8f905208cd9f2971a72e7989ba3b5c9f656c6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Jul 2017 09:23:27 +1000 Subject: mm-tree-wide-replace-__gfp_repeat-by-__gfp_retry_mayfail-with-more-useful-semantic-fix-3 Forgot to address other thing spotted by Vlastimil. Link: http://lkml.kernel.org/r/20170626124233.GN11534@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Mel Gorman Cc: NeilBrown Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 56bcb147910e..bcfb9f7c46f5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -158,7 +158,7 @@ struct vm_area_struct; * tasks to attempt high level approaches to freeing memory such as * compaction (which removes fragmentation) and page-out. * There is still a definite limit to the number of retries, but it is - * a larger limit than with __GFP_NORERY. + * a larger limit than with __GFP_NORETRY. * Allocations with this flag may fail, but only when there is * genuinely little unused memory. While these allocations do not * directly trigger the OOM killer, their failure indicates that -- cgit v1.2.3 From 9caf77ed1a6c5f1bfe4c4e317617de970c8f63bb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Jul 2017 09:23:28 +1000 Subject: xfs: map KM_MAYFAIL to __GFP_RETRY_MAYFAIL KM_MAYFAIL didn't have any suitable GFP_FOO counterpart until recently so it relied on the default page allocator behavior for the given set of flags. This means that small allocations actually never failed. Now that we have __GFP_RETRY_MAYFAIL flag which works independently on the allocation request size we can map KM_MAYFAIL to it. The allocator will try as hard as it can to fulfill the request but fails eventually if the progress cannot be made. It does so without triggering the OOM killer which can be seen as an improvement because KM_MAYFAIL users should be able to deal with allocation failures. Link: http://lkml.kernel.org/r/20170623085345.11304-4-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Darrick J. Wong Cc: Christoph Hellwig Cc: Alex Belits Cc: Chris Wilson Cc: David Daney Cc: Johannes Weiner Cc: Mel Gorman Cc: NeilBrown Cc: Ralf Baechle Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/xfs/kmem.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index d6ea520162b2..4d85992d75b2 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -54,6 +54,16 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags &= ~__GFP_FS; } + /* + * Default page/slab allocator behavior is to retry for ever + * for small allocations. We can override this behavior by using + * __GFP_RETRY_MAYFAIL which will tell the allocator to retry as long + * as it is feasible but rather fail than retry forever for all + * request sizes. + */ + if (flags & KM_MAYFAIL) + lflags |= __GFP_RETRY_MAYFAIL; + if (flags & KM_ZERO) lflags |= __GFP_ZERO; -- cgit v1.2.3 From b2da43403a3cfbc04b1b5cba58fcff5013a7f905 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Jul 2017 09:23:28 +1000 Subject: mm: kvmalloc support __GFP_RETRY_MAYFAIL for all sizes Now that __GFP_RETRY_MAYFAIL has a reasonable semantic regardless of the request size we can drop the hackish implementation for !costly orders. __GFP_RETRY_MAYFAIL retries as long as the reclaim makes a forward progress and backs of when we are out of memory for the requested size. Therefore we do not need to enforce__GFP_NORETRY for !costly orders just to silent the oom killer anymore. Link: http://lkml.kernel.org/r/20170623085345.11304-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Alex Belits Cc: Chris Wilson Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Daney Cc: Johannes Weiner Cc: Mel Gorman Cc: NeilBrown Cc: Ralf Baechle Signed-off-by: Andrew Morton --- mm/util.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/mm/util.c b/mm/util.c index 2f06100e1003..7b07ec852e01 100644 --- a/mm/util.c +++ b/mm/util.c @@ -363,9 +363,9 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_RETRY_MAYFAIL - * is supported only for large (>32kB) allocations, and it should be used only if - * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks. + * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. * * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. */ @@ -390,13 +390,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (size > PAGE_SIZE) { kmalloc_flags |= __GFP_NOWARN; - /* - * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY for !costly - * requests because there is no other way to tell the allocator - * that we want to fail rather than retry endlessly. - */ - if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL) || - (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; } -- cgit v1.2.3 From f2410eeaa6c9a91e01f74661f171a23329d02b99 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Jul 2017 09:23:28 +1000 Subject: drm/i915: use __GFP_RETRY_MAYFAIL 24f8e00a8a2e ("drm/i915: Prefer to report ENOMEM rather than incur the oom for gfx allocations") has tried to remove disruptive OOM killer because the userspace should be able to cope with allocation failures. At the time only __GFP_NORETRY could achieve that and it turned out that this would fail the allocations just too easily. So "drm/i915: Remove __GFP_NORETRY from our buffer allocator" removed it and hoped for a better solution. __GFP_RETRY_MAYFAIL is that solution. It will keep retrying the allocation until there is no more progress and we would go OOM. Instead we fail the allocation and let the caller to deal with it. Link: http://lkml.kernel.org/r/20170623085345.11304-6-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Chris Wilson Cc: Alex Belits Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Daney Cc: Johannes Weiner Cc: Mel Gorman Cc: NeilBrown Cc: Ralf Baechle Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/i915_gem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 7dcac3bfb771..969bac8404f1 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2434,8 +2434,9 @@ rebuild_st: * again with !__GFP_NORETRY. However, we still * want to fail this allocation rather than * trigger the out-of-memory killer and for - * this we want the future __GFP_MAYFAIL. + * this we want __GFP_RETRY_MAYFAIL. */ + gfp |= __GFP_RETRY_MAYFAIL; } } while (1); -- cgit v1.2.3 From 67298145e9a2dacb7beea65d415c83dbe5e8b31f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Jul 2017 09:23:28 +1000 Subject: mm, migration: do not trigger OOM killer when migrating memory Page migration (for memory hotplug, soft_offline_page or mbind) needs to allocate a new memory. This can trigger an oom killer if the target memory is depleated. Although quite unlikely, still possible, especially for the memory hotplug (offlining of memoery). Up to now we didn't really have reasonable means to back off. __GFP_NORETRY can fail just too easily and __GFP_THISNODE sticks to a single node and that is not suitable for all callers. But now that we have __GFP_RETRY_MAYFAIL we should use it. It is preferable to fail the migration than disrupt the system by killing some processes. Link: http://lkml.kernel.org/r/20170623085345.11304-7-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Alex Belits Cc: Chris Wilson Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Daney Cc: Johannes Weiner Cc: Mel Gorman Cc: NeilBrown Cc: Ralf Baechle Signed-off-by: Andrew Morton --- include/linux/migrate.h | 2 +- mm/mempolicy.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 4634da521238..3e0d405dc842 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -34,7 +34,7 @@ extern char *migrate_reason_names[MR_TYPES]; static inline struct page *new_page_nodemask(struct page *page, int preferred_nid, nodemask_t *nodemask) { - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; if (PageHuge(page)) return alloc_huge_page_nodemask(page_hstate(compound_head(page)), diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7d8e56214ac0..d911fa5cb2a7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1078,7 +1078,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) /* * if !vma, alloc_page_vma() will use task or system default policy */ - return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL, + vma, address); } #else -- cgit v1.2.3 From 13aec2e0b7507b4d4d976037f02f26902e80ef95 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:28 +1000 Subject: checkpatch: improve the STORAGE_CLASS test Make sure static, extern, and asmlinkage appear before a specific type. e.g.: int asmlinkage foo(void) is better written asmlinkage int foo(void) Link: http://lkml.kernel.org/r/31704c96df2d5fd9df0b41165940a7a4feb16a63.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8f940c09918f..2287a0bca863 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5576,10 +5576,18 @@ sub process { "architecture specific defines should be avoided\n" . $herecurr); } +# check that the storage class is not after a type + if ($line =~ /\b($Type)\s+($Storage)\b/) { + WARN("STORAGE_CLASS", + "storage class '$2' should be located before type '$1'\n" . $herecurr); + } # Check that the storage class is at the beginning of a declaration - if ($line =~ /\b$Storage\b/ && $line !~ /^.\s*$Storage\b/) { + if ($line =~ /\b$Storage\b/ && + $line !~ /^.\s*$Storage/ && + $line =~ /^.\s*(.+?)\$Storage\s/ && + $1 !~ /[\,\)]\s*$/) { WARN("STORAGE_CLASS", - "storage class should be at the beginning of the declaration\n" . $herecurr) + "storage class should be at the beginning of the declaration\n" . $herecurr); } # check the location of the inline attribute, that it is between -- cgit v1.2.3 From 30bb404945c577cd4fa92f3bee6d0d070c2e8b8e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:28 +1000 Subject: ARM: KVM: move asmlinkage before type asmlinkage is either 'extern "C"' or blank. Move the uses of asmlinkage before the return types to be similar to the rest of the kernel. Link: http://lkml.kernel.org/r/005b8e120650c6a13b541e420f4e3605603fe9e6.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Christoffer Dall Cc: Marc Zyngier Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Russell King Signed-off-by: Andrew Morton --- arch/arm/include/asm/kvm_hyp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index 58508900c4bb..14b5903f0224 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -110,8 +110,8 @@ void __sysreg_restore_state(struct kvm_cpu_context *ctxt); void __vgic_v3_save_state(struct kvm_vcpu *vcpu); void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); -void asmlinkage __vfp_save_state(struct vfp_hard_struct *vfp); -void asmlinkage __vfp_restore_state(struct vfp_hard_struct *vfp); +asmlinkage void __vfp_save_state(struct vfp_hard_struct *vfp); +asmlinkage void __vfp_restore_state(struct vfp_hard_struct *vfp); static inline bool __vfp_enabled(void) { return !(read_sysreg(HCPTR) & (HCPTR_TCP(11) | HCPTR_TCP(10))); @@ -120,8 +120,8 @@ static inline bool __vfp_enabled(void) void __hyp_text __banked_save_state(struct kvm_cpu_context *ctxt); void __hyp_text __banked_restore_state(struct kvm_cpu_context *ctxt); -int asmlinkage __guest_enter(struct kvm_vcpu *vcpu, +asmlinkage int __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host); -int asmlinkage __hyp_do_panic(const char *, int, u32); +asmlinkage int __hyp_do_panic(const char *, int, u32); #endif /* __ARM_KVM_HYP_H__ */ -- cgit v1.2.3 From 26893f317d55d66c1d55a71ab911f538766f6c96 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:28 +1000 Subject: ARM: HP Jornada 7XX: move inline before return type Convert 'u8 inline' to 'inline u8' to be the same style used by the rest of the kernel. Miscellanea: jornada_ssp_reverse is an odd function. It is declared inline but is also EXPORT_SYMBOL. It is also apparently only used by jornada720_ssp.c Likely the EXPORT_SYMBOL could be removed and the function converted to static. The addition of static and removal of EXPORT_SYMBOL was not done. Link: http://lkml.kernel.org/r/5bd3b2bf39c6c9caf773949f18158f8f5ec08582.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Russell King Signed-off-by: Andrew Morton --- arch/arm/mach-sa1100/jornada720_ssp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-sa1100/jornada720_ssp.c b/arch/arm/mach-sa1100/jornada720_ssp.c index b143c4659346..7fc11a3c17b4 100644 --- a/arch/arm/mach-sa1100/jornada720_ssp.c +++ b/arch/arm/mach-sa1100/jornada720_ssp.c @@ -33,7 +33,7 @@ static unsigned long jornada_ssp_flags; * we need to reverse all data we receive from the mcu due to its physical location * returns : 01110111 -> 11101110 */ -u8 inline jornada_ssp_reverse(u8 byte) +inline u8 jornada_ssp_reverse(u8 byte) { return ((0x80 & byte) >> 7) | -- cgit v1.2.3 From f909b519192a473762324395a352d8681922a775 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:28 +1000 Subject: CRIS: gpio: move inline before return type Move inline to be like the rest of the kernel. Link: http://lkml.kernel.org/r/6bf1bec049897c4158f698b866810f47c728f233.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Mikael Starvik Cc: Jesper Nilsson Signed-off-by: Andrew Morton --- arch/cris/arch-v10/drivers/gpio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/cris/arch-v10/drivers/gpio.c b/arch/cris/arch-v10/drivers/gpio.c index 64285e0d3481..dfd3b3ba5e4e 100644 --- a/arch/cris/arch-v10/drivers/gpio.c +++ b/arch/cris/arch-v10/drivers/gpio.c @@ -399,7 +399,7 @@ out: /* Main device API. ioctl's to read/set/clear bits, as well as to * set alarms to wait for using a subsequent select(). */ -unsigned long inline setget_input(struct gpio_private *priv, unsigned long arg) +inline unsigned long setget_input(struct gpio_private *priv, unsigned long arg) { /* Set direction 0=unchanged 1=input, * return mask with 1=input */ @@ -450,7 +450,7 @@ unsigned long inline setget_input(struct gpio_private *priv, unsigned long arg) return dir_g_in_bits; } /* setget_input */ -unsigned long inline setget_output(struct gpio_private *priv, unsigned long arg) +inline unsigned long setget_output(struct gpio_private *priv, unsigned long arg) { if (USE_PORTS(priv)) { *priv->dir = *priv->dir_shadow |= -- cgit v1.2.3 From 2d95c5da0460f6781aad8cb847677c7b94af4162 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:29 +1000 Subject: FRV: tlbflush: move asmlinkage before return type Make the use of asmlinkage like the rest of the kernel. Link: http://lkml.kernel.org/r/efb2dfed4d9315bf68ec0334c81b65af176a0174.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: David Howells Signed-off-by: Andrew Morton --- arch/frv/include/asm/tlbflush.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/frv/include/asm/tlbflush.h b/arch/frv/include/asm/tlbflush.h index 7ac5eafc5d98..75879420f578 100644 --- a/arch/frv/include/asm/tlbflush.h +++ b/arch/frv/include/asm/tlbflush.h @@ -18,10 +18,10 @@ #ifdef CONFIG_MMU #ifndef __ASSEMBLY__ -extern void asmlinkage __flush_tlb_all(void); -extern void asmlinkage __flush_tlb_mm(unsigned long contextid); -extern void asmlinkage __flush_tlb_page(unsigned long contextid, unsigned long start); -extern void asmlinkage __flush_tlb_range(unsigned long contextid, +extern asmlinkage void __flush_tlb_all(void); +extern asmlinkage void __flush_tlb_mm(unsigned long contextid); +extern asmlinkage void __flush_tlb_page(unsigned long contextid, unsigned long start); +extern asmlinkage void __flush_tlb_range(unsigned long contextid, unsigned long start, unsigned long end); #endif /* !__ASSEMBLY__ */ -- cgit v1.2.3 From b8454a5f9322a9a52176217a9fdda09ef19f0f31 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:29 +1000 Subject: ia64: move inline before return type Make the use of inline like the rest of the kernel. Link: http://lkml.kernel.org/r/d47074493af80ce12590340294bc49618165c30d.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Tony Luck Cc: Fenghua Yu Signed-off-by: Andrew Morton --- arch/ia64/kernel/mca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 79c7c46d7dc1..555b11180156 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -334,7 +334,7 @@ static void ia64_mlogbuf_dump_from_init(void) ia64_mlogbuf_dump(); } -static void inline +static inline void ia64_mca_spin(const char *func) { if (monarch_cpu == smp_processor_id()) -- cgit v1.2.3 From ca6413fddac6a64a079ab7b28a922dda01fe88a0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:29 +1000 Subject: ia64: sn: pci: move inline before type Make the use of inline like the rest of the kernel. Link: http://lkml.kernel.org/r/f42b2202bd0d4e7ccf79ce5348bb255a035e67bb.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Tony Luck Cc: Fenghua Yu Signed-off-by: Andrew Morton --- arch/ia64/sn/pci/pcibr/pcibr_ate.c | 2 +- arch/ia64/sn/pci/tioce_provider.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/ia64/sn/pci/pcibr/pcibr_ate.c b/arch/ia64/sn/pci/pcibr/pcibr_ate.c index 5bc34eac9e01..b67bb4cb73ff 100644 --- a/arch/ia64/sn/pci/pcibr/pcibr_ate.c +++ b/arch/ia64/sn/pci/pcibr/pcibr_ate.c @@ -140,7 +140,7 @@ static inline u64 __iomem *pcibr_ate_addr(struct pcibus_info *pcibus_info, /* * Update the ate. */ -void inline +inline void ate_write(struct pcibus_info *pcibus_info, int ate_index, int count, volatile u64 ate) { diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c index 46d3df4b03a1..3bd9abc35485 100644 --- a/arch/ia64/sn/pci/tioce_provider.c +++ b/arch/ia64/sn/pci/tioce_provider.c @@ -52,7 +52,7 @@ * All registers defined in struct tioce will meet that criteria. */ -static void inline +static inline void tioce_mmr_war_pre(struct tioce_kernel *kern, void __iomem *mmr_addr) { u64 mmr_base; @@ -78,7 +78,7 @@ tioce_mmr_war_pre(struct tioce_kernel *kern, void __iomem *mmr_addr) } } -static void inline +static inline void tioce_mmr_war_post(struct tioce_kernel *kern, void __iomem *mmr_addr) { u64 mmr_base; -- cgit v1.2.3 From 8fc7cb9f319b6fdf9c1fda374a5d418c22948d62 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:29 +1000 Subject: m68k: coldfire: move inline before return type Make the code like the rest of the kernel. Link: http://lkml.kernel.org/r/14db9c166d5b68efa77e337cfe49bb9b29bca3f7.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Acked-by: Greg Ungerer Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton --- arch/m68k/coldfire/intc-simr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/m68k/coldfire/intc-simr.c b/arch/m68k/coldfire/intc-simr.c index 7cf2c156f72d..15c4b7a6e38f 100644 --- a/arch/m68k/coldfire/intc-simr.c +++ b/arch/m68k/coldfire/intc-simr.c @@ -35,7 +35,7 @@ #define EINT7 67 /* EDGE Port interrupt 7 */ static unsigned int irqebitmap[] = { 0, 1, 4, 7 }; -static unsigned int inline irq2ebit(unsigned int irq) +static inline unsigned int irq2ebit(unsigned int irq) { return irqebitmap[irq - EINT0]; } @@ -51,7 +51,7 @@ static unsigned int inline irq2ebit(unsigned int irq) #define EINT1 65 /* EDGE Port interrupt 1 */ #define EINT7 71 /* EDGE Port interrupt 7 */ -static unsigned int inline irq2ebit(unsigned int irq) +static inline unsigned int irq2ebit(unsigned int irq) { return irq - EINT0; } -- cgit v1.2.3 From 0924e8d4e6f82ed5d4b680d7e02d2edf86c9e10e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:29 +1000 Subject: MIPS: SMP: move asmlinkage before return type Make the code like the rest of the kernel. Link: http://lkml.kernel.org/r/756d3fb543e981b9284e756fa27616725a354b28.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Ralf Baechle Signed-off-by: Andrew Morton --- arch/mips/include/asm/smp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h index 98a117a05fbc..bab3d41e5987 100644 --- a/arch/mips/include/asm/smp.h +++ b/arch/mips/include/asm/smp.h @@ -47,7 +47,7 @@ extern int __cpu_logical_map[NR_CPUS]; /* Mask of CPUs which are currently definitely operating coherently */ extern cpumask_t cpu_coherent_mask; -extern void asmlinkage smp_bootstrap(void); +extern asmlinkage void smp_bootstrap(void); extern void calculate_cpu_foreign_map(void); -- cgit v1.2.3 From 60d99217a582a81f009d61c176c7ca6d2b305a0a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:29 +1000 Subject: sh: move inline before return type Make the code like the rest of the kernel. Link: http://lkml.kernel.org/r/f81bb2a67a97b1fd8b6ea99bd350d8a0f6864fb1.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Yoshinori Sato Cc: Rich Felker Signed-off-by: Andrew Morton --- arch/sh/mm/cache-sh5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c index d94dadedf74f..445b5e69b73c 100644 --- a/arch/sh/mm/cache-sh5.c +++ b/arch/sh/mm/cache-sh5.c @@ -234,7 +234,7 @@ static void sh64_icache_inv_current_user_range(unsigned long start, unsigned lon #define DUMMY_ALLOCO_AREA_SIZE ((L1_CACHE_BYTES << 10) + (1024 * 4)) static unsigned char dummy_alloco_area[DUMMY_ALLOCO_AREA_SIZE] __cacheline_aligned = { 0, }; -static void inline sh64_dcache_purge_sets(int sets_to_purge_base, int n_sets) +static inline void sh64_dcache_purge_sets(int sets_to_purge_base, int n_sets) { /* Purge all ways in a particular block of sets, specified by the base set number and number of sets. Can handle wrap-around, if that's -- cgit v1.2.3 From 55e7b32be1238d96c6766532238d3f4751be8337 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:29 +1000 Subject: x86/efi: move asmlinkage before return type Make the code like the rest of the kernel. Link: http://lkml.kernel.org/r/1cd3d401626e51ea0e2333a860e76e80bc560a4c.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Matt Fleming Cc: Ard Biesheuvel Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton --- arch/x86/include/asm/efi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index d2ff779f347e..796ff6c1aa53 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -33,7 +33,7 @@ #ifdef CONFIG_X86_32 -extern unsigned long asmlinkage efi_call_phys(void *, ...); +extern asmlinkage unsigned long efi_call_phys(void *, ...); #define arch_efi_call_virt_setup() kernel_fpu_begin() #define arch_efi_call_virt_teardown() kernel_fpu_end() @@ -52,7 +52,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define EFI_LOADER_SIGNATURE "EL64" -extern u64 asmlinkage efi_call(void *fp, ...); +extern asmlinkage u64 efi_call(void *fp, ...); #define efi_call_phys(f, args...) efi_call((f), args) -- cgit v1.2.3 From 76d9bc220fdf6da97ebd3a11b6c338c7a6250f01 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:30 +1000 Subject: drivers: s390: move static and inline before return type Make the code like the rest of the kernel. Link: http://lkml.kernel.org/r/3f980cd89084ae09716353aba3171e4b3815e690.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Julian Wiedmann Cc: Ursula Braun Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton --- drivers/s390/net/ctcm_main.c | 2 +- drivers/s390/net/qeth_l3_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/net/ctcm_main.c b/drivers/s390/net/ctcm_main.c index 1563b1458e44..2ade6131a89f 100644 --- a/drivers/s390/net/ctcm_main.c +++ b/drivers/s390/net/ctcm_main.c @@ -1115,7 +1115,7 @@ static const struct net_device_ops ctcm_mpc_netdev_ops = { .ndo_start_xmit = ctcmpc_tx, }; -void static ctcm_dev_setup(struct net_device *dev) +static void ctcm_dev_setup(struct net_device *dev) { dev->type = ARPHRD_SLIP; dev->tx_queue_len = 100; diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 3062cde33a3d..8975cd321390 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -2408,7 +2408,7 @@ static int qeth_l3_do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) return rc; } -int inline qeth_l3_get_cast_type(struct qeth_card *card, struct sk_buff *skb) +inline int qeth_l3_get_cast_type(struct qeth_card *card, struct sk_buff *skb) { int cast_type = RTN_UNSPEC; struct neighbour *n = NULL; -- cgit v1.2.3 From f57b74fce556d27b1831f00d228bd74e5a105bac Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:30 +1000 Subject: drivers: tty: serial: move inline before return type Make the code like the rest of the kernel. Link: http://lkml.kernel.org/r/55d3e89d50bb03d603bfb28019fab07f48bdc714.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Pat Gefre Cc: Greg Kroah-Hartman Cc: Jiri Slaby Signed-off-by: Andrew Morton --- drivers/tty/serial/ioc3_serial.c | 4 ++-- drivers/tty/serial/ioc4_serial.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/tty/serial/ioc3_serial.c b/drivers/tty/serial/ioc3_serial.c index 2a61dd6b4009..906ee770ff4a 100644 --- a/drivers/tty/serial/ioc3_serial.c +++ b/drivers/tty/serial/ioc3_serial.c @@ -377,7 +377,7 @@ static struct ioc3_port *get_ioc3_port(struct uart_port *the_port) * called per port from attach... * @port: port to initialize */ -static int inline port_init(struct ioc3_port *port) +static inline int port_init(struct ioc3_port *port) { uint32_t sio_cr; struct port_hooks *hooks = port->ip_hooks; @@ -1430,7 +1430,7 @@ static int receive_chars(struct uart_port *the_port) * @pending: interrupts to handle */ -static int inline +static inline int ioc3uart_intr_one(struct ioc3_submodule *is, struct ioc3_driver_data *idd, unsigned int pending) diff --git a/drivers/tty/serial/ioc4_serial.c b/drivers/tty/serial/ioc4_serial.c index f96bcf9bee25..43d7d32eb150 100644 --- a/drivers/tty/serial/ioc4_serial.c +++ b/drivers/tty/serial/ioc4_serial.c @@ -824,7 +824,7 @@ pending_intrs(struct ioc4_soft *soft, int type) * called per port from attach... * @port: port to initialize */ -static int inline port_init(struct ioc4_port *port) +static inline int port_init(struct ioc4_port *port) { uint32_t sio_cr; struct hooks *hooks = port->ip_hooks; @@ -1048,7 +1048,7 @@ static irqreturn_t ioc4_intr(int irq, void *arg) * IOC4 with serial ports in the system. * @idd: Master module data for this IOC4 */ -static int inline ioc4_attach_local(struct ioc4_driver_data *idd) +static inline int ioc4_attach_local(struct ioc4_driver_data *idd) { struct ioc4_port *port; struct ioc4_port *ports[IOC4_NUM_SERIAL_PORTS]; -- cgit v1.2.3 From fd8eab5cb1193978ea923b5b5f480f5dcad52026 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:30 +1000 Subject: USB: serial: safe_serial: move __inline__ before return type Make the code like the rest of the kernel. Also use inline instead of __inline__. Link: http://lkml.kernel.org/r/a5072b74b6c293e6ec93c4900482e9d3267f15b2.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Johan Hovold Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- drivers/usb/serial/safe_serial.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/serial/safe_serial.c b/drivers/usb/serial/safe_serial.c index 8a069aa154ed..27d7a7016298 100644 --- a/drivers/usb/serial/safe_serial.c +++ b/drivers/usb/serial/safe_serial.c @@ -180,7 +180,7 @@ static const __u16 crc10_table[256] = { * Perform a memcpy and calculate fcs using ppp 10bit CRC algorithm. Return * new 10 bit FCS. */ -static __u16 __inline__ fcs_compute10(unsigned char *sp, int len, __u16 fcs) +static inline __u16 fcs_compute10(unsigned char *sp, int len, __u16 fcs) { for (; len-- > 0; fcs = CRC10_FCS(fcs, *sp++)); return fcs; -- cgit v1.2.3 From a0617b888b702f3056cb9795b9892df1613ae03d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:30 +1000 Subject: video: fbdev: intelfb: move inline before return type Make the code like the rest of the kernel. But there is an oddity here because the inline should probably be removed. It's an extern function in intelfb.h and it is used in intelfbdrv.c and intelfbhw.c. The inline is kept here as I suppose it's possible for some compiler to make the uses inline in intelfbdrv and and also create an external function for intelfbhw. Link: http://lkml.kernel.org/r/8ba151a1fdc84e42cbf4aafc798513c0158edee1.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Maik Broemme Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton --- drivers/video/fbdev/intelfb/intelfbdrv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/intelfb/intelfbdrv.c b/drivers/video/fbdev/intelfb/intelfbdrv.c index 6b444400a86c..ffc391208b27 100644 --- a/drivers/video/fbdev/intelfb/intelfbdrv.c +++ b/drivers/video/fbdev/intelfb/intelfbdrv.c @@ -907,7 +907,7 @@ static void intelfb_pci_unregister(struct pci_dev *pdev) * helper functions * ***************************************************************/ -int __inline__ intelfb_var_to_depth(const struct fb_var_screeninfo *var) +__inline__ int intelfb_var_to_depth(const struct fb_var_screeninfo *var) { DBG_MSG("intelfb_var_to_depth: bpp: %d, green.length is %d\n", var->bits_per_pixel, var->green.length); -- cgit v1.2.3 From 196b60ec95e00317ac2635a53fbb9eb521925300 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:30 +1000 Subject: video: fbdev: omap: move inline before return type Make the code like the rest of the kernel. Link: http://lkml.kernel.org/r/bc5927726abc70d7c066df7ab4cb7cfce4a7b577.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Cc: Tomi Valkeinen Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton --- drivers/video/fbdev/omap/lcdc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/video/fbdev/omap/lcdc.c b/drivers/video/fbdev/omap/lcdc.c index e3d9b9ea5498..938cba0d24ae 100644 --- a/drivers/video/fbdev/omap/lcdc.c +++ b/drivers/video/fbdev/omap/lcdc.c @@ -79,12 +79,12 @@ static struct omap_lcd_controller { unsigned long vram_size; } lcdc; -static void inline enable_irqs(int mask) +static inline void enable_irqs(int mask) { lcdc.irq_mask |= mask; } -static void inline disable_irqs(int mask) +static inline void disable_irqs(int mask) { lcdc.irq_mask &= ~mask; } @@ -466,7 +466,7 @@ static void calc_ck_div(int is_tft, int pck, int *pck_div) } } -static void inline setup_regs(void) +static inline void setup_regs(void) { u32 l; struct lcd_panel *panel = lcdc.fbdev->panel; -- cgit v1.2.3 From 6c4239003a2c4cb37b5177144c3e7effe62abda1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jul 2017 09:23:30 +1000 Subject: ARM: samsung: usb-ohci: move inline before return type Make the code like the rest of the kernel. Link: http://lkml.kernel.org/r/667a515b8d0f10f2465d519f8595edd91552fc5e.1499284835.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton --- include/linux/platform_data/usb-ohci-s3c2410.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/platform_data/usb-ohci-s3c2410.h b/include/linux/platform_data/usb-ohci-s3c2410.h index 7fa1fbefc3f2..cc7554ae6e8b 100644 --- a/include/linux/platform_data/usb-ohci-s3c2410.h +++ b/include/linux/platform_data/usb-ohci-s3c2410.h @@ -31,7 +31,7 @@ struct s3c2410_hcd_info { void (*report_oc)(struct s3c2410_hcd_info *, int ports); }; -static void inline s3c2410_usb_report_oc(struct s3c2410_hcd_info *info, int ports) +static inline void s3c2410_usb_report_oc(struct s3c2410_hcd_info *info, int ports) { if (info->report_oc != NULL) { (info->report_oc)(info, ports); -- cgit v1.2.3 From 8a9acbfbb944ac269cb844aec97292c23c0587c4 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 11 Jul 2017 09:23:30 +1000 Subject: sparc64: NG4 memset 32 bits overflow Early in boot Linux patches memset and memcpy to branch to platform optimized versions of these routines. The NG4 (Niagra 4) versions are currently used on all platforms starting from T4. Recently, there were M7 optimized routines added into UEK4 but not into mainline yet. So, even with M7 optimized routines NG4 are still going to be used on T4, T5, M5, and M6 processors. While investigating how to improve initialization time of dentry_hashtable which is 8G long on M6 ldom with 7T of main memory, I noticed that memset() does not reset all the memory in this array, after studying the code, I realized that NG4memset() branches use %icc register instead of %xcc to check compare, so if value of length is over 32-bit long, which is true for 8G array, these routines fail to work properly. The fix is to replace all %icc with %xcc in these routines. (Alternative is to use %ncc, but this is misleading, as the code already has sparcv9 only instructions, and cannot be compiled on 32-bit). This is important to fix this bug, because even older T4-4 can have 2T of memory, and there are large memory proportional data structures in kernel which can be larger than 4G in size. The failing of memset() is silent and corruption is hard to detect. Link: http://lkml.kernel.org/r/1488432825-92126-2-git-send-email-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Babu Moger Cc: David Miller Cc: Al Viro Signed-off-by: Andrew Morton --- arch/sparc/lib/NG4memset.S | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/sparc/lib/NG4memset.S b/arch/sparc/lib/NG4memset.S index 7c0c81f18837..c07834f3a6fc 100644 --- a/arch/sparc/lib/NG4memset.S +++ b/arch/sparc/lib/NG4memset.S @@ -13,14 +13,14 @@ .globl NG4memset NG4memset: andcc %o1, 0xff, %o4 - be,pt %icc, 1f + be,pt %xcc, 1f mov %o2, %o1 sllx %o4, 8, %g1 or %g1, %o4, %o2 sllx %o2, 16, %g1 or %g1, %o2, %o2 sllx %o2, 32, %g1 - ba,pt %icc, 1f + ba,pt %xcc, 1f or %g1, %o2, %o4 .size NG4memset,.-NG4memset @@ -29,7 +29,7 @@ NG4memset: NG4bzero: clr %o4 1: cmp %o1, 16 - ble %icc, .Ltiny + ble %xcc, .Ltiny mov %o0, %o3 sub %g0, %o0, %g1 and %g1, 0x7, %g1 @@ -37,7 +37,7 @@ NG4bzero: sub %o1, %g1, %o1 1: stb %o4, [%o0 + 0x00] subcc %g1, 1, %g1 - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 1, %o0 .Laligned8: cmp %o1, 64 + (64 - 8) @@ -48,7 +48,7 @@ NG4bzero: sub %o1, %g1, %o1 1: stx %o4, [%o0 + 0x00] subcc %g1, 8, %g1 - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 0x8, %o0 .Laligned64: andn %o1, 64 - 1, %g1 @@ -58,30 +58,30 @@ NG4bzero: 1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P subcc %g1, 0x40, %g1 stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 0x40, %o0 .Lpostloop: cmp %o1, 8 - bl,pn %icc, .Ltiny + bl,pn %xcc, .Ltiny membar #StoreStore|#StoreLoad .Lmedium: andn %o1, 0x7, %g1 sub %o1, %g1, %o1 1: stx %o4, [%o0 + 0x00] subcc %g1, 0x8, %g1 - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 0x08, %o0 andcc %o1, 0x4, %g1 - be,pt %icc, .Ltiny + be,pt %xcc, .Ltiny sub %o1, %g1, %o1 stw %o4, [%o0 + 0x00] add %o0, 0x4, %o0 .Ltiny: cmp %o1, 0 - be,pn %icc, .Lexit + be,pn %xcc, .Lexit 1: subcc %o1, 1, %o1 stb %o4, [%o0 + 0x00] - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 1, %o0 .Lexit: retl @@ -99,8 +99,8 @@ NG4bzero: stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P - bne,pt %icc, 1b + bne,pt %xcc, 1b add %o0, 0x30, %o0 - ba,a,pt %icc, .Lpostloop + ba,a,pt %xcc, .Lpostloop nop .size NG4bzero,.-NG4bzero -- cgit v1.2.3 From b2bed66ba8873b0bd27ad324071550a4cdff4c6b Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 11 Jul 2017 09:23:30 +1000 Subject: xtensa: use generic fb.h The arch uses a verbatim copy of the asm-generic version and does not add any own implementations to the header, so use asm-generic/fb.h instead of duplicating code. Link: http://lkml.kernel.org/r/20170517083545.2115-1-tklauser@distanz.ch Signed-off-by: Tobias Klauser Acked-by: Max Filippov Signed-off-by: Andrew Morton --- arch/xtensa/include/asm/Kbuild | 1 + arch/xtensa/include/asm/fb.h | 12 ------------ 2 files changed, 1 insertion(+), 12 deletions(-) delete mode 100644 arch/xtensa/include/asm/fb.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index c04efde775a5..2d716ebc5a5e 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -5,6 +5,7 @@ generic-y += dma-contiguous.h generic-y += emergency-restart.h generic-y += exec.h generic-y += extable.h +generic-y += fb.h generic-y += hardirq.h generic-y += irq_regs.h generic-y += irq_work.h diff --git a/arch/xtensa/include/asm/fb.h b/arch/xtensa/include/asm/fb.h deleted file mode 100644 index c7df38030992..000000000000 --- a/arch/xtensa/include/asm/fb.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASM_FB_H_ -#define _ASM_FB_H_ -#include - -#define fb_pgprotect(...) do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H_ */ -- cgit v1.2.3 From 2f320f78ec812fb0644e133c469fc80feb5cf6f6 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 11 Jul 2017 09:23:31 +1000 Subject: MAINTAINERS: give kmod some maintainer love As suggested by Jessica, I've been actively working on kmod, so might as well reflect its maintained status. Changes are expected to go through akpm's tree. Link: http://lkml.kernel.org/r/20170628223155.26472-2-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Suggested-by: Jessica Yu Cc: Shuah Khan Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 789a05d08746..35da53f48794 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7575,6 +7575,13 @@ F: include/linux/kmemleak.h F: mm/kmemleak.c F: mm/kmemleak-test.c +KMOD MODULE USERMODE HELPER +M: "Luis R. Rodriguez" +L: linux-kernel@vger.kernel.org +S: Maintained +F: kernel/kmod.c +F: include/linux/kmod.h + KPROBES M: Ananth N Mavinakayanahalli M: Anil S Keshavamurthy -- cgit v1.2.3 From 0a5653e6f90b64a310c1336fe58e444ce99169b3 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 11 Jul 2017 09:23:31 +1000 Subject: kmod: add test driver to stress test the module loader This adds a new stress test driver for kmod: the kernel module loader. The new stress test driver, test_kmod, is only enabled as a module right now. It should be possible to load this as built-in and load tests early (refer to the force_init_test module parameter), however since a lot of test can get a system out of memory fast we leave this disabled for now. Using a system with 1024 MiB of RAM can *easily* get your kernel OOM fast with this test driver. The test_kmod driver exposes API knobs for us to fine tune simple request_module() and get_fs_type() calls. Since these API calls only allow each one parameter a test driver for these is rather simple. Other factors that can help out test driver though are the number of calls we issue and knowing current limitations of each. This exposes configuration as much as possible through userspace to be able to build tests directly from userspace. Since it allows multiple misc devices its will eventually (once we add a knob to let us create new devices at will) also be possible to perform more tests in parallel, provided you have enough memory. We only enable tests we know work as of right now. Demo screenshots: # tools/testing/selftests/kmod/kmod.sh kmod_test_0001_driver: OK! - loading kmod test kmod_test_0001_driver: OK! - Return value: 256 (MODULE_NOT_FOUND), expected MODULE_NOT_FOUND kmod_test_0001_fs: OK! - loading kmod test kmod_test_0001_fs: OK! - Return value: -22 (-EINVAL), expected -EINVAL kmod_test_0002_driver: OK! - loading kmod test kmod_test_0002_driver: OK! - Return value: 256 (MODULE_NOT_FOUND), expected MODULE_NOT_FOUND kmod_test_0002_fs: OK! - loading kmod test kmod_test_0002_fs: OK! - Return value: -22 (-EINVAL), expected -EINVAL kmod_test_0003: OK! - loading kmod test kmod_test_0003: OK! - Return value: 0 (SUCCESS), expected SUCCESS kmod_test_0004: OK! - loading kmod test kmod_test_0004: OK! - Return value: 0 (SUCCESS), expected SUCCESS kmod_test_0005: OK! - loading kmod test kmod_test_0005: OK! - Return value: 0 (SUCCESS), expected SUCCESS kmod_test_0006: OK! - loading kmod test kmod_test_0006: OK! - Return value: 0 (SUCCESS), expected SUCCESS kmod_test_0005: OK! - loading kmod test kmod_test_0005: OK! - Return value: 0 (SUCCESS), expected SUCCESS kmod_test_0006: OK! - loading kmod test kmod_test_0006: OK! - Return value: 0 (SUCCESS), expected SUCCESS XXX: add test restult for 0007 Test completed You can also request for specific tests: # tools/testing/selftests/kmod/kmod.sh -t 0001 kmod_test_0001_driver: OK! - loading kmod test kmod_test_0001_driver: OK! - Return value: 256 (MODULE_NOT_FOUND), expected MODULE_NOT_FOUND kmod_test_0001_fs: OK! - loading kmod test kmod_test_0001_fs: OK! - Return value: -22 (-EINVAL), expected -EINVAL Test completed Lastly, the current available number of tests: # tools/testing/selftests/kmod/kmod.sh --help Usage: tools/testing/selftests/kmod/kmod.sh [ -t <4-number-digit> ] Valid tests: 0001-0009 0001 - Simple test - 1 thread for empty string 0002 - Simple test - 1 thread for modules/filesystems that do not exist 0003 - Simple test - 1 thread for get_fs_type() only 0004 - Simple test - 2 threads for get_fs_type() only 0005 - multithreaded tests with default setup - request_module() only 0006 - multithreaded tests with default setup - get_fs_type() only 0007 - multithreaded tests with default setup test request_module() and get_fs_type() 0008 - multithreaded - push kmod_concurrent over max_modprobes for request_module() 0009 - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type() The following test cases currently fail, as such they are not currently enabled by default: # tools/testing/selftests/kmod/kmod.sh -t 0008 # tools/testing/selftests/kmod/kmod.sh -t 0009 To be sure to run them as intended please unload both of the modules: o test_module o xfs And ensure they are not loaded on your system prior to testing them. If you use these paritions for your rootfs you can change the default test driver used for get_fs_type() by exporting it into your environment. For example of other test defaults you can override refer to kmod.sh allow_user_defaults(). Behind the scenes this is how we fine tune at a test case prior to hitting a trigger to run it: cat /sys/devices/virtual/misc/test_kmod0/config echo -n "2" > /sys/devices/virtual/misc/test_kmod0/config_test_case echo -n "ext4" > /sys/devices/virtual/misc/test_kmod0/config_test_fs echo -n "80" > /sys/devices/virtual/misc/test_kmod0/config_num_threads cat /sys/devices/virtual/misc/test_kmod0/config echo -n "1" > /sys/devices/virtual/misc/test_kmod0/config_num_threads Finally to trigger: echo -n "1" > /sys/devices/virtual/misc/test_kmod0/trigger_config The kmod.sh script uses the above constructs to build different test cases. A bit of interpretation of the current failures follows, first two premises: a) When request_module() is used userspace figures out an optimized version of module order for us. Once it finds the modules it needs, as per depmod symbol dep map, it will finit_module() the respective modules which are needed for the original request_module() request. b) We have an optimization in place whereby if a kernel uses request_module() on a module already loaded we never bother userspace as the module already is loaded. This is all handled by kernel/kmod.c. A few things to consider to help identify root causes of issues: 0) kmod 19 has a broken heuristic for modules being assumed to be built-in to your kernel and will return 0 even though request_module() failed. Upgrade to a newer version of kmod. 1) A get_fs_type() call for "xfs" will request_module() for "fs-xfs", not for "xfs". The optimization in kernel described in b) fails to catch if we have a lot of consecutive get_fs_type() calls. The reason is the optimization in place does not look for aliases. This means two consecutive get_fs_type() calls will bump kmod_concurrent, whereas request_module() will not. This one explanation why test case 0009 fails at least once for get_fs_type(). 2) If a module fails to load --- for whatever reason (kmod_concurrent limit reached, file not yet present due to rootfs switch, out of memory) we have a period of time during which module request for the same name either with request_module() or get_fs_type() will *also* fail to load even if the file for the module is ready. This explains why *multiple* NULLs are possible on test 0009. 3) finit_module() consumes quite a bit of memory. 4) Filesystems typically also have more dependent modules than other modules, its important to note though that even though a get_fs_type() call does not incur additional kmod_concurrent bumps, since userspace loads dependencies it finds it needs via finit_module_fd(), it *will* take much more memory to load a module with a lot of dependencies. Because of 3) and 4) we will easily run into out of memory failures with certain tests. For instance test 0006 fails on qemu with 1024 MiB of RAM. It panics a box after reaping all userspace processes and still not having enough memory to reap. Link: http://lkml.kernel.org/r/20170628223155.26472-3-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Jessica Yu Cc: Shuah Khan Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- MAINTAINERS | 2 + lib/Kconfig.debug | 25 + lib/Makefile | 1 + lib/test_kmod.c | 1246 +++++++++++++++++++++++++++++++++ tools/testing/selftests/kmod/Makefile | 11 + tools/testing/selftests/kmod/config | 7 + tools/testing/selftests/kmod/kmod.sh | 635 +++++++++++++++++ 7 files changed, 1927 insertions(+) create mode 100644 lib/test_kmod.c create mode 100644 tools/testing/selftests/kmod/Makefile create mode 100644 tools/testing/selftests/kmod/config create mode 100644 tools/testing/selftests/kmod/kmod.sh diff --git a/MAINTAINERS b/MAINTAINERS index 35da53f48794..2af3ab3f7702 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7581,6 +7581,8 @@ L: linux-kernel@vger.kernel.org S: Maintained F: kernel/kmod.c F: include/linux/kmod.h +F: lib/test_kmod.c +F: tools/testing/selftests/kmod/ KPROBES M: Ananth N Mavinakayanahalli diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 642c2af57e10..b0c35e0667df 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1864,6 +1864,31 @@ config BUG_ON_DATA_CORRUPTION If unsure, say N. +config TEST_KMOD + tristate "kmod stress tester" + default n + depends on m + select TEST_LKM + select XFS_FS + select TUN + select BTRFS_FS + help + Test the kernel's module loading mechanism: kmod. kmod implements + support to load modules using the Linux kernel's usermode helper. + This test provides a series of tests against kmod. + + Although technically you can either build test_kmod as a module or + into the kernel we disallow building it into the kernel since + it stress tests request_module() and this will very likely cause + some issues by taking over precious threads available from other + module load requests, ultimately this could be fatal. + + To run tests run: + + tools/testing/selftests/kmod/kmod.sh --help + + If unsure, say N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" diff --git a/lib/Makefile b/lib/Makefile index 5a008329324e..678c6c38a991 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -60,6 +60,7 @@ obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o obj-$(CONFIG_TEST_UUID) += test_uuid.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o +obj-$(CONFIG_TEST_KMOD) += test_kmod.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/test_kmod.c b/lib/test_kmod.c new file mode 100644 index 000000000000..6c1d678bcf8b --- /dev/null +++ b/lib/test_kmod.c @@ -0,0 +1,1246 @@ +/* + * kmod stress test driver + * + * Copyright (C) 2017 Luis R. Rodriguez + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or at your option any + * later version; or, when distributed separately from the Linux kernel or + * when incorporated into other software packages, subject to the following + * license: + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of copyleft-next (version 0.3.1 or later) as published + * at http://copyleft-next.org/. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +/* + * This driver provides an interface to trigger and test the kernel's + * module loader through a series of configurations and a few triggers. + * To test this driver use the following script as root: + * + * tools/testing/selftests/kmod/kmod.sh --help + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TEST_START_NUM_THREADS 50 +#define TEST_START_DRIVER "test_module" +#define TEST_START_TEST_FS "xfs" +#define TEST_START_TEST_CASE TEST_KMOD_DRIVER + + +static bool force_init_test = false; +module_param(force_init_test, bool_enable_only, 0644); +MODULE_PARM_DESC(force_init_test, + "Force kicking a test immediately after driver loads"); + +/* + * For device allocation / registration + */ +static DEFINE_MUTEX(reg_dev_mutex); +static LIST_HEAD(reg_test_devs); + +/* + * num_test_devs actually represents the *next* ID of the next + * device we will allow to create. + */ +static int num_test_devs; + +/** + * enum kmod_test_case - linker table test case + * + * If you add a test case, please be sure to review if you need to se + * @need_mod_put for your tests case. + * + * @TEST_KMOD_DRIVER: stress tests request_module() + * @TEST_KMOD_FS_TYPE: stress tests get_fs_type() + */ +enum kmod_test_case { + __TEST_KMOD_INVALID = 0, + + TEST_KMOD_DRIVER, + TEST_KMOD_FS_TYPE, + + __TEST_KMOD_MAX, +}; + +struct test_config { + char *test_driver; + char *test_fs; + unsigned int num_threads; + enum kmod_test_case test_case; + int test_result; +}; + +struct kmod_test_device; + +/** + * kmod_test_device_info - thread info + * + * @ret_sync: return value if request_module() is used, sync request for + * @TEST_KMOD_DRIVER + * @fs_sync: return value of get_fs_type() for @TEST_KMOD_FS_TYPE + * @thread_idx: thread ID + * @test_dev: test device test is being performed under + * @need_mod_put: Some tests (get_fs_type() is one) requires putting the module + * (module_put(fs_sync->owner)) when done, otherwise you will not be able + * to unload the respective modules and re-test. We use this to keep + * accounting of when we need this and to help out in case we need to + * error out and deal with module_put() on error. + */ +struct kmod_test_device_info { + int ret_sync; + struct file_system_type *fs_sync; + struct task_struct *task_sync; + unsigned int thread_idx; + struct kmod_test_device *test_dev; + bool need_mod_put; +}; + +/** + * kmod_test_device - test device to help test kmod + * + * @dev_idx: unique ID for test device + * @config: configuration for the test + * @misc_dev: we use a misc device under the hood + * @dev: pointer to misc_dev's own struct device + * @config_mutex: protects configuration of test + * @trigger_mutex: the test trigger can only be fired once at a time + * @thread_lock: protects @done count, and the @info per each thread + * @done: number of threads which have completed or failed + * @test_is_oom: when we run out of memory, use this to halt moving forward + * @kthreads_done: completion used to signal when all work is done + * @list: needed to be part of the reg_test_devs + * @info: array of info for each thread + */ +struct kmod_test_device { + int dev_idx; + struct test_config config; + struct miscdevice misc_dev; + struct device *dev; + struct mutex config_mutex; + struct mutex trigger_mutex; + struct mutex thread_mutex; + + unsigned int done; + + bool test_is_oom; + struct completion kthreads_done; + struct list_head list; + + struct kmod_test_device_info *info; +}; + +static const char *test_case_str(enum kmod_test_case test_case) +{ + switch (test_case) { + case TEST_KMOD_DRIVER: + return "TEST_KMOD_DRIVER"; + case TEST_KMOD_FS_TYPE: + return "TEST_KMOD_FS_TYPE"; + default: + return "invalid"; + } +} + +static struct miscdevice *dev_to_misc_dev(struct device *dev) +{ + return dev_get_drvdata(dev); +} + +static struct kmod_test_device *misc_dev_to_test_dev(struct miscdevice *misc_dev) +{ + return container_of(misc_dev, struct kmod_test_device, misc_dev); +} + +static struct kmod_test_device *dev_to_test_dev(struct device *dev) +{ + struct miscdevice *misc_dev; + + misc_dev = dev_to_misc_dev(dev); + + return misc_dev_to_test_dev(misc_dev); +} + +/* Must run with thread_mutex held */ +static void kmod_test_done_check(struct kmod_test_device *test_dev, + unsigned int idx) +{ + struct test_config *config = &test_dev->config; + + test_dev->done++; + dev_dbg(test_dev->dev, "Done thread count: %u\n", test_dev->done); + + if (test_dev->done == config->num_threads) { + dev_info(test_dev->dev, "Done: %u threads have all run now\n", + test_dev->done); + dev_info(test_dev->dev, "Last thread to run: %u\n", idx); + complete(&test_dev->kthreads_done); + } +} + +static void test_kmod_put_module(struct kmod_test_device_info *info) +{ + struct kmod_test_device *test_dev = info->test_dev; + struct test_config *config = &test_dev->config; + + if (!info->need_mod_put) + return; + + switch (config->test_case) { + case TEST_KMOD_DRIVER: + break; + case TEST_KMOD_FS_TYPE: + if (info && info->fs_sync && info->fs_sync->owner) + module_put(info->fs_sync->owner); + break; + default: + BUG(); + } + + info->need_mod_put = true; +} + +static int run_request(void *data) +{ + struct kmod_test_device_info *info = data; + struct kmod_test_device *test_dev = info->test_dev; + struct test_config *config = &test_dev->config; + + switch (config->test_case) { + case TEST_KMOD_DRIVER: + info->ret_sync = request_module("%s", config->test_driver); + break; + case TEST_KMOD_FS_TYPE: + info->fs_sync = get_fs_type(config->test_fs); + info->need_mod_put = true; + break; + default: + /* __trigger_config_run() already checked for test sanity */ + BUG(); + return -EINVAL; + } + + dev_dbg(test_dev->dev, "Ran thread %u\n", info->thread_idx); + + test_kmod_put_module(info); + + mutex_lock(&test_dev->thread_mutex); + info->task_sync = NULL; + kmod_test_done_check(test_dev, info->thread_idx); + mutex_unlock(&test_dev->thread_mutex); + + return 0; +} + +static int tally_work_test(struct kmod_test_device_info *info) +{ + struct kmod_test_device *test_dev = info->test_dev; + struct test_config *config = &test_dev->config; + int err_ret = 0; + + switch (config->test_case) { + case TEST_KMOD_DRIVER: + /* + * Only capture errors, if one is found that's + * enough, for now. + */ + if (info->ret_sync != 0) + err_ret = info->ret_sync; + dev_info(test_dev->dev, + "Sync thread %d return status: %d\n", + info->thread_idx, info->ret_sync); + break; + case TEST_KMOD_FS_TYPE: + /* For now we make this simple */ + if (!info->fs_sync) + err_ret = -EINVAL; + dev_info(test_dev->dev, "Sync thread %u fs: %s\n", + info->thread_idx, info->fs_sync ? config->test_fs : + "NULL"); + break; + default: + BUG(); + } + + return err_ret; +} + +/* + * XXX: add result option to display if all errors did not match. + * For now we just keep any error code if one was found. + * + * If this ran it means *all* tasks were created fine and we + * are now just collecting results. + * + * Only propagate errors, do not override with a subsequent sucess case. + */ +static void tally_up_work(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + struct kmod_test_device_info *info; + unsigned int idx; + int err_ret = 0; + int ret = 0; + + mutex_lock(&test_dev->thread_mutex); + + dev_info(test_dev->dev, "Results:\n"); + + for (idx=0; idx < config->num_threads; idx++) { + info = &test_dev->info[idx]; + ret = tally_work_test(info); + if (ret) + err_ret = ret; + } + + /* + * Note: request_module() returns 256 for a module not found even + * though modprobe itself returns 1. + */ + config->test_result = err_ret; + + mutex_unlock(&test_dev->thread_mutex); +} + +static int try_one_request(struct kmod_test_device *test_dev, unsigned int idx) +{ + struct kmod_test_device_info *info = &test_dev->info[idx]; + int fail_ret = -ENOMEM; + + mutex_lock(&test_dev->thread_mutex); + + info->thread_idx = idx; + info->test_dev = test_dev; + info->task_sync = kthread_run(run_request, info, "%s-%u", + KBUILD_MODNAME, idx); + + if (!info->task_sync || IS_ERR(info->task_sync)) { + test_dev->test_is_oom = true; + dev_err(test_dev->dev, "Setting up thread %u failed\n", idx); + info->task_sync = NULL; + goto err_out; + } else + dev_dbg(test_dev->dev, "Kicked off thread %u\n", idx); + + mutex_unlock(&test_dev->thread_mutex); + + return 0; + +err_out: + info->ret_sync = fail_ret; + mutex_unlock(&test_dev->thread_mutex); + + return fail_ret; +} + +static void test_dev_kmod_stop_tests(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + struct kmod_test_device_info *info; + unsigned int i; + + dev_info(test_dev->dev, "Ending request_module() tests\n"); + + mutex_lock(&test_dev->thread_mutex); + + for (i=0; i < config->num_threads; i++) { + info = &test_dev->info[i]; + if (info->task_sync && !IS_ERR(info->task_sync)) { + dev_info(test_dev->dev, + "Stopping still-running thread %i\n", i); + kthread_stop(info->task_sync); + } + + /* + * info->task_sync is well protected, it can only be + * NULL or a pointer to a struct. If its NULL we either + * never ran, or we did and we completed the work. Completed + * tasks *always* put the module for us. This is a sanity + * check -- just in case. + */ + if (info->task_sync && info->need_mod_put) + test_kmod_put_module(info); + } + + mutex_unlock(&test_dev->thread_mutex); +} + +/* + * Only wait *iff* we did not run into any errors during all of our thread + * set up. If run into any issues we stop threads and just bail out with + * an error to the trigger. This also means we don't need any tally work + * for any threads which fail. + */ +static int try_requests(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + unsigned int idx; + int ret; + bool any_error = false; + + for (idx=0; idx < config->num_threads; idx++) { + if (test_dev->test_is_oom) { + any_error = true; + break; + } + + ret = try_one_request(test_dev, idx); + if (ret) { + any_error = true; + break; + } + } + + if (!any_error) { + test_dev->test_is_oom = false; + dev_info(test_dev->dev, + "No errors were found while initializing threads\n"); + wait_for_completion(&test_dev->kthreads_done); + tally_up_work(test_dev); + } else { + test_dev->test_is_oom = true; + dev_info(test_dev->dev, + "At least one thread failed to start, stop all work\n"); + test_dev_kmod_stop_tests(test_dev); + return -ENOMEM; + } + + return 0; +} + +static int run_test_driver(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + + dev_info(test_dev->dev, "Test case: %s (%u)\n", + test_case_str(config->test_case), + config->test_case); + dev_info(test_dev->dev, "Test driver to load: %s\n", + config->test_driver); + dev_info(test_dev->dev, "Number of threads to run: %u\n", + config->num_threads); + dev_info(test_dev->dev, "Thread IDs will range from 0 - %u\n", + config->num_threads - 1); + + return try_requests(test_dev); +} + +static int run_test_fs_type(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + + dev_info(test_dev->dev, "Test case: %s (%u)\n", + test_case_str(config->test_case), + config->test_case); + dev_info(test_dev->dev, "Test filesystem to load: %s\n", + config->test_fs); + dev_info(test_dev->dev, "Number of threads to run: %u\n", + config->num_threads); + dev_info(test_dev->dev, "Thread IDs will range from 0 - %u\n", + config->num_threads - 1); + + return try_requests(test_dev); +} + +static ssize_t config_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + int len = 0; + + mutex_lock(&test_dev->config_mutex); + + len += snprintf(buf, PAGE_SIZE, + "Custom trigger configuration for: %s\n", + dev_name(dev)); + + len += snprintf(buf+len, PAGE_SIZE - len, + "Number of threads:\t%u\n", + config->num_threads); + + len += snprintf(buf+len, PAGE_SIZE - len, + "Test_case:\t%s (%u)\n", + test_case_str(config->test_case), + config->test_case); + + if (config->test_driver) + len += snprintf(buf+len, PAGE_SIZE - len, + "driver:\t%s\n", + config->test_driver); + else + len += snprintf(buf+len, PAGE_SIZE - len, + "driver:\tEMTPY\n"); + + if (config->test_fs) + len += snprintf(buf+len, PAGE_SIZE - len, + "fs:\t%s\n", + config->test_fs); + else + len += snprintf(buf+len, PAGE_SIZE - len, + "fs:\tEMTPY\n"); + + mutex_unlock(&test_dev->config_mutex); + + return len; +} +static DEVICE_ATTR_RO(config); + +/* + * This ensures we don't allow kicking threads through if our configuration + * is faulty. + */ +static int __trigger_config_run(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + + test_dev->done = 0; + + switch (config->test_case) { + case TEST_KMOD_DRIVER: + return run_test_driver(test_dev); + case TEST_KMOD_FS_TYPE: + return run_test_fs_type(test_dev); + default: + dev_warn(test_dev->dev, + "Invalid test case requested: %u\n", + config->test_case); + return -EINVAL; + } +} + +static int trigger_config_run(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + int ret; + + mutex_lock(&test_dev->trigger_mutex); + mutex_lock(&test_dev->config_mutex); + + ret = __trigger_config_run(test_dev); + if (ret < 0) + goto out; + dev_info(test_dev->dev, "General test result: %d\n", + config->test_result); + + /* + * We must return 0 after a trigger even unless something went + * wrong with the setup of the test. If the test setup went fine + * then userspace must just check the result of config->test_result. + * One issue with relying on the return from a call in the kernel + * is if the kernel returns a possitive value using this trigger + * will not return the value to userspace, it would be lost. + * + * By not relying on capturing the return value of tests we are using + * through the trigger it also us to run tests with set -e and only + * fail when something went wrong with the driver upon trigger + * requests. + */ + ret = 0; + +out: + mutex_unlock(&test_dev->config_mutex); + mutex_unlock(&test_dev->trigger_mutex); + + return ret; +} + +static ssize_t +trigger_config_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + int ret; + + if (test_dev->test_is_oom) + return -ENOMEM; + + /* For all intents and purposes we don't care what userspace + * sent this trigger, we care only that we were triggered. + * We treat the return value only for caputuring issues with + * the test setup. At this point all the test variables should + * have been allocated so typically this should never fail. + */ + ret = trigger_config_run(test_dev); + if (unlikely(ret < 0)) + goto out; + + /* + * Note: any return > 0 will be treated as success + * and the error value will not be available to userspace. + * Do not rely on trying to send to userspace a test value + * return value as possitive return errors will be lost. + */ + if (WARN_ON(ret > 0)) + return -EINVAL; + + ret = count; +out: + return ret; +} +static DEVICE_ATTR_WO(trigger_config); + +/* + * XXX: move to kstrncpy() once merged. + * + * Users should use kfree_const() when freeing these. + */ +static int __kstrncpy(char **dst, const char *name, size_t count, gfp_t gfp) +{ + *dst = kstrndup(name, count, gfp); + if (!*dst) + return -ENOSPC; + return count; +} + +static int config_copy_test_driver_name(struct test_config *config, + const char *name, + size_t count) +{ + return __kstrncpy(&config->test_driver, name, count, GFP_KERNEL); +} + + +static int config_copy_test_fs(struct test_config *config, const char *name, + size_t count) +{ + return __kstrncpy(&config->test_fs, name, count, GFP_KERNEL); +} + +static void __kmod_config_free(struct test_config *config) +{ + if (!config) + return; + + kfree_const(config->test_driver); + config->test_driver = NULL; + + kfree_const(config->test_fs); + config->test_driver = NULL; +} + +static void kmod_config_free(struct kmod_test_device *test_dev) +{ + struct test_config *config; + + if (!test_dev) + return; + + config = &test_dev->config; + + mutex_lock(&test_dev->config_mutex); + __kmod_config_free(config); + mutex_unlock(&test_dev->config_mutex); +} + +static ssize_t config_test_driver_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + int copied; + + mutex_lock(&test_dev->config_mutex); + + kfree_const(config->test_driver); + config->test_driver = NULL; + + copied = config_copy_test_driver_name(config, buf, count); + mutex_unlock(&test_dev->config_mutex); + + return copied; +} + +/* + * As per sysfs_kf_seq_show() the buf is max PAGE_SIZE. + */ +static ssize_t config_test_show_str(struct mutex *config_mutex, + char *dst, + char *src) +{ + int len; + + mutex_lock(config_mutex); + len = snprintf(dst, PAGE_SIZE, "%s\n", src); + mutex_unlock(config_mutex); + + return len; +} + +static ssize_t config_test_driver_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return config_test_show_str(&test_dev->config_mutex, buf, + config->test_driver); +} +static DEVICE_ATTR(config_test_driver, 0644, config_test_driver_show, + config_test_driver_store); + +static ssize_t config_test_fs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + int copied; + + mutex_lock(&test_dev->config_mutex); + + kfree_const(config->test_fs); + config->test_fs = NULL; + + copied = config_copy_test_fs(config, buf, count); + mutex_unlock(&test_dev->config_mutex); + + return copied; +} + +static ssize_t config_test_fs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return config_test_show_str(&test_dev->config_mutex, buf, + config->test_fs); +} +static DEVICE_ATTR(config_test_fs, 0644, config_test_fs_show, + config_test_fs_store); + +static int trigger_config_run_type(struct kmod_test_device *test_dev, + enum kmod_test_case test_case, + const char *test_str) +{ + int copied = 0; + struct test_config *config = &test_dev->config; + + mutex_lock(&test_dev->config_mutex); + + switch (test_case) { + case TEST_KMOD_DRIVER: + kfree_const(config->test_driver); + config->test_driver = NULL; + copied = config_copy_test_driver_name(config, test_str, + strlen(test_str)); + break; + case TEST_KMOD_FS_TYPE: + break; + kfree_const(config->test_fs); + config->test_driver = NULL; + copied = config_copy_test_fs(config, test_str, + strlen(test_str)); + default: + mutex_unlock(&test_dev->config_mutex); + return -EINVAL; + } + + config->test_case = test_case; + + mutex_unlock(&test_dev->config_mutex); + + if (copied <= 0 || copied != strlen(test_str)) { + test_dev->test_is_oom = true; + return -ENOMEM; + } + + test_dev->test_is_oom = false; + + return trigger_config_run(test_dev); +} + +static void free_test_dev_info(struct kmod_test_device *test_dev) +{ + vfree(test_dev->info); + test_dev->info = NULL; +} + +static int kmod_config_sync_info(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + + free_test_dev_info(test_dev); + test_dev->info = vzalloc(config->num_threads * + sizeof(struct kmod_test_device_info)); + if (!test_dev->info) { + dev_err(test_dev->dev, "Cannot alloc test_dev info\n"); + return -ENOMEM; + } + + return 0; +} + +/* + * Old kernels may not have this, if you want to port this code to + * test it on older kernels. + */ +#ifdef get_kmod_umh_limit +static unsigned int kmod_init_test_thread_limit(void) +{ + return get_kmod_umh_limit(); +} +#else +static unsigned int kmod_init_test_thread_limit(void) +{ + return TEST_START_NUM_THREADS; +} +#endif + +static int __kmod_config_init(struct kmod_test_device *test_dev) +{ + struct test_config *config = &test_dev->config; + int ret = -ENOMEM, copied; + + __kmod_config_free(config); + + copied = config_copy_test_driver_name(config, TEST_START_DRIVER, + strlen(TEST_START_DRIVER)); + if (copied != strlen(TEST_START_DRIVER)) + goto err_out; + + copied = config_copy_test_fs(config, TEST_START_TEST_FS, + strlen(TEST_START_TEST_FS)); + if (copied != strlen(TEST_START_TEST_FS)) + goto err_out; + + config->num_threads = kmod_init_test_thread_limit(); + config->test_result = 0; + config->test_case = TEST_START_TEST_CASE; + + ret = kmod_config_sync_info(test_dev); + if (ret) + goto err_out; + + test_dev->test_is_oom = false; + + return 0; + +err_out: + test_dev->test_is_oom = true; + WARN_ON(test_dev->test_is_oom); + + __kmod_config_free(config); + + return ret; +} + +static ssize_t reset_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + int ret; + + mutex_lock(&test_dev->trigger_mutex); + mutex_lock(&test_dev->config_mutex); + + ret = __kmod_config_init(test_dev); + if (ret < 0) { + ret = -ENOMEM; + dev_err(dev, "could not alloc settings for config trigger: %d\n", + ret); + goto out; + } + + dev_info(dev, "reset\n"); + ret = count; + +out: + mutex_unlock(&test_dev->config_mutex); + mutex_unlock(&test_dev->trigger_mutex); + + return ret; +} +static DEVICE_ATTR_WO(reset); + +static int test_dev_config_update_uint_sync(struct kmod_test_device *test_dev, + const char *buf, size_t size, + unsigned int *config, + int (*test_sync)(struct kmod_test_device *test_dev)) +{ + int ret; + long new; + unsigned int old_val; + + ret = kstrtol(buf, 10, &new); + if (ret) + return ret; + + if (new > UINT_MAX) + return -EINVAL; + + mutex_lock(&test_dev->config_mutex); + + old_val = *config; + *(unsigned int *)config = new; + + ret = test_sync(test_dev); + if (ret) { + *(unsigned int *)config = old_val; + + ret = test_sync(test_dev); + WARN_ON(ret); + + mutex_unlock(&test_dev->config_mutex); + return -EINVAL; + } + + mutex_unlock(&test_dev->config_mutex); + /* Always return full write size even if we didn't consume all */ + return size; +} + +static int test_dev_config_update_uint_range(struct kmod_test_device *test_dev, + const char *buf, size_t size, + unsigned int *config, + unsigned int min, + unsigned int max) +{ + int ret; + long new; + + ret = kstrtol(buf, 10, &new); + if (ret) + return ret; + + if (new < min || new > max || new > UINT_MAX) + return -EINVAL; + + mutex_lock(&test_dev->config_mutex); + *config = new; + mutex_unlock(&test_dev->config_mutex); + + /* Always return full write size even if we didn't consume all */ + return size; +} + +static int test_dev_config_update_int(struct kmod_test_device *test_dev, + const char *buf, size_t size, + int *config) +{ + int ret; + long new; + + ret = kstrtol(buf, 10, &new); + if (ret) + return ret; + + if (new > INT_MAX || new < INT_MIN) + return -EINVAL; + + mutex_lock(&test_dev->config_mutex); + *config = new; + mutex_unlock(&test_dev->config_mutex); + /* Always return full write size even if we didn't consume all */ + return size; +} + +static ssize_t test_dev_config_show_int(struct kmod_test_device *test_dev, + char *buf, + int config) +{ + int val; + + mutex_lock(&test_dev->config_mutex); + val = config; + mutex_unlock(&test_dev->config_mutex); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} + +static ssize_t test_dev_config_show_uint(struct kmod_test_device *test_dev, + char *buf, + unsigned int config) +{ + unsigned int val; + + mutex_lock(&test_dev->config_mutex); + val = config; + mutex_unlock(&test_dev->config_mutex); + + return snprintf(buf, PAGE_SIZE, "%u\n", val); +} + +static ssize_t test_result_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_update_int(test_dev, buf, count, + &config->test_result); +} + +static ssize_t config_num_threads_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_update_uint_sync(test_dev, buf, count, + &config->num_threads, + kmod_config_sync_info); +} + +static ssize_t config_num_threads_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_show_int(test_dev, buf, config->num_threads); +} +static DEVICE_ATTR(config_num_threads, 0644, config_num_threads_show, + config_num_threads_store); + +static ssize_t config_test_case_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_update_uint_range(test_dev, buf, count, + &config->test_case, + __TEST_KMOD_INVALID + 1, + __TEST_KMOD_MAX - 1); +} + +static ssize_t config_test_case_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_show_uint(test_dev, buf, config->test_case); +} +static DEVICE_ATTR(config_test_case, 0644, config_test_case_show, + config_test_case_store); + +static ssize_t test_result_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kmod_test_device *test_dev = dev_to_test_dev(dev); + struct test_config *config = &test_dev->config; + + return test_dev_config_show_int(test_dev, buf, config->test_result); +} +static DEVICE_ATTR(test_result, 0644, test_result_show, test_result_store); + +#define TEST_KMOD_DEV_ATTR(name) &dev_attr_##name.attr + +static struct attribute *test_dev_attrs[] = { + TEST_KMOD_DEV_ATTR(trigger_config), + TEST_KMOD_DEV_ATTR(config), + TEST_KMOD_DEV_ATTR(reset), + + TEST_KMOD_DEV_ATTR(config_test_driver), + TEST_KMOD_DEV_ATTR(config_test_fs), + TEST_KMOD_DEV_ATTR(config_num_threads), + TEST_KMOD_DEV_ATTR(config_test_case), + TEST_KMOD_DEV_ATTR(test_result), + + NULL, +}; + +ATTRIBUTE_GROUPS(test_dev); + +static int kmod_config_init(struct kmod_test_device *test_dev) +{ + int ret; + + mutex_lock(&test_dev->config_mutex); + ret = __kmod_config_init(test_dev); + mutex_unlock(&test_dev->config_mutex); + + return ret; +} + +static struct kmod_test_device *alloc_test_dev_kmod(int idx) +{ + int ret; + struct kmod_test_device *test_dev; + struct miscdevice *misc_dev; + + test_dev = vzalloc(sizeof(struct kmod_test_device)); + if (!test_dev) { + pr_err("Cannot alloc test_dev\n"); + goto err_out; + } + + mutex_init(&test_dev->config_mutex); + mutex_init(&test_dev->trigger_mutex); + mutex_init(&test_dev->thread_mutex); + + init_completion(&test_dev->kthreads_done); + + ret = kmod_config_init(test_dev); + if (ret < 0) { + pr_err("Cannot alloc kmod_config_init()\n"); + goto err_out_free; + } + + test_dev->dev_idx = idx; + misc_dev = &test_dev->misc_dev; + + misc_dev->minor = MISC_DYNAMIC_MINOR; + misc_dev->name = kasprintf(GFP_KERNEL, "test_kmod%d", idx); + if (!misc_dev->name) { + pr_err("Cannot alloc misc_dev->name\n"); + goto err_out_free_config; + } + misc_dev->groups = test_dev_groups; + + return test_dev; + +err_out_free_config: + free_test_dev_info(test_dev); + kmod_config_free(test_dev); +err_out_free: + vfree(test_dev); + test_dev = NULL; +err_out: + return NULL; +} + +static void free_test_dev_kmod(struct kmod_test_device *test_dev) +{ + if (test_dev) { + kfree_const(test_dev->misc_dev.name); + test_dev->misc_dev.name = NULL; + free_test_dev_info(test_dev); + kmod_config_free(test_dev); + vfree(test_dev); + test_dev = NULL; + } +} + +static struct kmod_test_device *register_test_dev_kmod(void) +{ + struct kmod_test_device *test_dev = NULL; + int ret; + + mutex_unlock(®_dev_mutex); + + /* int should suffice for number of devices, test for wrap */ + if (unlikely(num_test_devs + 1) < 0) { + pr_err("reached limit of number of test devices\n"); + goto out; + } + + test_dev = alloc_test_dev_kmod(num_test_devs); + if (!test_dev) + goto out; + + ret = misc_register(&test_dev->misc_dev); + if (ret) { + pr_err("could not register misc device: %d\n", ret); + free_test_dev_kmod(test_dev); + goto out; + } + + test_dev->dev = test_dev->misc_dev.this_device; + list_add_tail(&test_dev->list, ®_test_devs); + dev_info(test_dev->dev, "interface ready\n"); + + num_test_devs++; + +out: + mutex_unlock(®_dev_mutex); + + return test_dev; + +} + +static int __init test_kmod_init(void) +{ + struct kmod_test_device *test_dev; + int ret; + + test_dev = register_test_dev_kmod(); + if (!test_dev) { + pr_err("Cannot add first test kmod device\n"); + return -ENODEV; + } + + /* + * With some work we might be able to gracefully enable + * testing with this driver built-in, for now this seems + * rather risky. For those willing to try have at it, + * and enable the below. Good luck! If that works, try + * lowering the init level for more fun. + */ + if (force_init_test) { + ret = trigger_config_run_type(test_dev, + TEST_KMOD_DRIVER, "tun"); + if (WARN_ON(ret)) + return ret; + ret = trigger_config_run_type(test_dev, + TEST_KMOD_FS_TYPE, "btrfs"); + if (WARN_ON(ret)) + return ret; + } + + return 0; +} +late_initcall(test_kmod_init); + +static +void unregister_test_dev_kmod(struct kmod_test_device *test_dev) +{ + mutex_lock(&test_dev->trigger_mutex); + mutex_lock(&test_dev->config_mutex); + + test_dev_kmod_stop_tests(test_dev); + + dev_info(test_dev->dev, "removing interface\n"); + misc_deregister(&test_dev->misc_dev); + kfree(&test_dev->misc_dev.name); + + mutex_unlock(&test_dev->config_mutex); + mutex_unlock(&test_dev->trigger_mutex); + + free_test_dev_kmod(test_dev); +} + +static void __exit test_kmod_exit(void) +{ + struct kmod_test_device *test_dev, *tmp; + + mutex_lock(®_dev_mutex); + list_for_each_entry_safe(test_dev, tmp, ®_test_devs, list) { + list_del(&test_dev->list); + unregister_test_dev_kmod(test_dev); + } + mutex_unlock(®_dev_mutex); +} +module_exit(test_kmod_exit); + +MODULE_AUTHOR("Luis R. Rodriguez "); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/kmod/Makefile b/tools/testing/selftests/kmod/Makefile new file mode 100644 index 000000000000..fa2ccc5fb3de --- /dev/null +++ b/tools/testing/selftests/kmod/Makefile @@ -0,0 +1,11 @@ +# Makefile for kmod loading selftests + +# No binaries, but make sure arg-less "make" doesn't trigger "run_tests" +all: + +TEST_PROGS := kmod.sh + +include ../lib.mk + +# Nothing to clean up. +clean: diff --git a/tools/testing/selftests/kmod/config b/tools/testing/selftests/kmod/config new file mode 100644 index 000000000000..259f4fd6b5e2 --- /dev/null +++ b/tools/testing/selftests/kmod/config @@ -0,0 +1,7 @@ +CONFIG_TEST_KMOD=m +CONFIG_TEST_LKM=m +CONFIG_XFS_FS=m + +# For the module parameter force_init_test is used +CONFIG_TUN=m +CONFIG_BTRFS_FS=m diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh new file mode 100644 index 000000000000..10196a62ed09 --- /dev/null +++ b/tools/testing/selftests/kmod/kmod.sh @@ -0,0 +1,635 @@ +#!/bin/bash +# +# Copyright (C) 2017 Luis R. Rodriguez +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or at your option any +# later version; or, when distributed separately from the Linux kernel or +# when incorporated into other software packages, subject to the following +# license: +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of copyleft-next (version 0.3.1 or later) as published +# at http://copyleft-next.org/. + +# This is a stress test script for kmod, the kernel module loader. It uses +# test_kmod which exposes a series of knobs for the API for us so we can +# tweak each test in userspace rather than in kernelspace. +# +# The way kmod works is it uses the kernel's usermode helper API to eventually +# call /sbin/modprobe. It has a limit of the number of concurrent calls +# possible. The kernel interface to load modules is request_module(), however +# mount uses get_fs_type(). Both behave slightly differently, but the +# differences are important enough to test each call separately. For this +# reason test_kmod starts by providing tests for both calls. +# +# The test driver test_kmod assumes a series of defaults which you can +# override by exporting to your environment prior running this script. +# For instance this script assumes you do not have xfs loaded upon boot. +# If this is false, export DEFAULT_KMOD_FS="ext4" prior to running this +# script if the filesyste module you don't have loaded upon bootup +# is ext4 instead. Refer to allow_user_defaults() for a list of user +# override variables possible. +# +# You'll want at least 4 GiB of RAM to expect to run these tests +# without running out of memory on them. For other requirements refer +# to test_reqs() + +set -e + +TEST_NAME="kmod" +TEST_DRIVER="test_${TEST_NAME}" +TEST_DIR=$(dirname $0) + +# This represents +# +# TEST_ID:TEST_COUNT:ENABLED +# +# TEST_ID: is the test id number +# TEST_COUNT: number of times we should run the test +# ENABLED: 1 if enabled, 0 otherwise +# +# Once these are enabled please leave them as-is. Write your own test, +# we have tons of space. +ALL_TESTS="0001:3:1" +ALL_TESTS="$ALL_TESTS 0002:3:1" +ALL_TESTS="$ALL_TESTS 0003:1:1" +ALL_TESTS="$ALL_TESTS 0004:1:1" +ALL_TESTS="$ALL_TESTS 0005:10:1" +ALL_TESTS="$ALL_TESTS 0006:10:1" +ALL_TESTS="$ALL_TESTS 0007:5:1" + +# Disabled tests: +# +# 0008 x 150 - multithreaded - push kmod_concurrent over max_modprobes for request_module()" +# Current best-effort failure interpretation: +# Enough module requests get loaded in place fast enough to reach over the +# max_modprobes limit and trigger a failure -- before we're even able to +# start processing pending requests. +ALL_TESTS="$ALL_TESTS 0008:150:0" + +# 0009 x 150 - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()" +# Current best-effort failure interpretation: +# +# get_fs_type() requests modules using aliases as such the optimization in +# place today to look for already loaded modules will not take effect and +# we end up requesting a new module to load, this bumps the kmod_concurrent, +# and in certain circumstances can lead to pushing the kmod_concurrent over +# the max_modprobe limit. +# +# This test fails much easier than test 0008 since the alias optimizations +# are not in place. +ALL_TESTS="$ALL_TESTS 0009:150:0" + +test_modprobe() +{ + if [ ! -d $DIR ]; then + echo "$0: $DIR not present" >&2 + echo "You must have the following enabled in your kernel:" >&2 + cat $TEST_DIR/config >&2 + exit 1 + fi +} + +function allow_user_defaults() +{ + if [ -z $DEFAULT_KMOD_DRIVER ]; then + DEFAULT_KMOD_DRIVER="test_module" + fi + + if [ -z $DEFAULT_KMOD_FS ]; then + DEFAULT_KMOD_FS="xfs" + fi + + if [ -z $PROC_DIR ]; then + PROC_DIR="/proc/sys/kernel/" + fi + + if [ -z $MODPROBE_LIMIT ]; then + MODPROBE_LIMIT=50 + fi + + if [ -z $DIR ]; then + DIR="/sys/devices/virtual/misc/${TEST_DRIVER}0/" + fi + + if [ -z $DEFAULT_NUM_TESTS ]; then + DEFAULT_NUM_TESTS=150 + fi + + MODPROBE_LIMIT_FILE="${PROC_DIR}/kmod-limit" +} + +test_reqs() +{ + if ! which modprobe 2> /dev/null > /dev/null; then + echo "$0: You need modprobe installed" >&2 + exit 1 + fi + + if ! which kmod 2> /dev/null > /dev/null; then + echo "$0: You need kmod installed" >&2 + exit 1 + fi + + # kmod 19 has a bad bug where it returns 0 when modprobe + # gets called *even* if the module was not loaded due to + # some bad heuristics. For details see: + # + # A work around is possible in-kernel but its rather + # complex. + KMOD_VERSION=$(kmod --version | awk '{print $3}') + if [[ $KMOD_VERSION -le 19 ]]; then + echo "$0: You need at least kmod 20" >&2 + echo "kmod <= 19 is buggy, for details see:" >&2 + echo "http://git.kernel.org/cgit/utils/kernel/kmod/kmod.git/commit/libkmod/libkmod-module.c?id=fd44a98ae2eb5eb32161088954ab21e58e19dfc4" >&2 + exit 1 + fi + + uid=$(id -u) + if [ $uid -ne 0 ]; then + echo $msg must be run as root >&2 + exit 0 + fi +} + +function load_req_mod() +{ + trap "test_modprobe" EXIT + + if [ ! -d $DIR ]; then + # Alanis: "Oh isn't it ironic?" + modprobe $TEST_DRIVER + fi +} + +test_finish() +{ + echo "Test completed" +} + +errno_name_to_val() +{ + case "$1" in + # kmod calls modprobe and upon of a module not found + # modprobe returns just 1... However in the kernel we + # *sometimes* see 256... + MODULE_NOT_FOUND) + echo 256;; + SUCCESS) + echo 0;; + -EPERM) + echo -1;; + -ENOENT) + echo -2;; + -EINVAL) + echo -22;; + -ERR_ANY) + echo -123456;; + *) + echo invalid;; + esac +} + +errno_val_to_name() + case "$1" in + 256) + echo MODULE_NOT_FOUND;; + 0) + echo SUCCESS;; + -1) + echo -EPERM;; + -2) + echo -ENOENT;; + -22) + echo -EINVAL;; + -123456) + echo -ERR_ANY;; + *) + echo invalid;; + esac + +config_set_test_case_driver() +{ + if ! echo -n 1 >$DIR/config_test_case; then + echo "$0: Unable to set to test case to driver" >&2 + exit 1 + fi +} + +config_set_test_case_fs() +{ + if ! echo -n 2 >$DIR/config_test_case; then + echo "$0: Unable to set to test case to fs" >&2 + exit 1 + fi +} + +config_num_threads() +{ + if ! echo -n $1 >$DIR/config_num_threads; then + echo "$0: Unable to set to number of threads" >&2 + exit 1 + fi +} + +config_get_modprobe_limit() +{ + if [[ -f ${MODPROBE_LIMIT_FILE} ]] ; then + MODPROBE_LIMIT=$(cat $MODPROBE_LIMIT_FILE) + fi + echo $MODPROBE_LIMIT +} + +config_num_thread_limit_extra() +{ + MODPROBE_LIMIT=$(config_get_modprobe_limit) + let EXTRA_LIMIT=$MODPROBE_LIMIT+$1 + config_num_threads $EXTRA_LIMIT +} + +# For special characters use printf directly, +# refer to kmod_test_0001 +config_set_driver() +{ + if ! echo -n $1 >$DIR/config_test_driver; then + echo "$0: Unable to set driver" >&2 + exit 1 + fi +} + +config_set_fs() +{ + if ! echo -n $1 >$DIR/config_test_fs; then + echo "$0: Unable to set driver" >&2 + exit 1 + fi +} + +config_get_driver() +{ + cat $DIR/config_test_driver +} + +config_get_test_result() +{ + cat $DIR/test_result +} + +config_reset() +{ + if ! echo -n "1" >"$DIR"/reset; then + echo "$0: reset shuld have worked" >&2 + exit 1 + fi +} + +config_show_config() +{ + echo "----------------------------------------------------" + cat "$DIR"/config + echo "----------------------------------------------------" +} + +config_trigger() +{ + if ! echo -n "1" >"$DIR"/trigger_config 2>/dev/null; then + echo "$1: FAIL - loading should have worked" + config_show_config + exit 1 + fi + echo "$1: OK! - loading kmod test" +} + +config_trigger_want_fail() +{ + if echo "1" > $DIR/trigger_config 2>/dev/null; then + echo "$1: FAIL - test case was expected to fail" + config_show_config + exit 1 + fi + echo "$1: OK! - kmod test case failed as expected" +} + +config_expect_result() +{ + RC=$(config_get_test_result) + RC_NAME=$(errno_val_to_name $RC) + + ERRNO_NAME=$2 + ERRNO=$(errno_name_to_val $ERRNO_NAME) + + if [[ $ERRNO_NAME = "-ERR_ANY" ]]; then + if [[ $RC -ge 0 ]]; then + echo "$1: FAIL, test expects $ERRNO_NAME - got $RC_NAME ($RC)" >&2 + config_show_config + exit 1 + fi + elif [[ $RC != $ERRNO ]]; then + echo "$1: FAIL, test expects $ERRNO_NAME ($ERRNO) - got $RC_NAME ($RC)" >&2 + config_show_config + exit 1 + fi + echo "$1: OK! - Return value: $RC ($RC_NAME), expected $ERRNO_NAME" +} + +kmod_defaults_driver() +{ + config_reset + modprobe -r $DEFAULT_KMOD_DRIVER + config_set_driver $DEFAULT_KMOD_DRIVER +} + +kmod_defaults_fs() +{ + config_reset + modprobe -r $DEFAULT_KMOD_FS + config_set_fs $DEFAULT_KMOD_FS + config_set_test_case_fs +} + +kmod_test_0001_driver() +{ + NAME='\000' + + kmod_defaults_driver + config_num_threads 1 + printf '\000' >"$DIR"/config_test_driver + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} MODULE_NOT_FOUND +} + +kmod_test_0001_fs() +{ + NAME='\000' + + kmod_defaults_fs + config_num_threads 1 + printf '\000' >"$DIR"/config_test_fs + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} -EINVAL +} + +kmod_test_0001() +{ + kmod_test_0001_driver + kmod_test_0001_fs +} + +kmod_test_0002_driver() +{ + NAME="nope-$DEFAULT_KMOD_DRIVER" + + kmod_defaults_driver + config_set_driver $NAME + config_num_threads 1 + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} MODULE_NOT_FOUND +} + +kmod_test_0002_fs() +{ + NAME="nope-$DEFAULT_KMOD_FS" + + kmod_defaults_fs + config_set_fs $NAME + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} -EINVAL +} + +kmod_test_0002() +{ + kmod_test_0002_driver + kmod_test_0002_fs +} + +kmod_test_0003() +{ + kmod_defaults_fs + config_num_threads 1 + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +kmod_test_0004() +{ + kmod_defaults_fs + config_num_threads 2 + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +kmod_test_0005() +{ + kmod_defaults_driver + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +kmod_test_0006() +{ + kmod_defaults_fs + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +kmod_test_0007() +{ + kmod_test_0005 + kmod_test_0006 +} + +kmod_test_0008() +{ + kmod_defaults_driver + MODPROBE_LIMIT=$(config_get_modprobe_limit) + let EXTRA=$MODPROBE_LIMIT/6 + config_num_thread_limit_extra $EXTRA + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +kmod_test_0009() +{ + kmod_defaults_fs + MODPROBE_LIMIT=$(config_get_modprobe_limit) + let EXTRA=$MODPROBE_LIMIT/4 + config_num_thread_limit_extra $EXTRA + config_trigger ${FUNCNAME[0]} + config_expect_result ${FUNCNAME[0]} SUCCESS +} + +list_tests() +{ + echo "Test ID list:" + echo + echo "TEST_ID x NUM_TEST" + echo "TEST_ID: Test ID" + echo "NUM_TESTS: Number of recommended times to run the test" + echo + echo "0001 x $(get_test_count 0001) - Simple test - 1 thread for empty string" + echo "0002 x $(get_test_count 0002) - Simple test - 1 thread for modules/filesystems that do not exist" + echo "0003 x $(get_test_count 0003) - Simple test - 1 thread for get_fs_type() only" + echo "0004 x $(get_test_count 0004) - Simple test - 2 threads for get_fs_type() only" + echo "0005 x $(get_test_count 0005) - multithreaded tests with default setup - request_module() only" + echo "0006 x $(get_test_count 0006) - multithreaded tests with default setup - get_fs_type() only" + echo "0007 x $(get_test_count 0007) - multithreaded tests with default setup test request_module() and get_fs_type()" + echo "0008 x $(get_test_count 0008) - multithreaded - push kmod_concurrent over max_modprobes for request_module()" + echo "0009 x $(get_test_count 0009) - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()" +} + +usage() +{ + NUM_TESTS=$(grep -o ' ' <<<"$ALL_TESTS" | grep -c .) + let NUM_TESTS=$NUM_TESTS+1 + MAX_TEST=$(printf "%04d\n" $NUM_TESTS) + echo "Usage: $0 [ -t <4-number-digit> ] | [ -w <4-number-digit> ] |" + echo " [ -s <4-number-digit> ] | [ -c <4-number-digit> " + echo " [ all ] [ -h | --help ] [ -l ]" + echo "" + echo "Valid tests: 0001-$MAX_TEST" + echo "" + echo " all Runs all tests (default)" + echo " -t Run test ID the number amount of times is recommended" + echo " -w Watch test ID run until it runs into an error" + echo " -c Run test ID once" + echo " -s Run test ID x test-count number of times" + echo " -l List all test ID list" + echo " -h|--help Help" + echo + echo "If an error every occurs execution will immediately terminate." + echo "If you are adding a new test try using -w first to" + echo "make sure the test passes a series of tests." + echo + echo Example uses: + echo + echo "${TEST_NAME}.sh -- executes all tests" + echo "${TEST_NAME}.sh -t 0008 -- Executes test ID 0008 number of times is recomended" + echo "${TEST_NAME}.sh -w 0008 -- Watch test ID 0008 run until an error occurs" + echo "${TEST_NAME}.sh -s 0008 -- Run test ID 0008 once" + echo "${TEST_NAME}.sh -c 0008 3 -- Run test ID 0008 three times" + echo + list_tests + exit 1 +} + +function test_num() +{ + re='^[0-9]+$' + if ! [[ $1 =~ $re ]]; then + usage + fi +} + +function get_test_count() +{ + test_num $1 + TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') + LAST_TWO=${TEST_DATA#*:*} + echo ${LAST_TWO%:*} +} + +function get_test_enabled() +{ + test_num $1 + TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}') + echo ${TEST_DATA#*:*:} +} + +function run_all_tests() +{ + for i in $ALL_TESTS ; do + TEST_ID=${i%:*:*} + ENABLED=$(get_test_enabled $TEST_ID) + TEST_COUNT=$(get_test_count $TEST_ID) + if [[ $ENABLED -eq "1" ]]; then + test_case $TEST_ID $TEST_COUNT + fi + done +} + +function watch_log() +{ + if [ $# -ne 3 ]; then + clear + fi + date + echo "Running test: $2 - run #$1" +} + +function watch_case() +{ + i=0 + while [ 1 ]; do + + if [ $# -eq 1 ]; then + test_num $1 + watch_log $i ${TEST_NAME}_test_$1 + ${TEST_NAME}_test_$1 + else + watch_log $i all + run_all_tests + fi + let i=$i+1 + done +} + +function test_case() +{ + NUM_TESTS=$DEFAULT_NUM_TESTS + if [ $# -eq 2 ]; then + NUM_TESTS=$2 + fi + + i=0 + while [ $i -lt $NUM_TESTS ]; do + test_num $1 + watch_log $i ${TEST_NAME}_test_$1 noclear + RUN_TEST=${TEST_NAME}_test_$1 + $RUN_TEST + let i=$i+1 + done +} + +function parse_args() +{ + if [ $# -eq 0 ]; then + run_all_tests + else + if [[ "$1" = "all" ]]; then + run_all_tests + elif [[ "$1" = "-w" ]]; then + shift + watch_case $@ + elif [[ "$1" = "-t" ]]; then + shift + test_num $1 + test_case $1 $(get_test_count $1) + elif [[ "$1" = "-c" ]]; then + shift + test_num $1 + test_num $2 + test_case $1 $2 + elif [[ "$1" = "-s" ]]; then + shift + test_case $1 1 + elif [[ "$1" = "-l" ]]; then + list_tests + elif [[ "$1" = "-h" || "$1" = "--help" ]]; then + usage + else + usage + fi + fi +} + +test_reqs +allow_user_defaults +load_req_mod + +trap "test_finish" EXIT + +parse_args $@ + +exit 0 -- cgit v1.2.3 From 9e258d772394abc0d09a8c6d75c3901e1d97f7ec Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 11 Jul 2017 09:23:31 +1000 Subject: kmod: add dependencies for test module When we try to 'select' drivers whose dependencies are not met, we get a kconfig warning and undefined behavior as further 'select' statements in those drivers can not be honored: warning: (TEST_KMOD) selects TUN which has unmet direct dependencies (NETDEVICES && NET_CORE && INET) warning: (TEST_KMOD) selects XFS_FS which has unmet direct dependencies (BLOCK && (64BIT || LBDAF)) ERROR: "iomap_zero_range" [fs/xfs/xfs.ko] undefined! ERROR: "iomap_fiemap" [fs/xfs/xfs.ko] undefined! ERROR: "iomap_page_mkwrite" [fs/xfs/xfs.ko] undefined! To work around that, this adds the dependencies required by the 'tun', 'xfs' and 'btrfs' modules for CONFIG_TEST_KMOD, guaranteeing that we can always enable those, and fixing the 'randconfig' issues. Unfortunately, there is another problem that I have not found a good solution for: If any of the three drivers are built-in rather than loadable modules, then the module load test will later fail at runtime. Fixes: ec159fed8677 ("kmod: add test driver to stress test the module loader") Link: http://lkml.kernel.org/r/20170630154834.3689272-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Luis R. Rodriguez Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b0c35e0667df..e91b904a66d8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1868,6 +1868,8 @@ config TEST_KMOD tristate "kmod stress tester" default n depends on m + depends on BLOCK && (64BIT || LBDAF) # for XFS, BTRFS + depends on NETDEVICES && NET_CORE && INET # for TUN select TEST_LKM select XFS_FS select TUN -- cgit v1.2.3 From 52ee6e391a2ed0486950b081df57b25a97a3e7d0 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 11 Jul 2017 09:23:31 +1000 Subject: kmod: throttle kmod thread limit If we reach the limit of modprobe_limit threads running the next request_module() call will fail. The original reason for adding a kill was to do away with possible issues with in old circumstances which would create a recursive series of request_module() calls. We can do better than just be super aggressive and reject calls once we've reached the limit by simply making pending callers wait until the threshold has been reduced, and then throttling them in, one by one. This throttling enables requests over the kmod concurrent limit to be processed once a pending request completes. Only the first item queued up to wait is woken up. The assumption here is once a task is woken it will have no other option to also kick the queue to check if there are more pending tasks -- regardless of whether or not it was successful. By throttling and processing only max kmod concurrent tasks we ensure we avoid unexpected fatal request_module() calls, and we keep memory consumption on module loading to a minimum. With x86_64 qemu, with 4 cores, 4 GiB of RAM it takes the following run time to run both tests: time ./kmod.sh -t 0008 real 0m16.366s user 0m0.883s sys 0m8.916s time ./kmod.sh -t 0009 real 0m50.803s user 0m0.791s sys 0m9.852s Link: http://lkml.kernel.org/r/20170628223155.26472-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Reviewed-by: Petr Mladek Cc: Jessica Yu Cc: Shuah Khan Cc: Rusty Russell Cc: Michal Marek Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- kernel/kmod.c | 16 +++++++--------- tools/testing/selftests/kmod/kmod.sh | 24 ++---------------------- 2 files changed, 9 insertions(+), 31 deletions(-) diff --git a/kernel/kmod.c b/kernel/kmod.c index ff68198fe83b..6d016c5d97c8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -68,6 +68,7 @@ static DECLARE_RWSEM(umhelper_sem); */ #define MAX_KMOD_CONCURRENT 50 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); +static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* modprobe_path is set via /proc/sys. @@ -140,7 +141,6 @@ int __request_module(bool wait, const char *fmt, ...) va_list args; char module_name[MODULE_NAME_LEN]; int ret; - static int kmod_loop_msg; /* * We don't allow synchronous module loading from async. Module @@ -164,14 +164,11 @@ int __request_module(bool wait, const char *fmt, ...) return ret; if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { - /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg < 5) { - printk(KERN_ERR - "request_module: runaway loop modprobe %s\n", - module_name); - kmod_loop_msg++; - } - return -ENOMEM; + pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", + atomic_read(&kmod_concurrent_max), + MAX_KMOD_CONCURRENT, module_name); + wait_event_interruptible(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0); } trace_module_request(module_name, wait, _RET_IP_); @@ -179,6 +176,7 @@ int __request_module(bool wait, const char *fmt, ...) ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_inc(&kmod_concurrent_max); + wake_up(&kmod_wq); return ret; } diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh index 10196a62ed09..8cecae9a8bca 100644 --- a/tools/testing/selftests/kmod/kmod.sh +++ b/tools/testing/selftests/kmod/kmod.sh @@ -59,28 +59,8 @@ ALL_TESTS="$ALL_TESTS 0004:1:1" ALL_TESTS="$ALL_TESTS 0005:10:1" ALL_TESTS="$ALL_TESTS 0006:10:1" ALL_TESTS="$ALL_TESTS 0007:5:1" - -# Disabled tests: -# -# 0008 x 150 - multithreaded - push kmod_concurrent over max_modprobes for request_module()" -# Current best-effort failure interpretation: -# Enough module requests get loaded in place fast enough to reach over the -# max_modprobes limit and trigger a failure -- before we're even able to -# start processing pending requests. -ALL_TESTS="$ALL_TESTS 0008:150:0" - -# 0009 x 150 - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()" -# Current best-effort failure interpretation: -# -# get_fs_type() requests modules using aliases as such the optimization in -# place today to look for already loaded modules will not take effect and -# we end up requesting a new module to load, this bumps the kmod_concurrent, -# and in certain circumstances can lead to pushing the kmod_concurrent over -# the max_modprobe limit. -# -# This test fails much easier than test 0008 since the alias optimizations -# are not in place. -ALL_TESTS="$ALL_TESTS 0009:150:0" +ALL_TESTS="$ALL_TESTS 0008:150:1" +ALL_TESTS="$ALL_TESTS 0009:150:1" test_modprobe() { -- cgit v1.2.3 From 49c76fe85ee542093bcf67a38266b493f6171585 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 11 Jul 2017 09:23:31 +1000 Subject: writeback: rework wb_[dec|inc]_stat family of functions Currently the writeback statistics code uses a percpu counters to hold various statistics. Furthermore we have 2 families of functions - those which disable local irq and those which doesn't and whose names begin with double underscore. However, they both end up calling __add_wb_stats which in turn calls percpu_counter_add_batch which is already irq-safe. Exploiting this fact allows to eliminated the __wb_* functions since they don't add any further protection than we already have. Furthermore, refactor the wb_* function to call __add_wb_stat directly without the irq-disabling dance. This will likely result in better runtime of code which deals with modifying the stat counters. While at it also document why percpu_counter_add_batch is in fact preempt and irq-safe since at least 3 people got confused. Link: http://lkml.kernel.org/r/1498029937-27293-1-git-send-email-nborisov@suse.com Signed-off-by: Nikolay Borisov Acked-by: Tejun Heo Reviewed-by: Jan Kara Cc: Josef Bacik Cc: Mel Gorman Cc: Jeff Layton Signed-off-by: Andrew Morton --- fs/fs-writeback.c | 8 ++++---- include/linux/backing-dev.h | 24 ++---------------------- lib/percpu_counter.c | 7 +++++++ mm/page-writeback.c | 10 +++++----- 4 files changed, 18 insertions(+), 31 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8b426f83909f..245c430a2e41 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -380,8 +380,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct page *page = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (likely(page) && PageDirty(page)) { - __dec_wb_stat(old_wb, WB_RECLAIMABLE); - __inc_wb_stat(new_wb, WB_RECLAIMABLE); + dec_wb_stat(old_wb, WB_RECLAIMABLE); + inc_wb_stat(new_wb, WB_RECLAIMABLE); } } @@ -391,8 +391,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) &mapping->tree_lock); if (likely(page)) { WARN_ON_ONCE(!PageWriteback(page)); - __dec_wb_stat(old_wb, WB_WRITEBACK); - __inc_wb_stat(new_wb, WB_WRITEBACK); + dec_wb_stat(old_wb, WB_WRITEBACK); + inc_wb_stat(new_wb, WB_WRITEBACK); } } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 334165c911f0..854e1bdd0b2a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -69,34 +69,14 @@ static inline void __add_wb_stat(struct bdi_writeback *wb, percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline void __inc_wb_stat(struct bdi_writeback *wb, - enum wb_stat_item item) -{ - __add_wb_stat(wb, item, 1); -} - static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - unsigned long flags; - - local_irq_save(flags); - __inc_wb_stat(wb, item); - local_irq_restore(flags); -} - -static inline void __dec_wb_stat(struct bdi_writeback *wb, - enum wb_stat_item item) -{ - __add_wb_stat(wb, item, -1); + __add_wb_stat(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - unsigned long flags; - - local_irq_save(flags); - __dec_wb_stat(wb, item); - local_irq_restore(flags); + __add_wb_stat(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 8ee7e5ec21be..3bf4a9984f4c 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -72,6 +72,13 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) } EXPORT_SYMBOL(percpu_counter_set); +/** + * This function is both preempt and irq safe. The former is due to explicit + * preemption disable. The latter is guaranteed by the fact that the slow path + * is explicitly protected by an irq-safe spinlock whereas the fast patch uses + * this_cpu_add which is irq-safe by definition. Hence there is no need muck + * with irq state before calling this one + */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b60cc7ddac2..96e93b214d31 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -601,7 +601,7 @@ static inline void __wb_writeout_inc(struct bdi_writeback *wb) { struct wb_domain *cgdom; - __inc_wb_stat(wb, WB_WRITTEN); + inc_wb_stat(wb, WB_WRITTEN); wb_domain_writeout_inc(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac); @@ -2435,8 +2435,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) __inc_lruvec_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); - __inc_wb_stat(wb, WB_RECLAIMABLE); - __inc_wb_stat(wb, WB_DIRTIED); + inc_wb_stat(wb, WB_RECLAIMABLE); + inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2741,7 +2741,7 @@ int test_clear_page_writeback(struct page *page) if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); - __dec_wb_stat(wb, WB_WRITEBACK); + dec_wb_stat(wb, WB_WRITEBACK); __wb_writeout_inc(wb); } } @@ -2786,7 +2786,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) - __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); /* * We can come through here when swapping anonymous -- cgit v1.2.3 From 9d92fb96c9832c107b8929adbd4e9f91f7d16a92 Mon Sep 17 00:00:00 2001 From: Andrey Vostrikov Date: Tue, 11 Jul 2017 09:23:31 +1000 Subject: lib/crc-ccitt: add CCITT-FALSE CRC16 variant In support of a soon to be published MFD driver using serdev to talk to a supervisory processor that uses the CCITT-FALSE CRC16 variant in it's protocol, this patch was tested successfully on an i.MX6 ARM platform. Link: http://lkml.kernel.org/r/20170413142932.27287-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Vostrikov Signed-off-by: Andrey Smirnov Tested-by: Chris Healy Signed-off-by: Andrew Morton --- include/linux/crc-ccitt.h | 7 ++++++ lib/crc-ccitt.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/include/linux/crc-ccitt.h b/include/linux/crc-ccitt.h index f52696a1ff0d..7cd45a4bc224 100644 --- a/include/linux/crc-ccitt.h +++ b/include/linux/crc-ccitt.h @@ -4,12 +4,19 @@ #include extern u16 const crc_ccitt_table[256]; +extern u16 const crc_ccitt_false_table[256]; extern u16 crc_ccitt(u16 crc, const u8 *buffer, size_t len); +extern u16 crc_ccitt_false(u16 crc, const u8 *buffer, size_t len); static inline u16 crc_ccitt_byte(u16 crc, const u8 c) { return (crc >> 8) ^ crc_ccitt_table[(crc ^ c) & 0xff]; } +static inline u16 crc_ccitt_false_byte(u16 crc, const u8 c) +{ + return (crc << 8) ^ crc_ccitt_false_table[(crc >> 8) ^ c]; +} + #endif /* _LINUX_CRC_CCITT_H */ diff --git a/lib/crc-ccitt.c b/lib/crc-ccitt.c index 7f6dd68d2d09..d873b34039ff 100644 --- a/lib/crc-ccitt.c +++ b/lib/crc-ccitt.c @@ -51,8 +51,49 @@ u16 const crc_ccitt_table[256] = { }; EXPORT_SYMBOL(crc_ccitt_table); +/* + * Similar table to calculate CRC16 variant known as CRC-CCITT-FALSE + * Reflected bits order, does not augment final value. + */ +u16 const crc_ccitt_false_table[256] = { + 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7, + 0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF, + 0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6, + 0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE, + 0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485, + 0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D, + 0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4, + 0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC, + 0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823, + 0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B, + 0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12, + 0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A, + 0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41, + 0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49, + 0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70, + 0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78, + 0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F, + 0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067, + 0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E, + 0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256, + 0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D, + 0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, + 0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C, + 0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634, + 0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB, + 0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3, + 0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A, + 0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92, + 0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9, + 0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1, + 0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8, + 0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0 +}; +EXPORT_SYMBOL(crc_ccitt_false_table); + /** - * crc_ccitt - recompute the CRC for the data buffer + * crc_ccitt - recompute the CRC (CRC-CCITT variant) for the data + * buffer * @crc: previous CRC value * @buffer: data pointer * @len: number of bytes in the buffer @@ -65,5 +106,20 @@ u16 crc_ccitt(u16 crc, u8 const *buffer, size_t len) } EXPORT_SYMBOL(crc_ccitt); +/** + * crc_ccitt_false - recompute the CRC (CRC-CCITT-FALSE variant) + * for the data buffer + * @crc: previous CRC value + * @buffer: data pointer + * @len: number of bytes in the buffer + */ +u16 crc_ccitt_false(u16 crc, u8 const *buffer, size_t len) +{ + while (len--) + crc = crc_ccitt_false_byte(crc, *buffer++); + return crc; +} +EXPORT_SYMBOL(crc_ccitt_false); + MODULE_DESCRIPTION("CRC-CCITT calculations"); MODULE_LICENSE("GPL"); -- cgit v1.2.3